// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

/*
 * Kernel for PAVP buffer clear.
 *
 * In execution order, each thread:
 *   1. optionally (CURBE DW 1.3 != 0) increments its own counter in the
 *      instrumentation surface (BTI 1);
 *   2. spins for the delay requested in CURBE DW 1.2;
 *   3. writes a 32x16 block of all "0" to the render target buffer (BTI 0),
 *      which indirectly clears 512 bytes of Render Cache;
 *   4. overwrites its GRF registers with the designated clear word;
 *   5. terminates.
 *
 * NOTE(review): an earlier summary of this kernel said "all 64 GRF registers"
 * are cleared. The loop at the bottom runs 127 passes of 32 bytes each,
 * starting at offset 0 - confirm which description is intended.
 *
 * NOTE(review): the jmpi offsets below (352D, -32D, -64D) match the visible
 * instruction counts only if they are byte offsets over 16-byte instructions,
 * counted from the instruction after the jump. Adding or removing an
 * instruction inside a jumped-over region requires re-deriving them.
 */

/* ------------------------------------------------------------------------
 * Entry: latch parameters
 * ------------------------------------------------------------------------ */

/* Keep the "clear GRF" word (CURBE DW 1.1 [15:0]) in flag register f0.1 */
mov(1)          f0.1<1>UW       g1.2<0,1,0>UW                   { align1 1N };

/**
 * Curbe Format
 *
 *   DW 1.0        - Block Offset to write Render Cache
 *   DW 1.1 [15:0] - Clear Word
 *   DW 1.2        - Delay iterations
 *   DW 1.3        - Enable Instrumentation (only for debug)
 *   DW 1.4        - Rsvd (intended for context ID)
 *   DW 1.5        - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount
 *   DW 1.6        - Rsvd MBZ (intended for Enable Wait on Total Thread Count)
 *   DW 1.7        - Rsvd MBZ (intended for Total Thread Count)
 *
 * Binding Table
 *
 *   BTI 0: 2D Surface to help clear L3 (Render/Data Cache)
 *   BTI 1: Wait/Instrumentation Buffer
 *          Size  : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows
 *                  * (16 threads/EU) cols (Format R32_UINT)
 *                  Expected to be initialized to 0 by driver/another kernel
 *          Layout:
 *            RowN      : Histogram for EU-N:
 *                        (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID
 *                        [assume max 16 EUs / SS]
 *            Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N
 */

/* Delay counter = DW 1.2 + 1; the delay loop decrements first, then tests */
add(1)          g1.2<1>UD       g1.2<0,1,0>UD   0x00000001UD    { align1 1N };

/*
 * If instrumentation is disabled (DW 1.3 == 0), jump over the whole
 * instrumentation section. 22 instructions sit between this jump and the
 * delay loop; 352 = 22 * 16.
 */
cmp.z.f0.0(1)   null<1>UD       g1.3<0,1,0>UD   0x00000000UD    { align1 1N };
(+f0.0) jmpi(1) 352D                                            { align1 WE_all 1N };

/* ------------------------------------------------------------------------
 * Instrumentation (debug only): bump this thread's histogram entry
 * ------------------------------------------------------------------------ */

/**
 * State Register has info on where this thread is running
 *   IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID,
 *                 [2:0] ThreadSlotID
 *   HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID),
 *                 [11:8]EUID, [2:0] ThreadSlotID
 */
mov(8)          g3<1>UD         0x00000000UD                    { align1 1Q };

/* g3.0 = HSID (sr0 bit 12) */
shr(1)          g3<1>D          sr0<0,1,0>D     12D             { align1 1N };
and(1)          g3<1>D          g3<0,1,0>D      1D              { align1 1N };

/* g3.1 = SliceID (sr0 bits 14:13) */
shr(1)          g3.1<1>D        sr0<0,1,0>D     13D             { align1 1N };
and(1)          g3.1<1>D        g3.1<0,1,0>D    3D              { align1 1N };

/* g3.0 = SliceID * SubSlicePerSliceCount (DW 1.5 [15:0]) + HSID */
mul(1)          g3.5<1>D        g3.1<0,1,0>D    g1.10<0,1,0>UW  { align1 1N };
add(1)          g3<1>D          g3<0,1,0>D      g3.5<0,1,0>D    { align1 1N };

/* g3.2 = EUID (sr0 bits 11:8) */
shr(1)          g3.2<1>D        sr0<0,1,0>D     8D              { align1 1N };
and(1)          g3.2<1>D        g3.2<0,1,0>D    15D             { align1 1N };

/*
 * g3.2 = g3.0 * 16 + EUID: the EU row number (Y-pixel = V address) in the
 * instrumentation surface
 */
mul(1)          g3.4<1>D        g3<0,1,0>D      16D             { align1 1N };
add(1)          g3.2<1>D        g3.2<0,1,0>D    g3.4<0,1,0>D    { align1 1N };

/* g5 will receive the value read back; start it from zero */
mov(8)          g5<1>UD         0x00000000UD                    { align1 1Q };

/* g3.3 = ThreadSlotID (sr0 bits 2:0) * 4, i.e. one DW per column */
and(1)          g3.3<1>D        sr0<0,1,0>D     7D              { align1 1N };
mul(1)          g3.3<1>D        g3.3<0,1,0>D    4D              { align1 1N };

/* Build the media block message header in g4, starting from a copy of g0 */
mov(8)          g4<1>UD         g0<8,8,1>UD                     { align1 1Q };
mov(1)          g4<1>UD         g3.3<0,1,0>UD                   { align1 1N }; /* Block offset, first DW: thread-slot column */
mov(1)          g4.1<1>UD       g3.2<0,1,0>UD                   { align1 1N }; /* Block offset, second DW: EU row */
mov(1)          g4.2<1>UD       0x00000003UD                    { align1 1N }; /* Block size (1 row x 4 bytes) */
/* Source is read as UW, so only the low word of header DW 3 is kept */
and(1)          g4.3<1>UD       g4.3<0,1,0>UW   0xffffffffUD    { align1 1N };

/* Media block read: fetch the current counter from the instrumentation buffer */
sendc(8)        g5<1>UD         g4<8,8,1>F      0x02190001
                render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q };

/* counter += 1 */
add(1)          g5<1>D          g5<0,1,0>D      1D              { align1 1N };

/* Media block write: store the updated counter back (header g4 + payload g5) */
sendc(8)        g5<1>UD         g4<8,8,1>F      0x040a8001
                render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q };

/* ------------------------------------------------------------------------
 * Delay loop
 * ------------------------------------------------------------------------ */

/* Decrement the delay counter and loop back to this add while it is non-zero */
add.nz.f0.0(1)  g1.2<1>UD       g1.2<0,1,0>UD   -1D             { align1 1N };
(+f0.0) jmpi(1) -32D                                            { align1 WE_all 1N };

/* ------------------------------------------------------------------------
 * Render Cache clear
 * ------------------------------------------------------------------------ */

/* Store designated "clear GRF" value (same source word as at kernel entry) */
mov(1)          f0.1<1>UW       g1.2<0,1,0>UW                   { align1 1N };

/* Initialize looping parameters for the GRF clear further down */
mov(1)          a0<1>D          0D                              { align1 1N }; /* a0.0 = 0: start offset for the g[a0] store */
mov(1)          a0.4<1>W        127W                            { align1 1N }; /* Loop count: one 16-word store per pass */

/* Write 32x16 all "0" block, as two 16x16 media block writes */
mov(8)          g2<1>UD         g0<8,8,1>UD                     { align1 1Q }; /* Message header starts as a copy of g0 */
mov(8)          g127<1>UD       g0<8,8,1>UD                     { align1 1Q }; /* Second copy of g0, used by the final EOT send */
mov(2)          g2<1>UD         g1<2,2,1>UW                     { align1 1N }; /* Block offset: the two words of DW 1.0 */
mov(1)          g2.2<1>UD       0x000f000fUD                    { align1 1N }; /* Block size (16x16) */
/* Source is read as UW; additionally masks off bit 4 of header DW 3 */
and(1)          g2.3<1>UD       g2.3<0,1,0>UW   0xffffffefUD    { align1 1N };

/* Zero the payload registers that follow the header in the message */
mov(16)         g3<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g4<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g5<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g6<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g7<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g8<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g9<1>UD         0x00000000UD                    { align1 1H };
mov(16)         g10<1>UD        0x00000000UD                    { align1 1H };

/* First 16x16 block at the requested offset */
sendc(8)        null<1>UD       g2<8,8,1>F      0x120a8000
                render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q };

/* Second 16x16 block, 16 further along the first offset DW */
add(1)          g2<1>UD         g1<0,1,0>UW     0x0010UW        { align1 1N };
sendc(8)        null<1>UD       g2<8,8,1>F      0x120a8000
                render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q };

/* ------------------------------------------------------------------------
 * GRF clear
 * ------------------------------------------------------------------------ */

/*
 * Now, clear the GRF registers. Each pass stores the clear word from f0.1
 * into 16 words at g[a0], then advances a0 by 32 bytes. The store starts at
 * offset 0 and so overwrites g1 itself, which is why the clear word is read
 * from the flag register rather than from the CURBE. The counter is
 * decremented first; the flag it sets is consumed by the jmpi at the bottom,
 * which goes four instructions back to the add.
 */
add.nz.f0.0(1)  a0.4<1>W        a0.4<0,1,0>W    -1W             { align1 1N };
mov(16)         g[a0]<1>UW      f0.1<0,1,0>UW                   { align1 1H };
add(1)          a0<1>D          a0<0,1,0>D      32D             { align1 1N };
(+f0.0) jmpi(1) -64D                                            { align1 WE_all 1N };

/* ------------------------------------------------------------------------
 * Exit
 * ------------------------------------------------------------------------ */

/* Terminate the thread, using the g0 copy saved in g127 as the message */
sendc(8)        null<1>UD       g127<8,8,1>F    0x82000010
                thread_spawner MsgDesc: mlen 1 rlen 0           { align1 1Q EOT };