/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * Navi1x:
 *   cpp -DASIC_TARGET_NAVI1X=1 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3
 *   sp3-nv1x nv1x.sp3 -hex nv1x.hex
 *
 * Others:
 *   cpp -DASIC_TARGET_NAVI1X=0 cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3
 *   sp3-gfx10 gfx10.sp3 -hex gfx10.hex
 */

#define NO_SQC_STORE !ASIC_TARGET_NAVI1X

var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX is raised

var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
var SQ_WAVE_STATUS_HALT_MASK = 0x2000

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1

var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800

var SQ_WAVE_IB_STS_RCNT_SHIFT = 16
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25
var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE = 1
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1
var SQ_WAVE_IB_STS_RCNT_SIZE = 6
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF

var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800

var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27

// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31
var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000
// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC

var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000
var S_SAVE_SPI_INIT_ATC_SHIFT = 27
var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000
var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_SAVE_PC_HI_RCNT_SHIFT = 26
var S_SAVE_PC_HI_RCNT_MASK = 0xFC000000
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 25
var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x02000000
var S_SAVE_PC_HI_REPLAY_W64H_SHIFT = 24
var S_SAVE_PC_HI_REPLAY_W64H_MASK = 0x01000000

var s_sgpr_save_num = 108

var s_save_spi_init_lo = exec_lo
var s_save_spi_init_hi = exec_hi
var s_save_pc_lo = ttmp0
var s_save_pc_hi = ttmp1
var s_save_exec_lo = ttmp2
var s_save_exec_hi = ttmp3
var s_save_status = ttmp12
var s_save_trapsts = ttmp15
var s_save_xnack_mask = s_save_trapsts
var s_wave_size = ttmp7
var s_save_buf_rsrc0 = ttmp8
var s_save_buf_rsrc1 = ttmp9
var s_save_buf_rsrc2 = ttmp10
var s_save_buf_rsrc3 = ttmp11
var s_save_mem_offset = ttmp4
var s_save_alloc_size = s_save_trapsts
var s_save_tmp = ttmp14
var s_save_m0 = ttmp5
var s_save_ttmps_lo = s_save_tmp
var s_save_ttmps_hi = s_save_trapsts

var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000
var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000
var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
var S_WAVE_SIZE = 25

var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo = exec_lo
var s_restore_spi_init_hi = exec_hi
var s_restore_mem_offset = ttmp12
var s_restore_alloc_size = ttmp3
var s_restore_tmp = ttmp2
var s_restore_mem_offset_save = s_restore_tmp
var s_restore_m0 = s_restore_alloc_size
var s_restore_mode = ttmp7
var s_restore_flat_scratch = s_restore_tmp
var s_restore_pc_lo = ttmp0
var s_restore_pc_hi = ttmp1
var s_restore_exec_lo = ttmp4
var s_restore_exec_hi = ttmp5
var s_restore_status = ttmp14
var s_restore_trapsts = ttmp15
var s_restore_xnack_mask = ttmp13
var s_restore_buf_rsrc0 = ttmp8
var s_restore_buf_rsrc1 = ttmp9
var s_restore_buf_rsrc2 = ttmp10
var s_restore_buf_rsrc3 = ttmp11
var s_restore_size = ttmp6
var s_restore_ttmps_lo = s_restore_tmp
var s_restore_ttmps_hi = s_restore_alloc_size
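
// Save-area layout, as implied by the offset computations in this file
// (a summary, not an authoritative spec):
//   [0]                       VGPRs, in groups of 4 (stride per VGPR:
//                             128 bytes in wave32, 256 bytes in wave64)
//   [+vgpr]                   shared VGPRs (wave64 only)
//   [+vgpr+svgpr]             SGPRs (512 bytes; s108..s127 are not saved)
//   [+vgpr+svgpr+sgpr]        HWREGs (128 bytes)
//   [+vgpr+svgpr+sgpr+hwreg]  LDS (saved by the first wave of the TG only)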

shader main
    asic(DEFAULT)
    type(CS)
    wave_size(32)

    s_branch L_SKIP_RESTORE //NOT restore: might be a regular trap or save

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE

L_SKIP_RESTORE:
    s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
    s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK

if SINGLE_STEP_MISSED_WORKAROUND
    // No single step exceptions if MODE.DEBUG_EN=0.
    s_getreg_b32 ttmp2, hwreg(HW_REG_MODE)
    s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
    s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND

    // Second-level trap already handled exception if STATUS.HALT=1.
    s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK

    // Prioritize single step exception over context save.
    // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
    s_cbranch_scc0 L_FETCH_2ND_TRAP

L_NO_SINGLE_STEP_WORKAROUND:
end

    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
    s_cbranch_scc1 L_SAVE

    // If STATUS.MEM_VIOL is asserted then halt the wave to prevent
    // the exception raising again and blocking context save.
    s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
    s_cbranch_scc0 L_FETCH_2ND_TRAP
    s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

L_FETCH_2ND_TRAP:

#if ASIC_TARGET_NAVI1X
    // Preserve and clear scalar XNACK state before issuing scalar loads.
    // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
    // unused space ttmp11[31:24].
    s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
    s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
    s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
    s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
    s_or_b32 ttmp11, ttmp11, ttmp3
    s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
    s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
    s_or_b32 ttmp11, ttmp11, ttmp3
    s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
    s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
#endif

    // Read second-level TBA/TMA from first-level TMA and jump if available.
    // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
    // ttmp12 holds SQ_WAVE_STATUS
    s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
    s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
    s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
    s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
    s_waitcnt lgkmcnt(0)
    s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
    s_waitcnt lgkmcnt(0)
    s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
    s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
    s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler

L_NO_NEXT_TRAP:
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
    s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
    s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case: add 4 to ttmp0 so the return PC skips the s_trap instruction
    s_addc_u32 ttmp1, ttmp1, 0
L_EXCP_CASE:
    s_and_b32 ttmp1, ttmp1, 0xFFFF

#if ASIC_TARGET_NAVI1X
    // Restore SQ_WAVE_IB_STS.
    s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
    s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
    s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
    s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
    s_or_b32 ttmp2, ttmp2, ttmp3
    s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
#endif

    // Restore SQ_WAVE_STATUS.
    s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status

    s_rfe_b64 [ttmp0, ttmp1]

L_SAVE:
    s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
    s_mov_b32 s_save_tmp, 0
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit

#if ASIC_TARGET_NAVI1X
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
    s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
    s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
    s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
    s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
    s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT, FIRST_REPLAY and REPLAY_W64H in IB_STS
    s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

    s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
#endif

    /* inform SPI the readiness and wait for SPI's go signal */
    s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32 s_save_exec_hi, exec_hi
    s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive

    s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC

#if ASIC_TARGET_NAVI1X
L_SLEEP:
    // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause
    // an SQ hang: the 7th and 8th waves cannot win arbitration to execute
    // instructions while the other waves are stuck in this sleep loop
    // waiting for the SPI write to EXEC (EXEC != 0).
    s_sleep 0x2
    s_cbranch_execz L_SLEEP
#else
    s_waitcnt lgkmcnt(0)
#endif

    // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
    get_wave_size(s_save_ttmps_hi)
    get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
    s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
    s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
    s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0

#if ASIC_TARGET_NAVI1X
    s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
    s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
    s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
#endif
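
    // The four s_save_buf_rsrc* SGPRs assembled below form the buffer
    // resource descriptor used by every buffer_store/buffer_load in this
    // handler: words 0-1 carry the save-area base address (bits [47:32]
    // masked from spi_init_hi) plus the stride bits, word 2 is NUM_RECORDS
    // (the access bound, in bytes), and word 3 carries the format plus the
    // ATC/MTYPE cache bits copied from the SPI-provided init value.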
    /* setup Resource Constants */
    s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
    s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
    s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
    s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
    s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
    s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
    s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE

    s_mov_b32 s_save_m0, m0

    /* global mem offset */
    s_mov_b32 s_save_mem_offset, 0x0
    get_wave_size(s_wave_size)

#if ASIC_TARGET_NAVI1X
    // Save and clear vector XNACK state late to free up SGPRs.
    s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
    s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
#endif

    /* save first 4 VGPRs, needed for SGPR save */
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
    s_branch L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR Allocated in 4-GPR granularity

    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
    s_branch L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR Allocated in 4-GPR granularity

    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

    /* save HW registers */

L_SAVE_HWREG:
    // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
    get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
    get_svgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
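
    // With NO_SQC_STORE there are no scalar memory stores, so HWREG and SGPR
    // values are staged one dword per lane into v2 via v_writelane_b32 and
    // flushed with a vector buffer_store_dword each time 32 lanes are full.
    // m0 (for HWREGs) and ttmp13 (in the SGPR loop) track the next lane.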
#if NO_SQC_STORE
    v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource
    v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource
    v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store
    s_mov_b32 m0, 0x0 //Next lane of v2 to write to
#endif

    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

    s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
    write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

    // Not used on Sienna_Cichlid but keep layout same for debugger.
    write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE)
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

    s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
    // Write HWREG/SGPRs with 32 VGPR lanes; wave32 is the common case.
    s_mov_b32 exec_hi, 0x0
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif

    /* save SGPRs */
    // Save SGPRs before the LDS save, so that low SGPRs (s0-s3) can be used during the LDS save...

    // SGPR SR memory offset : size(VGPR)+size(SVGPR)
    get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
    get_svgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

#if NO_SQC_STORE
    s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into
#else
    // back up s_save_buf_rsrc0 into s_save_xnack_mask, since the
    // write_16sgpr_to_mem function will change rsrc0
    s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0
    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
#endif

    s_mov_b32 m0, 0x0 //SGPR initial index value = 0
    s_nop 0x0 //Manually inserted wait states
L_SAVE_SGPR_LOOP:
    // SGPRs are allocated in 16-SGPR granularity
    s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
    s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes been filled?
    s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE

    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80
    s_mov_b32 ttmp13, 0x0
    v_mov_b32 v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:
#endif

    s_add_u32 m0, m0, 16 //next sgpr index
    s_cmp_lt_u32 m0, 96 //scc = (m0 < 96) ? 1 : 0
    s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete?
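
    // 96 SGPRs were saved in 16-SGPR batches above; 12 more (s96..s107)
    // follow below. s108..s127 are not saved, and the restore path makes
    // the same assumption (see the 20*4 adjustment in L_RESTORE_SGPR).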
    //save the remaining 12 SGPRs
    s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
    write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#else
    // restore s_save_buf_rsrc0
    s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask
#endif

    /* save LDS */

L_SAVE_LDS:
    // Change EXEC to all threads...
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
    s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
    s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE

    s_barrier //LDS is used? wait for other waves in the same TG
    s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0 L_SAVE_LDS_DONE

    // only the first wave does the LDS save;

    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
    s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
    get_svgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    //load the per-lane byte address into vgpr v0: v0 = lane_id * 4
    v_mbcnt_lo_u32_b32 v0, -1, 0
    v_mbcnt_hi_u32_b32 v0, -1, v0
    v_mul_u32_u24 v0, 4, v0

    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_mov_b32 m0, 0x0
    s_cbranch_scc1 L_SAVE_LDS_W64

L_SAVE_LDS_W32:
    s_mov_b32 s3, 128
    s_nop 0
    s_nop 0
    s_nop 0
L_SAVE_LDS_LOOP_W32:
    ds_read_b32 v1, v0
    s_waitcnt 0
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

    s_add_u32 m0, m0, s3 //every iteration saves 32 lanes * 4 bytes = 128 bytes
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
    v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
    s_branch L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
    s_mov_b32 s3, 256
    s_nop 0
    s_nop 0
    s_nop 0
L_SAVE_LDS_LOOP_W64:
    ds_read_b32 v1, v0
    s_waitcnt 0
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

    s_add_u32 m0, m0, s3 //every iteration saves 64 lanes * 4 bytes = 256 bytes
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
    v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?

L_SAVE_LDS_DONE:
    /* save the remaining VGPRs */
L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
    s_mov_b32 s_save_mem_offset, (0+128*4) //skip v0..v3, already saved (wave32 stride)
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
    s_mov_b32 s_save_mem_offset, (0+256*4) //skip v0..v3, already saved (wave64 stride)
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC, SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT, SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
    //determine whether it is wave32 or wave64
    s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_SAVE_VGPR_WAVE64

    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR Allocated in 4-GPR granularity

    // VGPR store using dw burst
    s_mov_b32 m0, 0x4 //VGPR initial index value = 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc0 L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
    v_movrels_b32 v0, v0 //v0 = v[0+m0]
    v_movrels_b32 v1, v1 //v1 = v[1+m0]
    v_movrels_b32 v2, v2 //v2 = v[2+m0]
    v_movrels_b32 v3, v3 //v3 = v[3+m0]

    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3

    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete?

    s_branch L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
    s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR store using dw burst
    s_mov_b32 m0, 0x4 //VGPR initial index value = 4
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc0 L_SAVE_VGPR_END

L_SAVE_VGPR_W64_LOOP:
    v_movrels_b32 v0, v0 //v0 = v[0+m0]
    v_movrels_b32 v1, v1 //v1 = v[1+m0]
    v_movrels_b32 v2, v2 //v2 = v[2+m0]
    v_movrels_b32 v3, v3 //v3 = v[3+m0]

    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete?
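
    // Shared VGPRs exist only in wave64 mode (new for gfx10). Their count
    // comes from LDS_ALLOC.VGPR_SHARED_SIZE in units of 8 registers, and
    // they are saved with only the low 32 lanes active (exec_hi = 0),
    // 128 bytes per register.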
    //save the shared VGPRs (new for gfx10)
    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
    s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
    s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_VGPR_END
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
    //m0 now holds the normal vgpr count; add the shared_vgpr count to get the total count.
    //saving the shared_vgprs starts from the index in m0
    s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
    s_mov_b32 exec_lo, 0xFFFFFFFF
    s_mov_b32 exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
    v_movrels_b32 v0, v0 //v0 = v[0+m0]
    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    s_add_u32 m0, m0, 1 //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
    s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
L_SAVE_VGPR_END:
    s_branch L_END_PGM

L_RESTORE:
    /* Setup Resource Constants */
    s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
    s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
    s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
    s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
    s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
    s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
    //determine whether it is wave32 or wave64
    get_wave_size(s_restore_size)

    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0 L_RESTORE_VGPR

    /* restore LDS */
L_RESTORE_LDS:
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
    s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
    s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes

    // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
    //
    get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
    get_svgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_mov_b32 m0, 0x0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 //one dword per lane, written directly to LDS at m0
    s_add_u32 m0, m0, 128 //32 lanes * 4 bytes = 128 bytes per iteration
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
    s_branch L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 //one dword per lane, written directly to LDS at m0
    s_add_u32 m0, m0, 256 //64 lanes * 4 bytes = 256 bytes per iteration
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?

    /* restore VGPRs */
L_RESTORE_VGPR:
    // VGPR SR memory offset : 0
    s_mov_b32 s_restore_mem_offset, 0x0
    s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
    s_mov_b32 exec_hi, 0x00000000
    s_branch L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
    s_mov_b32 exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC, SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT, SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
    //determine whether it is wave32 or wave64
    s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
    s_and_b32 m0, m0, 1
    s_cmp_eq_u32 m0, 1
    s_cbranch_scc1 L_RESTORE_VGPR_WAVE64

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR load using dw burst
    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0..v3 are restored last
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
    s_mov_b32 m0, 4 //VGPR initial index value = 4

L_RESTORE_VGPR_WAVE32_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
    s_waitcnt vmcnt(0)
    v_movreld_b32 v0, v0 //v[0+m0] = v0
    v_movreld_b32 v1, v1
    v_movreld_b32 v2, v2
    v_movreld_b32 v3, v3
    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0..v3) is complete?
    /* VGPR restore on v0..v3 */
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3

    s_branch L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    // VGPR load using dw burst
    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset //restore starts with v4; v0..v3 are restored last
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32 m0, 4 //VGPR initial index value = 4

L_RESTORE_VGPR_WAVE64_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
    s_waitcnt vmcnt(0)
    v_movreld_b32 v0, v0 //v[0+m0] = v0
    v_movreld_b32 v1, v1
    v_movreld_b32 v2, v2
    v_movreld_b32 v3, v3
    s_add_u32 m0, m0, 4 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0..v3) is complete?

    //restore the shared VGPRs (new for gfx10)
    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
    s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used?
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
    //m0 now holds the normal vgpr count; add the shared_vgpr count to get the total count.
    //restoring the shared_vgprs starts from the index in m0
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
    s_mov_b32 exec_lo, 0xFFFFFFFF
    s_mov_b32 exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
    s_waitcnt vmcnt(0)
    v_movreld_b32 v0, v0 //v[0+m0] = v0
    s_add_u32 m0, m0, 1 //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
    s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR restore is complete?

    s_mov_b32 exec_hi, 0xFFFFFFFF //restore exec_hi before restoring v0!!
    /* VGPR restore on v0..v3 */
L_RESTORE_V0:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
    buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
    buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
    buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
    s_waitcnt vmcnt(0)

    /* restore SGPRs */
    //restores 4+8+16*6 = 108 SGPRs in total
    // SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
    get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
    get_svgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    s_mov_b32 m0, s_sgpr_save_num

    read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_waitcnt lgkmcnt(0)

    s_sub_u32 m0, m0, 4 //m0 = 104: restore s[104:107]
    s_nop 0 //hazard: SALU write to M0 => S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2

    read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_waitcnt lgkmcnt(0)

    s_sub_u32 m0, m0, 8 //m0 = 96: restore s[96:103]
    s_nop 0 //hazard: SALU write to M0 => S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2
    s_movreld_b64 s4, s4
    s_movreld_b64 s6, s6

L_RESTORE_SGPR_LOOP:
    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_waitcnt lgkmcnt(0)

    s_sub_u32 m0, m0, 16 //restore s[m0:m0+15], counting m0 down to 0
    s_nop 0 //hazard: SALU write to M0 => S_MOVREL

    s_movreld_b64 s0, s0 //s[0+m0] = s0
    s_movreld_b64 s2, s2
    s_movreld_b64 s4, s4
    s_movreld_b64 s6, s6
    s_movreld_b64 s8, s8
    s_movreld_b64 s10, s10
    s_movreld_b64 s12, s12
    s_movreld_b64 s14, s14

    s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
    s_cbranch_scc0 L_RESTORE_SGPR_LOOP

    // s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
    // Clear DEBUG_EN before and restore MODE after the barrier.
    s_setreg_imm32_b32 hwreg(HW_REG_MODE), 0
    s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG

    /* restore HW registers */
L_RESTORE_HWREG:
    // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
    get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
    get_svgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

    s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes

    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_waitcnt lgkmcnt(0)

    s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

    read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
    s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS

    s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch

    s_mov_b32 m0, s_restore_m0
    s_mov_b32 exec_lo, s_restore_exec_lo
    s_mov_b32 exec_hi, s_restore_exec_hi

    s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0

#if ASIC_TARGET_NAVI1X
    s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
#endif

    s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
    s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
    s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode

    // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
    // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
    get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
    s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
    s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
    s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
    s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
    s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
    s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
    s_waitcnt lgkmcnt(0)
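
    // On save, IB_STS.RCNT/FIRST_REPLAY/REPLAY_W64H were parked in the high
    // bits of the saved PC_HI (the S_SAVE_PC_HI_* fields); move them back
    // into IB_STS here before returning to the program.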
#if ASIC_TARGET_NAVI1X
    s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
    s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
    s_mov_b32 s_restore_tmp, 0x0
    s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
    s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_REPLAY_W64H_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
    s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
    s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0

    s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
    s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
#endif

    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
    s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
    s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which was changed by previous salu

    s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution

L_END_PGM:
    s_endpgm
end

function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
    // Copy into VGPR for later TCP store.
    v_writelane_b32 v2, s, m0
    s_add_u32 m0, m0, 0x1
#else
    s_mov_b32 exec_lo, m0
    s_mov_b32 m0, s_mem_offset
    s_buffer_store_dword s, s_rsrc, m0 glc:1
    s_add_u32 s_mem_offset, s_mem_offset, 4
    s_mov_b32 m0, exec_lo
#endif
end

function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
    // Copy into VGPR for later TCP store.
    for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
        v_writelane_b32 v2, s[sgpr_idx], ttmp13
        s_add_u32 ttmp13, ttmp13, 0x1
    end
#else
    s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
    s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
    s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
    s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
    s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
    s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
    // Copy into VGPR for later TCP store.
    for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
        v_writelane_b32 v2, s[sgpr_idx], ttmp13
        s_add_u32 ttmp13, ttmp13, 0x1
    end
#else
    s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
    s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
    s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
    s_add_u32 s_rsrc[0], s_rsrc[0], 4*12
    s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
    s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_sub_u32 s_mem_offset, s_mem_offset, 4*16
    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_sub_u32 s_mem_offset, s_mem_offset, 4*8
    s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_sub_u32 s_mem_offset, s_mem_offset, 4*4
    s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1
end

function get_lds_size_bytes(s_lds_size_byte)
    s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
    s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 DW * 4 bytes (granularity 64DW)
end

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
    s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC, SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT, SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
    s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
    s_bitcmp1_b32 s_size, S_WAVE_SIZE
    s_cbranch_scc1 L_ENABLE_SHIFT_W64
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of bytes = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value)
    s_branch L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of bytes = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value)
L_SHIFT_DONE:
end
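
// Worked example for get_vgpr_size_bytes: vgpr_size = 7 in wave64 means
// (7+1) allocation units of 4 VGPRs = 32 VGPRs, and the save-area size is
// 32 VGPRs * 64 lanes * 4 bytes = 8 << (2+8) = 8192 bytes.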

function get_svgpr_size_bytes(s_svgpr_size_byte)
    s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
    s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) //bytes = shared_vgpr_size * 8 regs * 32 lanes * 4 bytes
end

function get_sgpr_size_bytes
    return 512
end

function get_hwreg_size_bytes
    return 128
end

function get_wave_size(s_reg)
    s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2, SQ_WAVE_IB_STS2_WAVE64_SHIFT, SQ_WAVE_IB_STS2_WAVE64_SIZE)
    s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE
    s_or_b32 s_reg, s_save_spi_init_hi, s_reg //share with exec_hi; the wave-size bit is at bit 25
end