/*
 * Copyright 2015-2017 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#if 0
HW (VI) source code for CWSR trap handler
#Version 18 + multiple trap handler

// this performance-optimal version was originally from Seven Xu at SRDC

// Revision #18 --...
/* Rev History
** #1. Branch from gc dv.  //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1, #50, #51, #52-53 (Skip, Already Fixed by PV), #54-56 (merged), #57-58 (merged, skipped - already fixed by PV)
** #4. SR Memory Layout:
**      1. VGPR-SGPR-HWREG-{LDS}
**      2. tba_hi.bits.26 - reconfigured as the first-wave-in-tg bit, to defer the LDS save for a threadgroup.. performance concern..
** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer? (No need, already matched the swizzle pattern, more investigation)
** #7. Update: 1. don't barrier if noLDS
** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
**             2. Fix SQ issue by s_sleep 2
** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
**             2. optimize s_buffer save by bursting 16 sgprs...
** #10. Update 1. Optimize restore sgpr by bursting 16 sgprs.
** #11. Update 1. Add 2 more timestamps for debug version
** #12. Update 1. Add VGPR SR using DWx4, some cases improve and some cases drop performance
** #13. Integ  1. Always use MUBUF for PV trap shader...
** #14. Update 1. s_buffer_store soft clause...
** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
** #16. Update 1. PERF - UNROLL LDS_DMA got a 2500-cycle saving in IP tree
** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part [TODO restore part]
**             2. PERF - Save LDS before saving VGPRs to cover the long LDS save latency...
** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
**             2. FUNC - Handle non-CWSR traps
*/

var G8SR_WDMEM_HWREG_OFFSET = 0
var G8SR_WDMEM_SGPR_OFFSET  = 128               // in bytes

// Keep definitions consistent with the app shader; these two timestamps are part of the app shader... They should be taken before any save and after restore.
var G8SR_DEBUG_TIMESTAMP        = 0
var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4          // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
var s_g8sr_ts_save_s      = s[34:35]            // save start
var s_g8sr_ts_sq_save_msg = s[36:37]            // the save shader sends the SAVEWAVE msg to SPI
var s_g8sr_ts_spi_wrexec  = s[38:39]            // SPI writes the sr address to SQ
var s_g8sr_ts_save_d      = s[40:41]            // save end
var s_g8sr_ts_restore_s   = s[42:43]            // restore start
var s_g8sr_ts_restore_d   = s[44:45]            // restore end

var G8SR_VGPR_SR_IN_DWX4 = 0
var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000       // DWx4 stride is 4*4 bytes
var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4


/*************************************************************************/
/*                  control on how to run the shader                     */
/*************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in EMU run)
var EMU_RUN_HACK                    = 0
var EMU_RUN_HACK_RESTORE_NORMAL     = 0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT   = 0
var EMU_RUN_HACK_SAVE_SINGLE_WAVE   = 0
var EMU_RUN_HACK_SAVE_FIRST_TIME    = 0         //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0     //for interrupted restore in which the first save is through EMU_RUN_HACK
var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0     //for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS                        = 1
var WG_BASE_ADDR_LO                 = 0x9000a000
var WG_BASE_ADDR_HI                 = 0x0
var WAVE_SPACE                      = 0x5000    //memory size that each wave occupies in workgroup state mem
var CTX_SAVE_CONTROL                = 0x0
var CTX_RESTORE_CONTROL             = CTX_SAVE_CONTROL
var SIM_RUN_HACK                    = 0         //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in RTL run)
var SGPR_SAVE_USE_SQC               = 1         //use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF      = 0         //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
var SWIZZLE_EN                      = 0         //whether we use swizzled buffer addressing

/**************************************************************************/
/*                          variables                                     */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT   = 23
var SQ_WAVE_STATUS_INST_ATC_MASK    = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK    = 0x00000006

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT    = 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE     = 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT   = 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE    = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT   = 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE    = 3     //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits

var SQ_WAVE_TRAPSTS_SAVECTX_MASK    = 0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK       = 0x1FF     // Exception mask
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT   = 10
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK   = 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT  = 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK    = 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT   = 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE    = 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK   = 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT  = 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE   = 21

var SQ_WAVE_IB_STS_RCNT_SHIFT           = 16    //FIXME
var SQ_WAVE_IB_STS_RCNT_SIZE            = 4     //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT   = 15    //FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    = 1     //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG   = 0x00007FFF    //FIXME

var SQ_BUF_RSRC_WORD1_ATC_SHIFT     = 24
var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   = 27
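
// Illustrative only (not part of the trap handler): a minimal C sketch of how
// the TRAPSTS fields defined above decompose, mirroring the s_getreg/s_and
// checks used later in the handler. Helper names are hypothetical.
//
//   static inline int trapsts_savectx(uint32_t trapsts)
//   {
//       return (trapsts & SQ_WAVE_TRAPSTS_SAVECTX_MASK) >> SQ_WAVE_TRAPSTS_SAVECTX_SHIFT;   // bit 10
//   }
//   static inline int trapsts_mem_viol(uint32_t trapsts)
//   {
//       return (trapsts & SQ_WAVE_TRAPSTS_MEM_VIOL_MASK) >> SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT; // bit 8
//   }
//   static inline int trapsts_excp(uint32_t trapsts)
//   {
//       return trapsts & SQ_WAVE_TRAPSTS_EXCE_MASK;      // bits [8:0]; non-zero means exception
//   }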

/*      Save        */
var S_SAVE_BUF_RSRC_WORD1_STRIDE    = 0x00040000    //stride is 4 bytes
var S_SAVE_BUF_RSRC_WORD3_MISC      = 0x00807FAC    //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE

var S_SAVE_SPI_INIT_ATC_MASK        = 0x08000000    //bit[27]: ATC bit
var S_SAVE_SPI_INIT_ATC_SHIFT       = 27
var S_SAVE_SPI_INIT_MTYPE_MASK      = 0x70000000    //bit[30:28]: Mtype
var S_SAVE_SPI_INIT_MTYPE_SHIFT     = 28
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000    //bit[26]: FirstWaveInTG
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_SAVE_PC_HI_RCNT_SHIFT         = 28            //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK          = 0xF0000000    //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27            //FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK  = 0x08000000    //FIXME

var s_save_spi_init_lo      = exec_lo
var s_save_spi_init_hi      = exec_hi

                                                    //tba_lo and tba_hi need to be saved/restored
var s_save_pc_lo            = ttmp0                 //{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
var s_save_pc_hi            = ttmp1
var s_save_exec_lo          = ttmp2
var s_save_exec_hi          = ttmp3
var s_save_status           = ttmp4
var s_save_trapsts          = ttmp5                 //not really used until the end of the SAVE routine
var s_save_xnack_mask_lo    = ttmp6
var s_save_xnack_mask_hi    = ttmp7
var s_save_buf_rsrc0        = ttmp8
var s_save_buf_rsrc1        = ttmp9
var s_save_buf_rsrc2        = ttmp10
var s_save_buf_rsrc3        = ttmp11

var s_save_mem_offset       = tma_lo
var s_save_alloc_size       = s_save_trapsts        //conflict
var s_save_tmp              = s_save_buf_rsrc2      //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
var s_save_m0               = tma_hi

/*      Restore     */
var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC   = S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_ATC_MASK     = 0x08000000    //bit[27]: ATC bit
var S_RESTORE_SPI_INIT_ATC_SHIFT    = 27
var S_RESTORE_SPI_INIT_MTYPE_MASK   = 0x70000000    //bit[30:28]: Mtype
var S_RESTORE_SPI_INIT_MTYPE_SHIFT  = 28
var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK  = 0x04000000    //bit[26]: FirstWaveInTG
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26

var S_RESTORE_PC_HI_RCNT_SHIFT      = S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK       = S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT  = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK   = S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo   = exec_lo
var s_restore_spi_init_hi   = exec_hi

var s_restore_mem_offset    = ttmp2
var s_restore_alloc_size    = ttmp3
var s_restore_tmp           = ttmp6                 //tba_lo/hi need to be restored
var s_restore_mem_offset_save = s_restore_tmp       //no conflict

var s_restore_m0            = s_restore_alloc_size  //no conflict

var s_restore_mode          = ttmp7

var s_restore_pc_lo         = ttmp0
var s_restore_pc_hi         = ttmp1
var s_restore_exec_lo       = tma_lo                //no conflict
var s_restore_exec_hi       = tma_hi                //no conflict
var s_restore_status        = ttmp4
var s_restore_trapsts       = ttmp5
var s_restore_xnack_mask_lo = xnack_mask_lo
var s_restore_xnack_mask_hi = xnack_mask_hi
var s_restore_buf_rsrc0     = ttmp8
var s_restore_buf_rsrc1     = ttmp9
var s_restore_buf_rsrc2     = ttmp10
var s_restore_buf_rsrc3     = ttmp11
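
// Illustrative only: the saved PC_HI doubles as storage for IB_STS.RCNT and
// IB_STS.FIRST_REPLAY, since PC is only 48 bits wide and the upper bits of
// pc_hi are otherwise unused. A minimal C sketch of the packing done in the
// save path and undone in the restore path (hypothetical helper names):
//
//   static inline uint32_t pack_pc_hi(uint32_t pc_hi, uint32_t rcnt, uint32_t first_replay)
//   {
//       pc_hi &= 0x0000ffff;                                      // PC[47:32]
//       pc_hi |= rcnt << S_SAVE_PC_HI_RCNT_SHIFT;                 // bits [31:28]
//       pc_hi |= first_replay << S_SAVE_PC_HI_FIRST_REPLAY_SHIFT; // bit [27]
//       return pc_hi;
//   }
//   static inline uint32_t unpack_ib_sts(uint32_t pc_hi)
//   {
//       uint32_t rcnt = (pc_hi & S_SAVE_PC_HI_RCNT_MASK) >> S_SAVE_PC_HI_RCNT_SHIFT;
//       uint32_t fr   = (pc_hi & S_SAVE_PC_HI_FIRST_REPLAY_MASK) >> S_SAVE_PC_HI_FIRST_REPLAY_SHIFT;
//       return (rcnt << SQ_WAVE_IB_STS_RCNT_SHIFT) | (fr << SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT);
//   }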

/**************************************************************************/
/*                      trap handler entry points                         */
/**************************************************************************/
/* Shader Main */

shader main
    asic(VI)
    type(CS)


    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))       //hack to use trap_id for determining save/restore
        //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS)     //save STATUS since we will change SCC
        s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000          //change SCC
        s_cmp_eq_u32 s_save_tmp, 0x007e0000                     //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
        s_cbranch_scc0 L_JUMP_TO_RESTORE                        //do not need to recover STATUS here since we are going to RESTORE
        //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status    //need to recover STATUS since we are going to SAVE
        s_branch L_SKIP_RESTORE                                 //NOT restore, SAVE actually
    else
        s_branch L_SKIP_RESTORE                                 //NOT restore. might be a regular trap or save
    end

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE                                          //restore

L_SKIP_RESTORE:

    s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS)            //save STATUS since we will change SCC
    s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //clear SPI_PRIO in the saved STATUS
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK      //check whether this is for save
    s_cbranch_scc1 L_SAVE                                       //this is the operation for save

    // *********    Handle non-CWSR traps       *******************
if (!EMU_RUN_HACK)
    /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
    s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [tma_lo, tma_hi], 0
    s_waitcnt lgkmcnt(0)
    s_or_b32 ttmp7, ttmp8, ttmp9
    s_cbranch_scc0 L_NO_NEXT_TRAP                               //next level trap handler has not been set
    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status            //restore HW status (SCC)
    s_setpc_b64 [ttmp8, ttmp9]                                  //jump to next level trap handler

L_NO_NEXT_TRAP:
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK         // Check whether it is an exception
    s_cbranch_scc1 L_EXCP_CASE                                  // Exception, jump back to the shader program directly.
    s_add_u32 ttmp0, ttmp0, 4                                   // S_TRAP case, add 4 to ttmp0
    s_addc_u32 ttmp1, ttmp1, 0
L_EXCP_CASE:
    s_and_b32 ttmp1, ttmp1, 0xFFFF
    s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status            //restore HW status (SCC)
    s_rfe_b64 [ttmp0, ttmp1]
end
    // *********    End handling of non-CWSR traps   *******************
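
// Illustrative only: the non-CWSR path above dispatches to a second-level
// handler through the memory pointed to by TMA. A minimal C view of the
// 16-byte block fetched by the s_load_dwordx4; field meanings are assumed
// from the comment "read tba and tma for next level trap handler":
//
//   struct next_level_trap_desc {
//       uint32_t tba_lo;   // -> ttmp8, entry address of the next level handler
//       uint32_t tba_hi;   // -> ttmp9, (tba_lo | tba_hi) == 0 means "not installed"
//       uint32_t tma_lo;   // -> ttmp10
//       uint32_t tma_hi;   // -> ttmp11
//   };
//
// If tba is zero, the trap is resolved locally: PC+4 for the s_trap case (to
// step over the trap instruction), while an exception returns to the saved PC.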

/**************************************************************************/
/*                          save routine                                  */
/**************************************************************************/

L_SAVE:

if G8SR_DEBUG_TIMESTAMP
    s_memrealtime s_g8sr_ts_save_s
    s_waitcnt lgkmcnt(0)            //FIXME, will cause xnack??
end

    //check whether there is mem_viol
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
    s_cbranch_scc0 L_NO_PC_REWIND

    //if so, need to rewind PC assuming the GDS operation gets NACKed
    s_mov_b32 s_save_tmp, 0                                                                 //clear mem_viol bit
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp       //clear mem_viol bit
    s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
    s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
    s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0          // -scc

L_NO_PC_REWIND:
    s_mov_b32 s_save_tmp, 0                                                                 //clear saveCtx bit
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp        //clear saveCtx bit

    s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo       //save XNACK_MASK
    s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi       //saving XNACK must happen before any memory operation
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)      //save RCNT
    s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
    s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)  //save FIRST_REPLAY
    s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
    s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS)       //clear RCNT and FIRST_REPLAY in IB_STS
    s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

    s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp

    /*  inform SPI of the readiness and wait for SPI's go signal */
    s_mov_b32 s_save_exec_lo, exec_lo       //save EXEC and use EXEC for the go signal from SPI
    s_mov_b32 s_save_exec_hi, exec_hi
    s_mov_b64 exec, 0x0                     //clear EXEC to get ready to receive

if G8SR_DEBUG_TIMESTAMP
    s_memrealtime s_g8sr_ts_sq_save_msg
    s_waitcnt lgkmcnt(0)
end

    if (EMU_RUN_HACK)

    else
        s_sendmsg sendmsg(MSG_SAVEWAVE)     //send SPI a message and wait for SPI's write to EXEC
    end

  L_SLEEP:
    s_sleep 0x2     // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause an SQ hang: the 7th/8th wave could not get arbitration to execute instructions while the other waves are stuck in the sleep loop waiting for wrexec!=0

    if (EMU_RUN_HACK)

    else
        s_cbranch_execz L_SLEEP
    end

if G8SR_DEBUG_TIMESTAMP
    s_memrealtime s_g8sr_ts_spi_wrexec
    s_waitcnt lgkmcnt(0)
end

    /*  setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
        //calculate wd_addr using absolute thread id
        v_readlane_b32 s_save_tmp, v9, 0
        s_lshr_b32 s_save_tmp, s_save_tmp, 6
        s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
    else
    end
    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
    else
    end


    s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo                          //base_addr_lo
    s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF              //base_addr_hi
    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_save_buf_rsrc2, 0                                           //NUM_RECORDS initial value = 0 (in bytes) although not necessarily inited
    s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
    s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)      //get ATC bit into position
    s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp                 //or ATC
    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
    s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)  //get MTYPE bits into position
    s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp                 //or MTYPE

    //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
    s_mov_b32 s_save_m0, m0                                                 //save M0

    /*  global mem offset   */
    s_mov_b32 s_save_mem_offset, 0x0                                        //mem offset initial value = 0




    /*  save HW registers   */
    //////////////////////////////

  L_SAVE_HWREG:
        // HWREG SR memory offset : size(VGPR)+size(SGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp


    s_mov_b32 s_save_buf_rsrc2, 0x4                             //NUM_RECORDS in bytes
    if (SWIZZLE_EN)
        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
    end


    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)      //M0

    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4                 //pc[31:0]+4
        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0              //carry bit over
        s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
        s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
    end

    write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)   //PC
    write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
    write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)  //STATUS

    //s_save_trapsts conflicts with s_save_alloc_size
    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
    write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS

    write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset)   //XNACK_MASK_LO
    write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset)   //XNACK_MASK_HI

    //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
    s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE)                              //MODE
    write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
    write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset)         //TBA_LO
    write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset)         //TBA_HI



    /*  the first wave in the threadgroup   */
        // save the first_wave bit (historically in tba_hi's unused bit 26)
    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK   // extract first wave bit
    //s_or_b32 tba_hi, s_save_tmp, tba_hi                       // save first wave bit to tba_hi.bits[26]
    s_mov_b32 s_save_exec_hi, 0x0
    s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi         // save first wave bit to s_save_exec_hi.bits[26]
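
// Illustrative only: the dword order the HWREG area ends up with, given the
// write_hwreg_to_mem() sequence above (each call stores one dword and advances
// the offset by 4; get_hwreg_size_bytes() reserves 128 bytes, so the tail of
// the area is padding). A hypothetical C mirror:
//
//   enum hwreg_save_slot {               // dword index within the HWREG area
//       HWREG_M0 = 0,
//       HWREG_PC_LO, HWREG_PC_HI,
//       HWREG_EXEC_LO, HWREG_EXEC_HI,
//       HWREG_STATUS, HWREG_TRAPSTS,
//       HWREG_XNACK_MASK_LO, HWREG_XNACK_MASK_HI,
//       HWREG_MODE,
//       HWREG_TBA_LO, HWREG_TBA_HI,      // 12 dwords used of 128/4 = 32 slots
//   };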

    /*  save SGPRs  */
        // Save SGPRs before the LDS save, so that s0 to s4 can be used during the LDS save...
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)  //sgpr_size
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4          //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

    if (SGPR_SAVE_USE_SQC)
        s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2       //NUM_RECORDS in bytes
    else
        s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8       //NUM_RECORDS in bytes (64 threads)
    end

    if (SWIZZLE_EN)
        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
    end


    // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since the write_16sgpr_to_mem function will change rsrc0
    //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
    s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
    s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
    s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0

    s_mov_b32 m0, 0x0                       //SGPR initial index value = 0
  L_SAVE_SGPR_LOOP:
    // SGPRs are allocated in 16-SGPR granularity
    s_movrels_b64 s0, s0                    //s0 = s[0+m0], s1 = s[1+m0]
    s_movrels_b64 s2, s2                    //s2 = s[2+m0], s3 = s[3+m0]
    s_movrels_b64 s4, s4                    //s4 = s[4+m0], s5 = s[5+m0]
    s_movrels_b64 s6, s6                    //s6 = s[6+m0], s7 = s[7+m0]
    s_movrels_b64 s8, s8                    //s8 = s[8+m0], s9 = s[9+m0]
    s_movrels_b64 s10, s10                  //s10 = s[10+m0], s11 = s[11+m0]
    s_movrels_b64 s12, s12                  //s12 = s[12+m0], s13 = s[13+m0]
    s_movrels_b64 s14, s14                  //s14 = s[14+m0], s15 = s[15+m0]

    write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)    //PV: the best performance should be using s_buffer_store_dwordx4
    s_add_u32 m0, m0, 16                    //next sgpr index
    s_cmp_lt_u32 m0, s_save_alloc_size      //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_SGPR_LOOP         //SGPR save is complete?
    // restore s_save_buf_rsrc0,1
    //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
    s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo




    /*  save the first 4 VGPRs, so that the LDS save can use them   */
        // each wave will alloc 4 vgprs at least...
    /////////////////////////////////////////////////////////////////////////////////////

    s_mov_b32 s_save_mem_offset, 0
    s_mov_b32 exec_lo, 0xFFFFFFFF           //need every thread from now on
    s_mov_b32 exec_hi, 0xFFFFFFFF

    if (SWIZZLE_EN)
        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
    end


    // VGPRs are allocated in 4-VGPR granularity

if G8SR_VGPR_SR_IN_DWX4
        // the const stride for DWx4 is 4*4 bytes
        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
        s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4   // const stride to 4*4 bytes

        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
        s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE           // reset const stride to 4 bytes
else
        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
end
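
// Illustrative only: why consecutive VGPRs are stored offset:256 apart. With
// ADD_TID_ENABLE set and a 4-byte const stride in the buffer resource, each
// lane's address works out roughly as below (a sketch under those assumptions,
// not the full hardware addressing formula):
//
//   // per-lane byte address for one buffer_store_dword of VGPR v[n]
//   uint64_t vgpr_addr(uint64_t base, uint32_t mem_offset, uint32_t n, uint32_t tid)
//   {
//       const uint32_t stride = 4;            // S_SAVE_BUF_RSRC_WORD1_STRIDE
//       return base + mem_offset + n * 256    // 64 lanes * 4 bytes per VGPR row
//                   + tid * stride;           // ADD_TID_ENABLE contribution
//   }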



    /*  save LDS    */
    //////////////////////////////

  L_SAVE_LDS:

        // Change EXEC to all threads...
    s_mov_b32 exec_lo, 0xFFFFFFFF           //need every thread from now on
    s_mov_b32 exec_hi, 0xFFFFFFFF

    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)    //lds_size
    s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF      //lds_size is zero?
    s_cbranch_scc0 L_SAVE_LDS_DONE          //no lds used? jump to L_SAVE_LDS_DONE

    s_barrier                               //LDS is used? wait for other waves in the same TG
    //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK         //exec is still used here
    s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK   //exec is still used here
    s_cbranch_scc0 L_SAVE_LDS_DONE

        // the first wave does the LDS save;

    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6      //LDS size in dwords = lds_size * 64dw
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2      //LDS size in bytes
    s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size           //NUM_RECORDS in bytes

        // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
        //
    get_vgpr_size_bytes(s_save_mem_offset)
    get_sgpr_size_bytes(s_save_tmp)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
    s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


    if (SWIZZLE_EN)
        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0   //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_save_buf_rsrc2, 0x1000000               //NUM_RECORDS in bytes
    end

    s_mov_b32 m0, 0x0                                       //lds_offset initial value = 0


var LDS_DMA_ENABLE = 0
var UNROLL = 0
if UNROLL==0 && LDS_DMA_ENABLE==1
    s_mov_b32 s3, 256*2
    s_nop 0
    s_nop 0
    s_nop 0
  L_SAVE_LDS_LOOP:
    //TODO: it looks like the 2-instruction buffer_store/load clause for s/r will hurt performance.???
    if (SAVE_LDS)   //SPI always allocs LDS space in 128DW granularity
        buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1                // first 64DW
        buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256     // second 64DW
    end

    s_add_u32 m0, m0, s3                                    //every buffer_store_lds does 256 bytes
    s_add_u32 s_save_mem_offset, s_save_mem_offset, s3      //mem offset increased by 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size                      //scc=(m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_LDS_LOOP                          //LDS save is complete?

elsif LDS_DMA_ENABLE==1 && UNROLL==1        // UNROLL, has icache miss
    // store from the highest LDS address to the lowest
    s_mov_b32 s3, 256*2
    s_sub_u32 m0, s_save_alloc_size, s3
    s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
    s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9      // how many 128-DW chunks...
    s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size     // store from the highest addr to the lowest
    s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4     // PC offset increment, each LDS save block costs 6*4 bytes of instructions
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4     // plus the 3 dwords below (s_add/s_addc/s_setpc)
    s_nop 0
    s_nop 0
    s_nop 0                                                 //pad 3 dwords to let LDS_DMA align with 64 bytes
    s_getpc_b64 s[0:1]                                      // reuse s[0:1], since s[0:1] are already saved
    s_add_u32 s0, s0, s_save_alloc_size
    s_addc_u32 s1, s1, 0
    s_setpc_b64 s[0:1]


    for var i = 0; i < 128; i++
        // be careful to make this a 64-byte aligned address, which could improve performance...
        buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0       // first 64DW
        buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256     // second 64DW

        if i!=127
            s_sub_u32 m0, m0, s3    // use an sgpr to shrink the 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA insts into one cacheline
            s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
        end
    end

else    // BUFFER_STORE
    v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
    v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2   // tid
    v_mul_i32_i24 v2, v3, 8                 // tid*8
    v_mov_b32 v3, 256*2
    s_mov_b32 m0, 0x10000
    s_mov_b32 s0, s_save_buf_rsrc3
    s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF    // disable add_tid
    s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000        //DFMT

L_SAVE_LDS_LOOP_VECTOR:
    ds_read_b64 v[0:1], v2                  //x = LDS[a], byte address
    s_waitcnt lgkmcnt(0)
    buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
//  s_waitcnt vmcnt(0)
    v_add_u32 v2, vcc[0:1], v2, v3
    v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
    s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR

    // restore rsrc3
    s_mov_b32 s_save_buf_rsrc3, s0

end

L_SAVE_LDS_DONE:
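
// Illustrative only: a C model of the BUFFER_STORE loop above. Each lane owns
// an 8-byte slice (ds_read_b64 at byte address tid*8, stored as a dwordx2), so
// one iteration of the wave moves 64 lanes * 8 bytes = 512 bytes, matching the
// v3 = 256*2 address increment:
//
//   for (uint32_t a = tid * 8; a < lds_bytes; a += 512)          // per lane
//       store64(save_base + mem_offset + a, lds_read64(a));      // hypothetical helpers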


    /*  save VGPRs - set the rest VGPRs */
    //////////////////////////////////////////////////////////////////////////////////////
  L_SAVE_VGPR:
    // VGPR SR memory offset: 0
    // TODO rearrange the RSRC words to use swizzle for VGPR save...

    s_mov_b32 s_save_mem_offset, (0+256*4)  // for the rest VGPRs
    s_mov_b32 exec_lo, 0xFFFFFFFF           //need every thread from now on
    s_mov_b32 exec_hi, 0xFFFFFFFF

    s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)  //vgpr_size
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
    s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2      //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)    //FIXME for GFX, zero is possible
    s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8       //NUM_RECORDS in bytes (64 threads*4)
    if (SWIZZLE_EN)
        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0   //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_save_buf_rsrc2, 0x1000000               //NUM_RECORDS in bytes
    end


    // VGPRs are allocated in 4-VGPR granularity

if G8SR_VGPR_SR_IN_DWX4
        // the const stride for DWx4 is 4*4 bytes
        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
        s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4   // const stride to 4*4 bytes

        s_mov_b32 m0, 4                     // skip the first 4 VGPRs
        s_cmp_lt_u32 m0, s_save_alloc_size
        s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs

        s_set_gpr_idx_on m0, 0x1            // This will change M0
        s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000      // because the above inst changes m0
L_SAVE_VGPR_LOOP:
        v_mov_b32 v0, v0                    // v0 = v[0+m0]
        v_mov_b32 v1, v1
        v_mov_b32 v2, v2
        v_mov_b32 v3, v3


        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        s_add_u32 m0, m0, 4
        s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
        s_cmp_lt_u32 m0, s_save_alloc_size
        s_cbranch_scc1 L_SAVE_VGPR_LOOP     //VGPR save is complete?
        s_set_gpr_idx_off
L_SAVE_VGPR_LOOP_END:

        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
        s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE           // reset const stride to 4 bytes
else
    // VGPR store using dw burst
    s_mov_b32 m0, 0x4                       //VGPR initial index value = 4 (the first 4 VGPRs are already saved)
    s_cmp_lt_u32 m0, s_save_alloc_size
    s_cbranch_scc0 L_SAVE_VGPR_END


    s_set_gpr_idx_on m0, 0x1                //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
    s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000          //add 0x1000 since we compare m0 against it later

  L_SAVE_VGPR_LOOP:
    v_mov_b32 v0, v0                        //v0 = v[0+m0]
    v_mov_b32 v1, v1                        //v1 = v[1+m0]
    v_mov_b32 v2, v2                        //v2 = v[2+m0]
    v_mov_b32 v3, v3                        //v3 = v[3+m0]

    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format:BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
        buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
        buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
        buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
    end

    s_add_u32 m0, m0, 4                     //next vgpr index
    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4           //every buffer_store_dword does 256 bytes
    s_cmp_lt_u32 m0, s_save_alloc_size      //scc = (m0 < s_save_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_SAVE_VGPR_LOOP         //VGPR save is complete?
    s_set_gpr_idx_off
end

L_SAVE_VGPR_END:




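// Illustrative only: a C model of the loop-bound adjustment used above. With
// s_set_gpr_idx_on, M0[15:12] holds the gpr-index mode (0x1 here, per the
// inline comment), so M0 reads back as 0x1000 + vgpr_index; bumping the bound
// by 0x1000 keeps the unsigned compare meaningful:
//
//   uint32_t m0 = 0x1000 | 4;                    // idx mode in [15:12], start at v4
//   uint32_t bound = num_vgprs + 0x1000;
//   for (; m0 < bound; m0 += 4) { /* store v[(m0 & 0xff) .. +3] */ }
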

    /*  S_PGM_END_SAVED  */                 //FIXME graphics ONLY
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
        s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
        s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
    else
    end

// Save Done timestamp
if G8SR_DEBUG_TIMESTAMP
    s_memrealtime s_g8sr_ts_save_d
    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_save_mem_offset)
    s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
    s_waitcnt lgkmcnt(0)                    //FIXME, will cause xnack??
    // Need to reset rsrc2??
    s_mov_b32 m0, s_save_mem_offset
    s_mov_b32 s_save_buf_rsrc2, 0x1000000   //NUM_RECORDS in bytes
    s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
end


    s_branch L_END_PGM



/**************************************************************************/
/*                          restore routine                               */
/**************************************************************************/

L_RESTORE:
    /*  Setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
        //calculate wd_addr using absolute thread id
        v_readlane_b32 s_restore_tmp, v9, 0
        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
        s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
        s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
        s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
        s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
    else
    end

if G8SR_DEBUG_TIMESTAMP
    s_memrealtime s_g8sr_ts_restore_s
    s_waitcnt lgkmcnt(0)                    //FIXME, will cause xnack??
    // tma_lo/hi are sgprs 110, 111, which will not be used in the 112-SGPR-allocated case...
    s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
    s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //backup ts to ttmp0/1, since exec will be finally restored..
end



    s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo                    //base_addr_lo
    s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF        //base_addr_hi
    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32 s_restore_buf_rsrc2, 0                                        //NUM_RECORDS initial value = 0 (in bytes)
    s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
    s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)         //get ATC bit into position
    s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp        //or ATC
    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
    s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)     //get MTYPE bits into position
    s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp        //or MTYPE

    /*  global mem offset   */
//  s_mov_b32 s_restore_mem_offset, 0x0     //mem offset initial value = 0

    /*  the first wave in the threadgroup   */
    s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
    s_cbranch_scc0 L_RESTORE_VGPR

    /*  restore LDS     */
    //////////////////////////////
  L_RESTORE_LDS:

    s_mov_b32 exec_lo, 0xFFFFFFFF           //need every thread from now on   //be consistent with SAVE although it can be moved ahead
    s_mov_b32 exec_hi, 0xFFFFFFFF

    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)     //lds_size
    s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF        //lds_size is zero?
    s_cbranch_scc0 L_RESTORE_VGPR           //no lds used? jump to L_RESTORE_VGPR
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6    //LDS size in dwords = lds_size * 64dw
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2    //LDS size in bytes
    s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size         //NUM_RECORDS in bytes

        // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
        //
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()    //FIXME, Check if offset overflow???


    if (SWIZZLE_EN)
        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_restore_buf_rsrc2, 0x1000000                //NUM_RECORDS in bytes
    end
    s_mov_b32 m0, 0x0                       //lds_offset initial value = 0

  L_RESTORE_LDS_LOOP:
    if (SAVE_LDS)
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1               // first 64DW
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256    // second 64DW
    end
    s_add_u32 m0, m0, 256*2                                     // 128 DW
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
    s_cmp_lt_u32 m0, s_restore_alloc_size   //scc=(m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_LDS_LOOP       //LDS restore is complete?


    /*  restore VGPRs   */
    //////////////////////////////
  L_RESTORE_VGPR:
        // VGPR SR memory offset : 0
    s_mov_b32 s_restore_mem_offset, 0x0
    s_mov_b32 exec_lo, 0xFFFFFFFF           //need every thread from now on   //be consistent with SAVE although it can be moved ahead
    s_mov_b32 exec_hi, 0xFFFFFFFF

    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)   //vgpr_size
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2    //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
    s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8     //NUM_RECORDS in bytes (64 threads*4)
    if (SWIZZLE_EN)
        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_restore_buf_rsrc2, 0x1000000                //NUM_RECORDS in bytes
    end

if G8SR_VGPR_SR_IN_DWX4
    get_vgpr_size_bytes(s_restore_mem_offset)
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4

    // the const stride for DWx4 is 4*4 bytes
    s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF      // reset const stride to 0
    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes

    s_mov_b32 m0, s_restore_alloc_size
    s_set_gpr_idx_on m0, 0x8                // Note.. This will change m0

L_RESTORE_VGPR_LOOP:
    buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
    s_waitcnt vmcnt(0)
    s_sub_u32 m0, m0, 4
    v_mov_b32 v0, v0                        // v[0+m0] = v0
    v_mov_b32 v1, v1
    v_mov_b32 v2, v2
    v_mov_b32 v3, v3
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_cmp_eq_u32 m0, 0x8000
    s_cbranch_scc0 L_RESTORE_VGPR_LOOP
    s_set_gpr_idx_off

    s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF      // reset const stride to 0
    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE          // reset const stride to 4 bytes

else
    // VGPR load using dw burst
    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset   // restore starts with v4; v0..v3 will be the last
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
    s_mov_b32 m0, 4                         //VGPR initial index value = 4
    s_set_gpr_idx_on m0, 0x8                //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000    //add 0x8000 since we compare m0 against it later

  L_RESTORE_VGPR_LOOP:
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format:BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
    end
    s_waitcnt vmcnt(0)                      //ensure data ready
    v_mov_b32 v0, v0                        //v[0+m0] = v0
    v_mov_b32 v1, v1
    v_mov_b32 v2, v2
    v_mov_b32 v3, v3
    s_add_u32 m0, m0, 4                     //next vgpr index
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4     //every buffer_load_dword does 256 bytes
    s_cmp_lt_u32 m0, s_restore_alloc_size   //scc = (m0 < s_restore_alloc_size) ? 1 : 0
    s_cbranch_scc1 L_RESTORE_VGPR_LOOP      //VGPR restore (except v0..v3) is complete?
    s_set_gpr_idx_off
    /* VGPR restore on v0 */
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format:BUF_DATA_FORMAT_32 slc:1 glc:1
    else
        buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
        buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
        buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
        buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
    end

end
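
// Illustrative only: the ordering constraint behind "v0..v3 will be the last".
// Every buffer_load above uses v0 as its per-lane address operand, so v0..v3
// (the first 4-VGPR group, saved separately at offset 0) can only be loaded
// once all higher VGPRs have been pulled in; roughly:
//
//   for (n = 4; n < num_vgprs; n += 4)
//       load_group(n, mem_offset_of(n));        // v0 still live as the address
//   load_group(0, saved_mem_offset);            // finally clobber v0..v3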

    /*  restore SGPRs   */
    //////////////////////////////

    // SGPR SR memory offset : size(VGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4      // restore SGPRs from S[n] to S[0], in groups of 16 sgprs
    // TODO, change RSRC word to rearrange memory layout for SGPRS

    s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)   //sgpr_size
    s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
    s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4        //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

    if (SGPR_SAVE_USE_SQC)
        s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2     //NUM_RECORDS in bytes
    else
        s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8     //NUM_RECORDS in bytes (64 threads)
    end
    if (SWIZZLE_EN)
        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_restore_buf_rsrc2, 0x1000000                    //NUM_RECORDS in bytes
    end

    /* If 112 SGPRs are allocated, 4 sgprs are not used: TBA(108,109), TMA(110,111).
       However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG
    */
    s_mov_b32 m0, s_restore_alloc_size

  L_RESTORE_SGPR_LOOP:
    read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)     //PV: further performance improvement can be made
    s_waitcnt lgkmcnt(0)                    //ensure data ready

    s_sub_u32 m0, m0, 16                    // Restore from S[n] to S[0]

    s_movreld_b64 s0, s0                    //s[0+m0] = s0
    s_movreld_b64 s2, s2
    s_movreld_b64 s4, s4
    s_movreld_b64 s6, s6
    s_movreld_b64 s8, s8
    s_movreld_b64 s10, s10
    s_movreld_b64 s12, s12
    s_movreld_b64 s14, s14

    s_cmp_eq_u32 m0, 0                      //scc = (m0 == 0) ? 1 : 0
    s_cbranch_scc0 L_RESTORE_SGPR_LOOP      //SGPR restore is complete?

    /*  restore HW registers    */
    //////////////////////////////
  L_RESTORE_HWREG:


if G8SR_DEBUG_TIMESTAMP
    s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
    s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
end

    // HWREG SR memory offset : size(VGPR)+size(SGPR)
    get_vgpr_size_bytes(s_restore_mem_offset)
    get_sgpr_size_bytes(s_restore_tmp)
    s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp


    s_mov_b32 s_restore_buf_rsrc2, 0x4      //NUM_RECORDS in bytes
    if (SWIZZLE_EN)
        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0     //FIXME need to use swizzle to enable bounds checking?
    else
        s_mov_b32 s_restore_buf_rsrc2, 0x1000000                    //NUM_RECORDS in bytes
    end

    read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)        //M0
    read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)     //PC
    read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)   //EXEC
    read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
    read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)    //STATUS
    read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)   //TRAPSTS
    read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset)       //XNACK_MASK_LO
    read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset)       //XNACK_MASK_HI
    read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)      //MODE
    read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset)              //TBA_LO
    read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset)              //TBA_HI

    s_waitcnt lgkmcnt(0)                    //from now on, it is safe to restore STATUS and IB_STS

    s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff      //pc[47:32]   //Do it here in order not to affect STATUS

    //for normal save & restore, the saved PC points to the next inst to execute, so no adjustment needs to be made; otherwise:
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8           //pc[31:0]+8   //two back-to-back s_traps are used (first for save and second for restore)
        s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
    end
    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4           //pc[31:0]+4   //save is hacked through s_trap but restore is normal
        s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
    end

    s_mov_b32 m0, s_restore_m0
    s_mov_b32 exec_lo, s_restore_exec_lo
    s_mov_b32 exec_hi, s_restore_exec_hi

    s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
    s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
    s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
    //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts     //don't overwrite the SAVECTX bit as it may be set through external SAVECTX during restore
    s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
    //reuse s_restore_m0 as a temp register
    s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
    s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
    s_mov_b32 s_restore_tmp, 0x0            //IB_STS is zero
    s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
    s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
    s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
    s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
    s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp

    s_and_b64 exec, exec, exec              // Restore STATUS.EXECZ, not writable by s_setreg_b32
    s_and_b64 vcc, vcc, vcc                 // Restore STATUS.VCCZ, not writable by s_setreg_b32
    s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status     // SCC is included, which was changed by the previous salu

    s_barrier       //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time

if G8SR_DEBUG_TIMESTAMP
    s_memrealtime s_g8sr_ts_restore_d
    s_waitcnt lgkmcnt(0)
end

//  s_rfe_b64 s_restore_pc_lo               //Return to the main shader program and resume execution
    s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0     // s_restore_m0[0] is used to set STATUS.inst_atc


/**************************************************************************/
/*                          the END                                       */
/**************************************************************************/
L_END_PGM:
    s_endpgm

end


/**************************************************************************/
/*                      the helper functions                              */
/**************************************************************************/

//Only for saving hwreg to mem
function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
    s_mov_b32 exec_lo, m0               //assuming exec_lo is not needed anymore from this point on
    s_mov_b32 m0, s_mem_offset
    s_buffer_store_dword s, s_rsrc, m0 glc:1
    s_add_u32 s_mem_offset, s_mem_offset, 4
    s_mov_b32 m0, exec_lo
end


// HWREGs are saved before SGPRs, so all HWREGs could be used.
function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)

    s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
    s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
    s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
    s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
    s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
    s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0    // +scc
end


function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
    s_add_u32 s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
    s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
    s_sub_u32 s_mem_offset, s_mem_offset, 4*16
end



function get_lds_size_bytes(s_lds_size_byte)
    // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity of 128DW
    s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)    // lds_size
    s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8      //LDS size in bytes = lds_size * 64 * 4 bytes  // granularity 64DW
end

function get_vgpr_size_bytes(s_vgpr_size_byte)
    s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)   //vgpr_size
    s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8)    //Number of VGPR bytes = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value)  //FIXME for GFX, zero is possible
end

function get_sgpr_size_bytes(s_sgpr_size_byte)
    s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE)   //sgpr_size
    s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
    s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6        //Number of SGPR bytes = (sgpr_size + 1) * 16 * 4 (non-zero value)
end

function get_hwreg_size_bytes
    return 128      //HWREG size in bytes
end
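
// Illustrative only: the per-wave save-area layout implied by the helpers
// above and by the offsets used in the save path (VGPRs, then SGPRs, then
// HWREGs, then LDS for the first wave of the threadgroup). A hypothetical
// C mirror of the size/offset arithmetic:
//
//   uint32_t vgpr_bytes(uint32_t vgpr_size) { return (vgpr_size + 1) * 4 * 64 * 4; }  // get_vgpr_size_bytes
//   uint32_t sgpr_bytes(uint32_t sgpr_size) { return (sgpr_size + 1) * 16 * 4; }      // get_sgpr_size_bytes
//   enum { HWREG_BYTES = 128 };                                                       // get_hwreg_size_bytes
//
//   // offsets within one wave's save area:
//   //   VGPRs  at 0
//   //   SGPRs  at vgpr_bytes(v)
//   //   HWREGs at vgpr_bytes(v) + sgpr_bytes(s)
//   //   LDS    at vgpr_bytes(v) + sgpr_bytes(s) + HWREG_BYTES  (first wave in TG only)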


#endif
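
/* Assembled machine code words for the trap handler source above. */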
static const uint32_t cwsr_trap_gfx8_hex[] = {
	0xbf820001, 0xbf820123,
	0xb8f4f802, 0x89748674,
	0xb8f5f803, 0x8675ff75,
	0x00000400, 0xbf850011,
	0xc00a1e37, 0x00000000,
	0xbf8c007f, 0x87777978,
	0xbf840002, 0xb974f802,
	0xbe801d78, 0xb8f5f803,
	0x8675ff75, 0x000001ff,
	0xbf850002, 0x80708470,
	0x82718071, 0x8671ff71,
	0x0000ffff, 0xb974f802,
	0xbe801f70, 0xb8f5f803,
	0x8675ff75, 0x00000100,
	0xbf840006, 0xbefa0080,
	0xb97a0203, 0x8671ff71,
	0x0000ffff, 0x80f08870,
	0x82f18071, 0xbefa0080,
	0xb97a0283, 0xbef60068,
	0xbef70069, 0xb8fa1c07,
	0x8e7a9c7a, 0x87717a71,
	0xb8fa03c7, 0x8e7a9b7a,
	0x87717a71, 0xb8faf807,
	0x867aff7a, 0x00007fff,
	0xb97af807, 0xbef2007e,
	0xbef3007f, 0xbefe0180,
	0xbf900004, 0xbf8e0002,
	0xbf88fffe, 0xbef8007e,
	0x8679ff7f, 0x0000ffff,
	0x8779ff79, 0x00040000,
	0xbefa0080, 0xbefb00ff,
	0x00807fac, 0x867aff7f,
	0x08000000, 0x8f7a837a,
	0x877b7a7b, 0x867aff7f,
	0x70000000, 0x8f7a817a,
	0x877b7a7b, 0xbeef007c,
	0xbeee0080, 0xb8ee2a05,
	0x806e816e, 0x8e6e8a6e,
	0xb8fa1605, 0x807a817a,
	0x8e7a867a, 0x806e7a6e,
	0xbefa0084, 0xbefa00ff,
	0x01000000, 0xbefe007c,
	0xbefc006e, 0xc0611bfc,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611c3c,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611c7c,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611cbc,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611cfc,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611d3c,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xb8f5f803,
	0xbefe007c, 0xbefc006e,
	0xc0611d7c, 0x0000007c,
	0x806e846e, 0xbefc007e,
	0xbefe007c, 0xbefc006e,
	0xc0611dbc, 0x0000007c,
	0x806e846e, 0xbefc007e,
	0xbefe007c, 0xbefc006e,
	0xc0611dfc, 0x0000007c,
	0x806e846e, 0xbefc007e,
	0xb8eff801, 0xbefe007c,
	0xbefc006e, 0xc0611bfc,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611b3c,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0xbefe007c,
	0xbefc006e, 0xc0611b7c,
	0x0000007c, 0x806e846e,
	0xbefc007e, 0x867aff7f,
	0x04000000, 0xbef30080,
	0x8773737a, 0xb8ee2a05,
	0x806e816e, 0x8e6e8a6e,
	0xb8f51605, 0x80758175,
	0x8e758475, 0x8e7a8275,
	0xbefa00ff, 0x01000000,
	0xbef60178, 0x80786e78,
	0x82798079, 0xbefc0080,
	0xbe802b00, 0xbe822b02,
	0xbe842b04, 0xbe862b06,
	0xbe882b08, 0xbe8a2b0a,
	0xbe8c2b0c, 0xbe8e2b0e,
	0xc06b003c, 0x00000000,
	0xc06b013c, 0x00000010,
	0xc06b023c, 0x00000020,
	0xc06b033c, 0x00000030,
	0x8078c078, 0x82798079,
	0x807c907c, 0xbf0a757c,
	0xbf85ffeb, 0xbef80176,
	0xbeee0080, 0xbefe00c1,
	0xbeff00c1, 0xbefa00ff,
	0x01000000, 0xe0724000,
	0x6e1e0000, 0xe0724100,
	0x6e1e0100, 0xe0724200,
	0x6e1e0200, 0xe0724300,
	0x6e1e0300, 0xbefe00c1,
	0xbeff00c1, 0xb8f54306,
	0x8675c175, 0xbf84002c,
	0xbf8a0000, 0x867aff73,
	0x04000000, 0xbf840028,
	0x8e758675, 0x8e758275,
	0xbefa0075, 0xb8ee2a05,
	0x806e816e, 0x8e6e8a6e,
	0xb8fa1605, 0x807a817a,
	0x8e7a867a, 0x806e7a6e,
	0x806eff6e, 0x00000080,
	0xbefa00ff, 0x01000000,
	0xbefc0080, 0xd28c0002,
	0x000100c1, 0xd28d0003,
	0x000204c1, 0xd1060002,
	0x00011103, 0x7e0602ff,
	0x00000200, 0xbefc00ff,
	0x00010000, 0xbe80007b,
	0x867bff7b, 0xff7fffff,
	0x877bff7b, 0x00058000,
	0xd8ec0000, 0x00000002,
	0xbf8c007f, 0xe0765000,
	0x6e1e0002, 0x32040702,
	0xd0c9006a, 0x0000eb02,
	0xbf87fff7, 0xbefb0000,
	0xbeee00ff, 0x00000400,
	0xbefe00c1, 0xbeff00c1,
	0xb8f52a05, 0x80758175,
	0x8e758275, 0x8e7a8875,
	0xbefa00ff, 0x01000000,
	0xbefc0084, 0xbf0a757c,
	0xbf840015, 0xbf11017c,
	0x8075ff75, 0x00001000,
	0x7e000300, 0x7e020301,
	0x7e040302, 0x7e060303,
	0xe0724000, 0x6e1e0000,
	0xe0724100, 0x6e1e0100,
	0xe0724200, 0x6e1e0200,
	0xe0724300, 0x6e1e0300,
	0x807c847c, 0x806eff6e,
	0x00000400, 0xbf0a757c,
	0xbf85ffef, 0xbf9c0000,
	0xbf8200ca, 0xbef8007e,
	0x8679ff7f, 0x0000ffff,
	0x8779ff79, 0x00040000,
	0xbefa0080, 0xbefb00ff,
	0x00807fac, 0x8676ff7f,
	0x08000000, 0x8f768376,
	0x877b767b, 0x8676ff7f,
	0x70000000, 0x8f768176,
	0x877b767b, 0x8676ff7f,
	0x04000000, 0xbf84001e,
	0xbefe00c1, 0xbeff00c1,
	0xb8f34306, 0x8673c173,
	0xbf840019, 0x8e738673,
	0x8e738273, 0xbefa0073,
	0xb8f22a05, 0x80728172,
	0x8e728a72, 0xb8f61605,
	0x80768176, 0x8e768676,
	0x80727672, 0x8072ff72,
	0x00000080, 0xbefa00ff,
	0x01000000, 0xbefc0080,
	0xe0510000, 0x721e0000,
	0xe0510100, 0x721e0000,
	0x807cff7c, 0x00000200,
	0x8072ff72, 0x00000200,
	0xbf0a737c, 0xbf85fff6,
	0xbef20080, 0xbefe00c1,
	0xbeff00c1, 0xb8f32a05,
	0x80738173, 0x8e738273,
	0x8e7a8873, 0xbefa00ff,
	0x01000000, 0xbef60072,
	0x8072ff72, 0x00000400,
	0xbefc0084, 0xbf11087c,
	0x8073ff73, 0x00008000,
	0xe0524000, 0x721e0000,
	0xe0524100, 0x721e0100,
	0xe0524200, 0x721e0200,
	0xe0524300, 0x721e0300,
	0xbf8c0f70, 0x7e000300,
	0x7e020301, 0x7e040302,
	0x7e060303, 0x807c847c,
	0x8072ff72, 0x00000400,
	0xbf0a737c, 0xbf85ffee,
	0xbf9c0000, 0xe0524000,
	0x761e0000, 0xe0524100,
	0x761e0100, 0xe0524200,
	0x761e0200, 0xe0524300,
	0x761e0300, 0xb8f22a05,
	0x80728172, 0x8e728a72,
	0xb8f61605, 0x80768176,
	0x8e768676, 0x80727672,
	0x80f2c072, 0xb8f31605,
	0x80738173, 0x8e738473,
	0x8e7a8273, 0xbefa00ff,
	0x01000000, 0xbefc0073,
	0xc031003c, 0x00000072,
	0x80f2c072, 0xbf8c007f,
	0x80fc907c, 0xbe802d00,
	0xbe822d02, 0xbe842d04,
	0xbe862d06, 0xbe882d08,
	0xbe8a2d0a, 0xbe8c2d0c,
	0xbe8e2d0e, 0xbf06807c,
	0xbf84fff1, 0xb8f22a05,
	0x80728172, 0x8e728a72,
	0xb8f61605, 0x80768176,
	0x8e768676, 0x80727672,
	0xbefa0084, 0xbefa00ff,
	0x01000000, 0xc0211cfc,
	0x00000072, 0x80728472,
	0xc0211c3c, 0x00000072,
	0x80728472, 0xc0211c7c,
	0x00000072, 0x80728472,
	0xc0211bbc, 0x00000072,
	0x80728472, 0xc0211bfc,
	0x00000072, 0x80728472,
	0xc0211d3c, 0x00000072,
	0x80728472, 0xc0211d7c,
	0x00000072, 0x80728472,
	0xc0211a3c, 0x00000072,
	0x80728472, 0xc0211a7c,
	0x00000072, 0x80728472,
	0xc0211dfc, 0x00000072,
	0x80728472, 0xc0211b3c,
	0x00000072, 0x80728472,
	0xc0211b7c, 0x00000072,
	0x80728472, 0xbf8c007f,
	0x8671ff71, 0x0000ffff,
	0xbefc0073, 0xbefe006e,
	0xbeff006f, 0x867375ff,
	0x000003ff, 0xb9734803,
	0x867375ff, 0xfffff800,
	0x8f738b73, 0xb973a2c3,
	0xb977f801, 0x8673ff71,
	0xf0000000, 0x8f739c73,
	0x8e739073, 0xbef60080,
	0x87767376, 0x8673ff71,
	0x08000000, 0x8f739b73,
	0x8e738f73, 0x87767376,
	0x8673ff74, 0x00800000,
	0x8f739773, 0xb976f807,
	0x86fe7e7e, 0x86ea6a6a,
	0xb974f802, 0xbf8a0000,
	0x95807370, 0xbf810000,
};