/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */


shader main

asic(DEFAULT)

type(CS)

wave_size(32)
/**************************************************************************/
/*                    control on how to run the shader                    */
/**************************************************************************/
//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in EMU runs)
var EMU_RUN_HACK					=	0
var EMU_RUN_HACK_RESTORE_NORMAL		=	0
var EMU_RUN_HACK_SAVE_NORMAL_EXIT	=	0
var	EMU_RUN_HACK_SAVE_SINGLE_WAVE	=	0
var EMU_RUN_HACK_SAVE_FIRST_TIME	= 	0					//for interrupted restore in which the first save is through EMU_RUN_HACK
var SAVE_LDS						= 	0
var WG_BASE_ADDR_LO					=   0x9000a000
var WG_BASE_ADDR_HI					=	0x0
var WAVE_SPACE						=	0x9000				//memory size that each wave occupies in workgroup state mem; increased from 0x5000 to 0x9000 because more SGPRs need to be saved
var CTX_SAVE_CONTROL				=	0x0
var CTX_RESTORE_CONTROL				=	CTX_SAVE_CONTROL
var SIM_RUN_HACK					=	0					//any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in RTL runs)
var	SGPR_SAVE_USE_SQC				=	0					//use SQC D$ to do the write
var USE_MTBUF_INSTEAD_OF_MUBUF		=	0					//need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
var SWIZZLE_EN						=	0					//whether we use swizzled buffer addressing
var SAVE_RESTORE_HWID_DDID          =   0
var RESTORE_DDID_IN_SGPR18          =   0
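//Worked example of the EMU/SIM addressing scheme above (as used by the
//EMU_RUN_HACK paths below): each wave's save area starts at
//    addr = {WG_BASE_ADDR_HI, WG_BASE_ADDR_LO} + wave_id * WAVE_SPACE
//e.g. wave_id = 2  ->  0x9000a000 + 2 * 0x9000 = 0x9001c000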
/**************************************************************************/
/*                              variables                                 */
/**************************************************************************/
var SQ_WAVE_STATUS_INST_ATC_SHIFT  = 23
var SQ_WAVE_STATUS_INST_ATC_MASK   = 0x00800000
var SQ_WAVE_STATUS_SPI_PRIO_MASK   = 0x00000006

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT	= 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT	= 8
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE	= 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT	= 24
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE	= 4						//FIXME	 sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT    = 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE     = 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT        = 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE         = 1

var	SQ_WAVE_TRAPSTS_SAVECTX_MASK	=	0x400
var SQ_WAVE_TRAPSTS_EXCE_MASK       =   0x1FF          			// Exception mask
var	SQ_WAVE_TRAPSTS_SAVECTX_SHIFT	=	10
var	SQ_WAVE_TRAPSTS_MEM_VIOL_MASK	=	0x100
var	SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT	=	8
var	SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK 	=	0x3FF
var	SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT 	=	0x0
var	SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE 	=	10
var	SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK 	=	0xFFFFF800
var	SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT 	=	11
var	SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE 	=	21

var SQ_WAVE_IB_STS_RCNT_SHIFT			=	16					//FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT	=	15					//FIXME
var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE    =   1                   //FIXME
var SQ_WAVE_IB_STS_RCNT_SIZE            =   6                   //FIXME
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG	= 0x00007FFF	//FIXME

var	SQ_BUF_RSRC_WORD1_ATC_SHIFT		=	24
var	SQ_BUF_RSRC_WORD3_MTYPE_SHIFT	=	27

/*      Save        */
var	S_SAVE_BUF_RSRC_WORD1_STRIDE		=	0x00040000  		//stride is 4 bytes
var	S_SAVE_BUF_RSRC_WORD3_MISC			= 	0x00807FAC			//SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE

var	S_SAVE_SPI_INIT_ATC_MASK			=	0x08000000			//bit[27]: ATC bit
var	S_SAVE_SPI_INIT_ATC_SHIFT			=	27
var	S_SAVE_SPI_INIT_MTYPE_MASK			=	0x70000000			//bit[30:28]: Mtype
var	S_SAVE_SPI_INIT_MTYPE_SHIFT			=	28
var	S_SAVE_SPI_INIT_FIRST_WAVE_MASK		=	0x04000000			//bit[26]: FirstWaveInTG
var	S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT	=	26

var S_SAVE_PC_HI_RCNT_SHIFT				=	28					//FIXME	 check with Brian to ensure all fields other than PC[47:0] can be used
var S_SAVE_PC_HI_RCNT_MASK				=   0xF0000000			//FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT		=	27					//FIXME
var S_SAVE_PC_HI_FIRST_REPLAY_MASK		=	0x08000000			//FIXME
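//Note: s_save_pc_hi only needs bits [15:0] for PC[47:32], so the replay state is
//packed into its unused top bits: RCNT into [31:28] and FIRST_REPLAY into [27],
//e.g. RCNT = 5 -> 5 << 28 = 0x50000000 is OR'd into s_save_pc_hi during save.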

var	s_save_spi_init_lo				=	exec_lo
var s_save_spi_init_hi				=	exec_hi

var	s_save_pc_lo			=	ttmp0			//{TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0], trapID[7:0], PC[47:0]}
var	s_save_pc_hi			=	ttmp1
var s_save_exec_lo			=	ttmp2
var s_save_exec_hi			= 	ttmp3
var	s_save_status			=	ttmp4
var	s_save_trapsts			=	ttmp5			//not really used until the end of the SAVE routine
var s_wave_size         	=	ttmp6           //ttmp6 is no longer needed for the xnack mask (now only 32 bits), so use it to record wave32 vs wave64 in EMU_HACK
var s_save_xnack_mask	    =	ttmp7
var	s_save_buf_rsrc0		=	ttmp8
var	s_save_buf_rsrc1		=	ttmp9
var	s_save_buf_rsrc2		=	ttmp10
var	s_save_buf_rsrc3		=	ttmp11

var s_save_mem_offset		= 	ttmp14
var s_sgpr_save_num         =   106                     //in gfx10, all sgprs must be saved
var s_save_alloc_size		=	s_save_trapsts			//conflict
var s_save_tmp              =   s_save_buf_rsrc2       	//shared with s_save_buf_rsrc2  (conflict: do not issue mem accesses through s_save_buf_rsrc2 while s_save_tmp is live)
var s_save_m0				=	ttmp15

/*      Restore     */
var	S_RESTORE_BUF_RSRC_WORD1_STRIDE			=	S_SAVE_BUF_RSRC_WORD1_STRIDE
var	S_RESTORE_BUF_RSRC_WORD3_MISC			= 	S_SAVE_BUF_RSRC_WORD3_MISC

var	S_RESTORE_SPI_INIT_ATC_MASK			    =	0x08000000			//bit[27]: ATC bit
var	S_RESTORE_SPI_INIT_ATC_SHIFT			=	27
var	S_RESTORE_SPI_INIT_MTYPE_MASK			=	0x70000000			//bit[30:28]: Mtype
var	S_RESTORE_SPI_INIT_MTYPE_SHIFT			=	28
var	S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		=	0x04000000			//bit[26]: FirstWaveInTG
var	S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT	    =	26

var S_RESTORE_PC_HI_RCNT_SHIFT				=	S_SAVE_PC_HI_RCNT_SHIFT
var S_RESTORE_PC_HI_RCNT_MASK				=   S_SAVE_PC_HI_RCNT_MASK
var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT		=	S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
var S_RESTORE_PC_HI_FIRST_REPLAY_MASK		=	S_SAVE_PC_HI_FIRST_REPLAY_MASK

var s_restore_spi_init_lo                   =   exec_lo
var s_restore_spi_init_hi                   =   exec_hi

var s_restore_mem_offset		= 	ttmp12
var s_restore_alloc_size		=	ttmp3
var s_restore_tmp           	=   ttmp6
var s_restore_mem_offset_save	= 	s_restore_tmp 		//no conflict

var s_restore_m0			=	s_restore_alloc_size	//no conflict

var s_restore_mode			=  	ttmp13
var s_restore_hwid1         =  ttmp2
var s_restore_ddid          =  s_restore_hwid1
var	s_restore_pc_lo		    =	ttmp0
var	s_restore_pc_hi		    =	ttmp1
var s_restore_exec_lo		=	ttmp14
var s_restore_exec_hi		= 	ttmp15
var	s_restore_status	    =	ttmp4
var	s_restore_trapsts	    =	ttmp5
//var s_restore_xnack_mask_lo	=	xnack_mask_lo
//var s_restore_xnack_mask_hi	=	xnack_mask_hi
var s_restore_xnack_mask    =   ttmp7
var	s_restore_buf_rsrc0		=	ttmp8
var	s_restore_buf_rsrc1		=	ttmp9
var	s_restore_buf_rsrc2		=	ttmp10
var	s_restore_buf_rsrc3		=	ttmp11
var s_restore_size         	=	ttmp13                  //ttmp13 has no conflict

/**************************************************************************/
/*                      trap handler entry points                         */
/**************************************************************************/
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) 					//hack to use trap_id for determining save/restore
		//FIXME VCCZ un-init assertion s_getreg_b32  	s_save_status, hwreg(HW_REG_STATUS)			//save STATUS since we will change SCC
		s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 				//change SCC
    	s_cmp_eq_u32 s_save_tmp, 0x007e0000  						//Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
    	s_cbranch_scc0 L_JUMP_TO_RESTORE							//do not need to recover STATUS here  since we are going to RESTORE
		//FIXME  s_setreg_b32 	hwreg(HW_REG_STATUS), 	s_save_status		//need to recover STATUS since we are going to SAVE
		s_branch L_SKIP_RESTORE 									//NOT restore, SAVE actually
	else
		s_branch L_SKIP_RESTORE 									//NOT restore. might be a regular trap or save
    end
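//Per the TTMP layout noted above ({TTMP1, TTMP0} = {3'h0, pc_rewind[3:0], HT[0],
//trapID[7:0], PC[47:0]}), trapID sits in ttmp1[23:16]. Masking ttmp1 with
//0xffff0000 and comparing against 0x007e0000 therefore selects trapID == 0x7e
//(with HT == 0 and pc_rewind == 0) for SAVE; anything else (e.g. trap_id 0x7f)
//falls through to RESTORE in this EMU hack.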

L_JUMP_TO_RESTORE:
    s_branch L_RESTORE												//restore

L_SKIP_RESTORE:

	s_getreg_b32  	s_save_status, hwreg(HW_REG_STATUS)								//save STATUS since we will change SCC
    s_andn2_b32		s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK      //clear the SPI priority bits in the saved STATUS
	s_getreg_b32  	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32		s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK	//check whether this is for save
	s_cbranch_scc1	L_SAVE															//this is the operation for save

    // *********    Handle non-CWSR traps       *******************
    if (!EMU_RUN_HACK)
		s_getreg_b32     s_save_trapsts, hwreg(HW_REG_TRAPSTS)
		s_and_b32        s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
		s_cbranch_scc1  L_EXCP_CASE   // Exception: jump back to the shader program directly.
		s_add_u32    ttmp0, ttmp0, 4   // S_TRAP case: add 4 to ttmp0

		L_EXCP_CASE:
		s_and_b32    ttmp1, ttmp1, 0xFFFF
		s_rfe_b64    [ttmp0, ttmp1]
	end
    // *********        End handling of non-CWSR traps   *******************

/**************************************************************************/
/*                            save routine                                */
/**************************************************************************/

L_SAVE:

	//check whether there is a mem_viol
	s_getreg_b32  	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_and_b32		s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
	s_cbranch_scc0	L_NO_PC_REWIND

	//if so, we need to rewind the PC, assuming a GDS operation got NACKed
	s_mov_b32       s_save_tmp, 0															//clear mem_viol bit
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp	//clear mem_viol bit
	s_and_b32 		s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
	s_sub_u32 		s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
	s_subb_u32 		s_save_pc_hi, s_save_pc_hi, 0x0			  // -scc
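	//The sub/subb pair above is a 64-bit decrement: s_sub_u32 leaves the borrow in
	//SCC and s_subb_u32 consumes it, e.g. PC = 0x1_00000004 becomes
	//lo = 0x00000004 - 8 = 0xFFFFFFFC (borrow), hi = 0x1 - 0 - 1 = 0x0.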

L_NO_PC_REWIND:
    s_mov_b32       s_save_tmp, 0															//clear saveCtx bit
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp		//clear saveCtx bit

	//s_mov_b32		s_save_xnack_mask_lo,	xnack_mask_lo									//save XNACK_MASK
	//s_mov_b32		s_save_xnack_mask_hi,	xnack_mask_hi
    s_getreg_b32	s_save_xnack_mask,  hwreg(HW_REG_SHADER_XNACK_MASK)
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)					//save RCNT
	s_lshl_b32		s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
	s_or_b32		s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)	//save FIRST_REPLAY
	s_lshl_b32		s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_or_b32		s_save_pc_hi, s_save_pc_hi, s_save_tmp
	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS)										//clear RCNT and FIRST_REPLAY in IB_STS
	s_and_b32		s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG

	s_setreg_b32	hwreg(HW_REG_IB_STS), s_save_tmp
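	//SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF keeps IB_STS[14:0] and
	//clears FIRST_REPLAY (bit 15) and RCNT (bits 21:16), which were just stashed in
	//the top bits of s_save_pc_hi above.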

	/*		inform SPI the readiness and wait for SPI's go signal */
	s_mov_b32		s_save_exec_lo,	exec_lo													//save EXEC and use EXEC for the go signal from SPI
	s_mov_b32		s_save_exec_hi,	exec_hi
	s_mov_b64		exec, 	0x0																//clear EXEC to get ready to receive
	if (EMU_RUN_HACK)

	else
		s_sendmsg	sendmsg(MSG_SAVEWAVE)													//send SPI a message and wait for SPI's write to EXEC
	end

  L_SLEEP:
	s_sleep 0x2

	if (EMU_RUN_HACK)

	else
		s_cbranch_execz	L_SLEEP
	end

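	//Handshake summary: EXEC is cleared, MSG_SAVEWAVE is sent, and the wave spins in
	//L_SLEEP until SPI writes a non-zero value into EXEC (s_cbranch_execz falls
	//through once EXEC != 0); the saved EXEC is written out later in L_SAVE_HWREG.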

	/*      setup Resource Constants    */
	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
		//calculate wd_addr using absolute thread id
		v_readlane_b32 s_save_tmp, v9, 0
        //determine whether it is wave32 or wave64
        s_getreg_b32 	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
        s_cmp_eq_u32    s_wave_size, 0
        s_cbranch_scc1  L_SAVE_WAVE32
        s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
        s_branch    L_SAVE_CON
    L_SAVE_WAVE32:
        s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32
    L_SAVE_CON:
		s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
		s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
		s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
		s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
	else
	end
	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
		s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
		s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
		s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
	else
	end


	s_mov_b32		s_save_buf_rsrc0, 	s_save_spi_init_lo														//base_addr_lo
	s_and_b32		s_save_buf_rsrc1, 	s_save_spi_init_hi, 0x0000FFFF											//base_addr_hi
	s_or_b32		s_save_buf_rsrc1, 	s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_save_buf_rsrc2,   0                                               						//NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
	s_mov_b32		s_save_buf_rsrc3, 	S_SAVE_BUF_RSRC_WORD3_MISC
	s_and_b32		s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
	s_lshr_b32		s_save_tmp,  		s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)			//get ATC bit into position
	s_or_b32		s_save_buf_rsrc3, 	s_save_buf_rsrc3,  s_save_tmp											//or ATC
	s_and_b32		s_save_tmp,         s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
	s_lshr_b32		s_save_tmp,  		s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)		//get MTYPE bits into position
	s_or_b32		s_save_buf_rsrc3, 	s_save_buf_rsrc3,  s_save_tmp											//or MTYPE

	s_mov_b32		s_save_m0,			m0																	//save M0

	/* 		global mem offset			*/
	s_mov_b32		s_save_mem_offset, 	0x0																		//mem offset initial value = 0
    s_getreg_b32 	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size
    s_or_b32        s_wave_size, s_save_spi_init_hi,    s_wave_size                                             //fold the SPI init flags (held in exec_hi) into s_wave_size
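    //After the OR above, s_wave_size bit[0] is the WAVE64 flag from IB_STS2 and the
    //upper bits carry the SPI init flags (e.g. FirstWaveInTG in bit[26]); the
    //"s_and_b32 m0, s_wave_size, 1" tests below select the wave32 vs wave64 path.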

    /*      	save VGPRs	    */
	//////////////////////////////
  L_SAVE_VGPR:

 	s_mov_b32		exec_lo, 0xFFFFFFFF 											//need every thread from now on
    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_ENABLE_SAVE_VGPR_EXEC_HI
    s_mov_b32		exec_hi, 0x00000000
    s_branch        L_SAVE_VGPR_NORMAL
  L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32		exec_hi, 0xFFFFFFFF
  L_SAVE_VGPR_NORMAL:
	s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 					//vgpr_size
	//is the VGPR count computed the same way for wave32 and wave64?
    s_add_u32 		s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 2 						//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
    //determine whether it is wave32 or wave64
    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_SAVE_VGPR_WAVE64

    //zhenxu: save vgprs for wave32
	s_lshl_b32		s_save_buf_rsrc2,  s_save_alloc_size, 7							//NUM_RECORDS in bytes (32 threads*4)
	if (SWIZZLE_EN)
		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
	end
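	//Worked example: vgpr_size = 3 -> (3 + 1) * 4 = 16 VGPRs to save. Each wave32
	//buffer_store_dword writes 32 lanes * 4 bytes = 128 bytes, so the swizzled
	//NUM_RECORDS would be 16 << 7 = 2048 bytes (with SWIZZLE_EN = 0 the rsrc is
	//simply forced to 0x1000000 instead).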

    s_mov_b32 		m0, 0x0 														//VGPR initial index value =0
	//s_set_gpr_idx_on  m0, 0x1														//M0[7:0] = M0[7:0] and M0[15:12] = 0x1
    //s_add_u32		s_save_alloc_size, s_save_alloc_size, 0x1000					//add 0x1000 since we compare m0 against it later, doesn't need this in gfx10

  L_SAVE_VGPR_WAVE32_LOOP:
	v_movrels_b32 		v0, v0															//v0 = v[0+m0]

    if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
		buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	end

    s_add_u32		m0, m0, 1														//next vgpr index
	s_add_u32		s_save_mem_offset, s_save_mem_offset, 128						//every buffer_store_dword does 128 bytes
	s_cmp_lt_u32 	m0,	s_save_alloc_size 											//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 	L_SAVE_VGPR_WAVE32_LOOP												//VGPR save is complete?
    s_branch    L_SAVE_LDS
    //save vgpr for wave32 ends

  L_SAVE_VGPR_WAVE64:
	s_lshl_b32		s_save_buf_rsrc2,  s_save_alloc_size, 8							//NUM_RECORDS in bytes (64 threads*4)
	if (SWIZZLE_EN)
		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
	end

    s_mov_b32 		m0, 0x0 														//VGPR initial index value =0
	//s_set_gpr_idx_on  m0, 0x1														//M0[7:0] = M0[7:0] and M0[15:12] = 0x1
    //s_add_u32		s_save_alloc_size, s_save_alloc_size, 0x1000					//add 0x1000 since we compare m0 against it later, doesn't need this in gfx10

  L_SAVE_VGPR_WAVE64_LOOP:
	v_movrels_b32 		v0, v0															//v0 = v[0+m0]

    if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
		buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	end

    s_add_u32		m0, m0, 1														//next vgpr index
	s_add_u32		s_save_mem_offset, s_save_mem_offset, 256						//every buffer_store_dword does 256 bytes
	s_cmp_lt_u32 	m0,	s_save_alloc_size 											//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 	L_SAVE_VGPR_WAVE64_LOOP												//VGPR save is complete?
	//s_set_gpr_idx_off
    //
    //The part below saves the shared VGPRs (new for gfx10)
    s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) 			//shared_vgpr_size
    s_and_b32		s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF				//shared_vgpr_size is zero?
    s_cbranch_scc0	L_SAVE_LDS													    //no shared_vgpr used? jump to L_SAVE_LDS
    s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 3 						//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
    //m0 now holds the normal VGPR count; add the shared_vgpr count to it to get the total count.
    //shared_vgpr save starts from the index in m0
    s_add_u32       s_save_alloc_size, s_save_alloc_size, m0
    s_mov_b32		exec_lo, 0xFFFFFFFF
    s_mov_b32		exec_hi, 0x00000000
    L_SAVE_SHARED_VGPR_WAVE64_LOOP:
	v_movrels_b32 		v0, v0															//v0 = v[0+m0]
	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
    s_add_u32		m0, m0, 1														//next vgpr index
	s_add_u32		s_save_mem_offset, s_save_mem_offset, 128						//every buffer_store_dword does 128 bytes (shared VGPRs are stored with 32 active lanes)
	s_cmp_lt_u32 	m0,	s_save_alloc_size 											//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1 	L_SAVE_SHARED_VGPR_WAVE64_LOOP									//SHARED_VGPR save is complete?

	/*      	save LDS	    */
	//////////////////////////////
  L_SAVE_LDS:

    //Only the first wave in the threadgroup needs to save LDS
	/*      the first wave in the threadgroup    */
	s_barrier																		//FIXME  not performance-optimal "LDS is used? wait for other waves in the same TG"
	s_and_b32		s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK								//exec is still used here
	s_cbranch_scc0	L_SAVE_SGPR

	s_mov_b32		exec_lo, 0xFFFFFFFF 											//need every thread from now on
    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_ENABLE_SAVE_LDS_EXEC_HI
    s_mov_b32		exec_hi, 0x00000000
    s_branch        L_SAVE_LDS_NORMAL
  L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32		exec_hi, 0xFFFFFFFF
  L_SAVE_LDS_NORMAL:
	s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 			//lds_size
	s_and_b32		s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF				//lds_size is zero?
	s_cbranch_scc0	L_SAVE_SGPR														//no lds used? jump to L_SAVE_SGPR
	s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 6 						//LDS size in dwords = lds_size * 64dw
	s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 2 						//LDS size in bytes
	s_mov_b32		s_save_buf_rsrc2,  s_save_alloc_size  							//NUM_RECORDS in bytes
	if (SWIZZLE_EN)
		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
	end

    //load 0~63*4 (byte addresses) into vgpr v0
    v_mbcnt_lo_u32_b32 v0, -1, 0
    v_mbcnt_hi_u32_b32 v0, -1, v0
    v_mul_u32_u24 v0, 4, v0
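    //The v_mbcnt_lo/hi pair with an all-ones mask counts the set mask bits below each
    //lane, i.e. it yields the lane index (0..31 or 0..63); multiplying by 4 turns
    //that into each lane's per-dword LDS byte address.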

    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_mov_b32 		m0, 0x0
    s_cbranch_scc1  L_SAVE_LDS_LOOP_W64

  L_SAVE_LDS_LOOP_W32:
	if (SAVE_LDS)
    ds_read_b32 v1, v0
    s_waitcnt 0														    //ensure data ready
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	//buffer_store_lds_dword	s_save_buf_rsrc0, s_save_mem_offset lds:1               //saving lds straight to memory doesn't exist in gfx10
	end
	s_add_u32		m0, m0, 128															//every iteration saves 128 bytes
	s_add_u32		s_save_mem_offset, s_save_mem_offset, 128							//mem offset increased by 128 bytes
    v_add_nc_u32    v0, v0, 128
	s_cmp_lt_u32	m0, s_save_alloc_size												//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1  L_SAVE_LDS_LOOP_W32													//LDS save is complete?
    s_branch        L_SAVE_SGPR

  L_SAVE_LDS_LOOP_W64:
	if (SAVE_LDS)
    ds_read_b32 v1, v0
    s_waitcnt 0														    //ensure data ready
    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	//buffer_store_lds_dword	s_save_buf_rsrc0, s_save_mem_offset lds:1               //saving lds straight to memory doesn't exist in gfx10
	end
	s_add_u32		m0, m0, 256															//every iteration saves 256 bytes
	s_add_u32		s_save_mem_offset, s_save_mem_offset, 256							//mem offset increased by 256 bytes
    v_add_nc_u32    v0, v0, 256
	s_cmp_lt_u32	m0, s_save_alloc_size												//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1  L_SAVE_LDS_LOOP_W64													//LDS save is complete?


	/*      	save SGPRs	    */
	//////////////////////////////
	//s_getreg_b32 	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) 				//sgpr_size
	//s_add_u32 		s_save_alloc_size, s_save_alloc_size, 1
	//s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 4 						//Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
	//s_lshl_b32 		s_save_alloc_size, s_save_alloc_size, 3 						//In gfx10, Number of SGPRs = (sgpr_size + 1) * 8   (non-zero value)
  L_SAVE_SGPR:
    //check whether it is wave32 or wave64
    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_SAVE_SGPR_VMEM_WAVE64
    if (SGPR_SAVE_USE_SQC)
		s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 2					//NUM_RECORDS in bytes
    else
        s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 7					//NUM_RECORDS in bytes (32 threads)
    end
    s_branch    L_SAVE_SGPR_CONT
  L_SAVE_SGPR_VMEM_WAVE64:
	if (SGPR_SAVE_USE_SQC)
		s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 2					//NUM_RECORDS in bytes
	else
		s_lshl_b32		s_save_buf_rsrc2,	s_sgpr_save_num, 8					//NUM_RECORDS in bytes (64 threads)
	end
  L_SAVE_SGPR_CONT:
	if (SWIZZLE_EN)
		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
	end
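	//Sizing example for s_sgpr_save_num = 106 SGPRs: the SQC path stores one dword
	//per SGPR (106 << 2 bytes), while the vmem path broadcasts each SGPR across the
	//wave, so a wave32 store covers 106 << 7 bytes and a wave64 store 106 << 8 bytes.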

	//s_mov_b32 		m0, 0x0 														//SGPR initial index value =0
    //s_nop           0x0                                                             //Manually inserted wait states

    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1

    s_mov_b32 		m0, 0x0 														//SGPR initial index value =0
    s_nop           0x0                                                             //Manually inserted wait states

    s_cbranch_scc1  L_SAVE_SGPR_LOOP_WAVE64

  L_SAVE_SGPR_LOOP_WAVE32:
	s_movrels_b32 	s0, s0 															//s0 = s[0+m0]
    //zhenxu: one more argument was added to the sgpr save function; it only affects the vmem path, the SQC path is unchanged
	write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)							//PV: the best performance should be using s_buffer_store_dwordx4
	s_add_u32		m0, m0, 1														//next sgpr index
	s_cmp_lt_u32 	m0, s_sgpr_save_num 											//scc = (m0 < s_sgpr_save_num) ? 1 : 0
	s_cbranch_scc1 	L_SAVE_SGPR_LOOP_WAVE32												//SGPR save is complete?
    s_branch    L_SAVE_HWREG

  L_SAVE_SGPR_LOOP_WAVE64:
	s_movrels_b32 	s0, s0 															//s0 = s[0+m0]
    //zhenxu: one more argument was added to the sgpr save function; it only affects the vmem path, the SQC path is unchanged
	write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)							//PV: the best performance should be using s_buffer_store_dwordx4
	s_add_u32		m0, m0, 1														//next sgpr index
	s_cmp_lt_u32 	m0, s_sgpr_save_num 											//scc = (m0 < s_sgpr_save_num) ? 1 : 0
	s_cbranch_scc1 	L_SAVE_SGPR_LOOP_WAVE64												//SGPR save is complete?


	/* 		save HW registers	*/
	//////////////////////////////
  L_SAVE_HWREG:
    s_mov_b32		s_save_buf_rsrc2, 0x4								//NUM_RECORDS	in bytes
	if (SWIZZLE_EN)
		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
	end

    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_SAVE_HWREG_WAVE64

	write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//M0

	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0			//carry bit over
	end

	write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//PC
	write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
	write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//EXEC
	write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
	write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//STATUS

	//s_save_trapsts conflicts with s_save_alloc_size
	s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//TRAPSTS

	//write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK_LO
	write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK

	//using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
	s_getreg_b32 	s_save_m0, hwreg(HW_REG_MODE)																						//MODE
	write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
    if(SAVE_RESTORE_HWID_DDID)
    s_getreg_b32 	s_save_m0, hwreg(HW_REG_HW_ID1)																						//HW_ID1: the handler records the SE/SA/WGP/SIMD/wave of the original wave
    write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
    end
    s_branch   L_S_PGM_END_SAVED

  L_SAVE_HWREG_WAVE64:
    write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//M0

	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0			//carry bit over
	end

	write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)					//PC
	write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
	write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//EXEC
	write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
	write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//STATUS

	//s_save_trapsts conflicts with s_save_alloc_size
	s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)				//TRAPSTS

	//write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK_LO
	write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)			//XNACK_MASK

	//using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
	s_getreg_b32 	s_save_m0, hwreg(HW_REG_MODE)																						//MODE
	write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)


    if(SAVE_RESTORE_HWID_DDID)
    s_getreg_b32 	s_save_m0, hwreg(HW_REG_HW_ID1)																						//HW_ID1: the handler records the SE/SA/WGP/SIMD/wave of the original wave
    write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)

	/* 		save DDID	*/
	//////////////////////////////
  L_SAVE_DDID:
    //EXEC has been saved; no vector instructions follow
    s_mov_b32	exec_lo, 0x80000000    //Set MSB to 1. Cleared when the draw index is returned
    s_sendmsg sendmsg(MSG_GET_DDID)

  L_WAIT_DDID_LOOP:
    s_nop		7			// sleep a bit
    s_bitcmp0_b32 exec_lo, 31	// test whether the MSB is cleared, meaning done
    s_cbranch_scc0	L_WAIT_DDID_LOOP

    s_mov_b32	s_save_m0, exec_lo
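    //DDID handshake, per the comments above: exec_lo[31] is set before sending
    //MSG_GET_DDID; the hardware clears that bit when it deposits the draw index in
    //exec_lo, so the poll loop exits and the returned value is copied to s_save_m0.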


    s_mov_b32		s_save_buf_rsrc2, 0x4								//NUM_RECORDS	in bytes
	if (SWIZZLE_EN)
		s_add_u32		s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_save_buf_rsrc2,  0x1000000								//NUM_RECORDS in bytes
	end
    s_and_b32       m0, s_wave_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_SAVE_DDID_WAVE64

    write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)

  L_SAVE_DDID_WAVE64:
    write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)

    end

  L_S_PGM_END_SAVED:
	/*     S_PGM_END_SAVED  */    							//FIXME  graphics ONLY
	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
		s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
		s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
		s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0			//carry bit over
		s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
	else
	end


    s_branch	L_END_PGM



/**************************************************************************/
/*                          restore routine                               */
/**************************************************************************/

L_RESTORE:
    /*      Setup Resource Constants    */
    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
		//calculate wd_addr using absolute thread id
		v_readlane_b32 s_restore_tmp, v9, 0
        //determine whether it is wave32 or wave64
        s_getreg_b32 	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13
        s_cmp_eq_u32    s_restore_size, 0
        s_cbranch_scc1  L_RESTORE_WAVE32
        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //RESTORE WAVE64
        s_branch    L_RESTORE_CON
    L_RESTORE_WAVE32:
        s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //RESTORE WAVE32
    L_RESTORE_CON:
		s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
		s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
		s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
		s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
	else
	end

    s_mov_b32		s_restore_buf_rsrc0, 	s_restore_spi_init_lo															//base_addr_lo
	s_and_b32		s_restore_buf_rsrc1, 	s_restore_spi_init_hi, 0x0000FFFF												//base_addr_hi
	s_or_b32		s_restore_buf_rsrc1, 	s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
    s_mov_b32       s_restore_buf_rsrc2,   	0                                               								//NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32		s_restore_buf_rsrc3, 	S_RESTORE_BUF_RSRC_WORD3_MISC
	s_and_b32		s_restore_tmp,         	s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
	s_lshr_b32		s_restore_tmp,  		s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)		//get ATC bit into position
	s_or_b32		s_restore_buf_rsrc3, 	s_restore_buf_rsrc3,  s_restore_tmp												//or ATC
	s_and_b32		s_restore_tmp,         	s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
	s_lshr_b32		s_restore_tmp,  		s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)	//get MTYPE bits into position
	s_or_b32		s_restore_buf_rsrc3, 	s_restore_buf_rsrc3,  s_restore_tmp												//or MTYPE
    //determine whether it is wave32 or wave64
    s_getreg_b32 	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
    s_or_b32        s_restore_size, s_restore_spi_init_hi,    s_restore_size                                             //fold the SPI init flags (held in exec_hi) into s_restore_size

	/* 		global mem offset			*/
	s_mov_b32		s_restore_mem_offset, 0x0								//mem offset initial value = 0

    /*      	restore VGPRs	    */
	//////////////////////////////
  L_RESTORE_VGPR:

 	s_mov_b32		exec_lo, 0xFFFFFFFF 													//need every thread from now on   //be consistent with SAVE although this can be moved ahead
    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_ENABLE_RESTORE_VGPR_EXEC_HI
    s_mov_b32		exec_hi, 0x00000000
    s_branch        L_RESTORE_VGPR_NORMAL
  L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32		exec_hi, 0xFFFFFFFF
  L_RESTORE_VGPR_NORMAL:
	s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) 	//vgpr_size
	s_add_u32 		s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 2 							//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
    //determine whether it is wave32 or wave64
    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_RESTORE_VGPR_WAVE64

    s_lshl_b32		s_restore_buf_rsrc2,  s_restore_alloc_size, 7						    //NUM_RECORDS in bytes (32 threads*4)
	if (SWIZZLE_EN)
		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
	end

	s_mov_b32		s_restore_mem_offset_save, s_restore_mem_offset							//restore starts with v1; v0 is restored last
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128
    s_mov_b32 		m0, 1 																	//VGPR initial index value = 1
	//s_set_gpr_idx_on  m0, 0x8																//M0[7:0] = M0[7:0] and M0[15:12] = 0x8
    //s_add_u32		s_restore_alloc_size, s_restore_alloc_size, 0x8000						//add 0x8000 since we compare m0 against it later, might not need this in gfx10
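	//Why v1 first: v0 is the staging register for every load in the loop below (each
	//saved VGPR is loaded into v0 and then distributed to v[m0] by v_movreld), so
	//v0's own saved value is reloaded last, from s_restore_mem_offset_save.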

  L_RESTORE_VGPR_WAVE32_LOOP:
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset	slc:1 glc:1
	end
	s_waitcnt		vmcnt(0)																//ensure data ready
	v_movreld_b32		v0, v0																	//v[0+m0] = v0
    s_add_u32		m0, m0, 1																//next vgpr index
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128							//every buffer_load_dword does 128 bytes
	s_cmp_lt_u32 	m0,	s_restore_alloc_size 												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 	L_RESTORE_VGPR_WAVE32_LOOP														//VGPR restore (except v0) is complete?
	//s_set_gpr_idx_off
																							/* VGPR restore on v0 */
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save	slc:1 glc:1
	end

    s_branch    L_RESTORE_LDS

  L_RESTORE_VGPR_WAVE64:
    s_lshl_b32		s_restore_buf_rsrc2,  s_restore_alloc_size, 8						    //NUM_RECORDS in bytes (64 threads*4)
	if (SWIZZLE_EN)
		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
	end

	s_mov_b32		s_restore_mem_offset_save, s_restore_mem_offset							//restore starts with v1; v0 is restored last
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 256
    s_mov_b32 		m0, 1 																	//VGPR initial index value = 1
  L_RESTORE_VGPR_WAVE64_LOOP:
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset	slc:1 glc:1
	end
	s_waitcnt		vmcnt(0)																//ensure data ready
	v_movreld_b32		v0, v0																	//v[0+m0] = v0
    s_add_u32		m0, m0, 1																//next vgpr index
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 256							//every buffer_load_dword does 256 bytes
	s_cmp_lt_u32 	m0,	s_restore_alloc_size 												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 	L_RESTORE_VGPR_WAVE64_LOOP														//VGPR restore (except v0) is complete?
	//s_set_gpr_idx_off
    //
    //The part below restores the shared VGPRs (new for gfx10)
    s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) 			//shared_vgpr_size
    s_and_b32		s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF				//shared_vgpr_size is zero?
    s_cbranch_scc0	L_RESTORE_V0													    //no shared_vgpr used? jump to L_RESTORE_V0
    s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 3 						//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
    //m0 now holds the normal VGPR count; add the shared_vgpr count to it to get the total count.
    //shared_vgpr restore starts from the index in m0
    s_add_u32       s_restore_alloc_size, s_restore_alloc_size, m0
    s_mov_b32		exec_lo, 0xFFFFFFFF
    s_mov_b32		exec_hi, 0x00000000
    L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
    buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset	slc:1 glc:1
    s_waitcnt		vmcnt(0)																//ensure data ready
	v_movreld_b32		v0, v0																	//v[0+m0] = v0
    s_add_u32		m0, m0, 1																//next vgpr index
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128							//every buffer_load_dword does 128 bytes (shared VGPRs are loaded with 32 active lanes)
	s_cmp_lt_u32 	m0,	s_restore_alloc_size 												//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1 	L_RESTORE_SHARED_VGPR_WAVE64_LOOP														//SHARED_VGPR restore is complete?

    s_mov_b32 exec_hi, 0xFFFFFFFF                                                           //restore exec_hi before restoring V0!!

    /* VGPR restore on v0 */
  L_RESTORE_V0:
    if(USE_MTBUF_INSTEAD_OF_MUBUF)
		tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
    else
		buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save	slc:1 glc:1
	end


    /*      	restore LDS	    */
	//////////////////////////////
  L_RESTORE_LDS:

    //Only the first wave in the threadgroup needs to restore LDS
	/*      the first wave in the threadgroup    */
	s_and_b32		s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_RESTORE_SGPR

    s_mov_b32		exec_lo, 0xFFFFFFFF 													//need every thread from now on   //be consistent with SAVE although this can be moved ahead
    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_ENABLE_RESTORE_LDS_EXEC_HI
    s_mov_b32		exec_hi, 0x00000000
    s_branch        L_RESTORE_LDS_NORMAL
  L_ENABLE_RESTORE_LDS_EXEC_HI:
	s_mov_b32		exec_hi, 0xFFFFFFFF
  L_RESTORE_LDS_NORMAL:
	s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) 				//lds_size
	s_and_b32		s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF					//lds_size is zero?
	s_cbranch_scc0	L_RESTORE_SGPR															//no lds used? jump to L_RESTORE_SGPR
	s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 6 							//LDS size in dwords = lds_size * 64dw
	s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 2 							//LDS size in bytes
	s_mov_b32		s_restore_buf_rsrc2,	s_restore_alloc_size							//NUM_RECORDS in bytes
	if (SWIZZLE_EN)
		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
	end

    s_and_b32       m0, s_restore_size, 1                                                   //s_restore_size, not s_wave_size (a SAVE-path alias of ttmp6, which holds s_restore_tmp here)
    s_cmp_eq_u32    m0, 1
    s_mov_b32 		m0, 0x0
    s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64

  L_RESTORE_LDS_LOOP_W32:
	if (SAVE_LDS)
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
    s_waitcnt 0
	end
    s_add_u32		m0, m0, 128																//every buffer_load_dword does 128 bytes
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 128						//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size												//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1  L_RESTORE_LDS_LOOP_W32														//LDS restore is complete?
    s_branch        L_RESTORE_SGPR

  L_RESTORE_LDS_LOOP_W64:
	if (SAVE_LDS)
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
    s_waitcnt 0
	end
    s_add_u32		m0, m0, 256																//every buffer_load_dword does 256 bytes
	s_add_u32		s_restore_mem_offset, s_restore_mem_offset, 256							//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size												//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1  L_RESTORE_LDS_LOOP_W64														//LDS restore is complete?


    /*      	restore SGPRs	    */
	//////////////////////////////
	//s_getreg_b32 	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) 				//sgpr_size
	//s_add_u32 		s_restore_alloc_size, s_restore_alloc_size, 1
	//s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 4 							//Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
	//s_lshl_b32 		s_restore_alloc_size, s_restore_alloc_size, 3 							//Number of SGPRs = (sgpr_size + 1) * 8   (non-zero value)
  L_RESTORE_SGPR:
    //check whether it is wave32 or wave64
    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_RESTORE_SGPR_VMEM_WAVE64
	if (SGPR_SAVE_USE_SQC)
		s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 2						//NUM_RECORDS in bytes
	else
        s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 7						//NUM_RECORDS in bytes (32 threads)
    end
    s_branch        L_RESTORE_SGPR_CONT
  L_RESTORE_SGPR_VMEM_WAVE64:
    if (SGPR_SAVE_USE_SQC)
		s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 2						//NUM_RECORDS in bytes
	else
		s_lshl_b32		s_restore_buf_rsrc2,	s_sgpr_save_num, 8						//NUM_RECORDS in bytes (64 threads)
	end

  L_RESTORE_SGPR_CONT:
	if (SWIZZLE_EN)
		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
	end

    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_RESTORE_SGPR_WAVE64

    read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)		//load the saved s0 into s_restore_tmp
	s_mov_b32 		m0, 0x1

  L_RESTORE_SGPR_LOOP_WAVE32:
    read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)															//PV: further performance improvement can be made
	s_waitcnt		lgkmcnt(0)																//ensure data ready
	s_movreld_b32 	s0, s0                                                                  //s[0+m0] = s0
    s_nop 0                                                                                 // hazard SALU M0=> S_MOVREL
	s_add_u32		m0, m0, 1																//next sgpr index
	s_cmp_lt_u32 	m0, s_sgpr_save_num												//scc = (m0 < s_sgpr_save_num) ? 1 : 0
	s_cbranch_scc1 	L_RESTORE_SGPR_LOOP_WAVE32														//SGPR restore (except s0) is complete?
	s_mov_b32		s0, s_restore_tmp															/* SGPR restore on s0 */
    s_branch        L_RESTORE_HWREG

  L_RESTORE_SGPR_WAVE64:
	read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)		//load the saved s0 into s_restore_tmp
	s_mov_b32 		m0, 0x1																				//SGPR initial index value =1	//continue with s1

  L_RESTORE_SGPR_LOOP_WAVE64:
	read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)															//PV: further performance improvement can be made
	s_waitcnt		lgkmcnt(0)																//ensure data ready
	s_movreld_b32 	s0, s0                                                                  //s[0+m0] = s0
    s_nop 0                                                                                 // hazard SALU M0=> S_MOVREL
	s_add_u32		m0, m0, 1																//next sgpr index
	s_cmp_lt_u32 	m0, s_sgpr_save_num												//scc = (m0 < s_sgpr_save_num) ? 1 : 0
	s_cbranch_scc1 	L_RESTORE_SGPR_LOOP_WAVE64														//SGPR restore (except s0) is complete?
	s_mov_b32		s0, s_restore_tmp															/* SGPR restore on s0 */


    /* 		restore HW registers	*/
	//////////////////////////////
  L_RESTORE_HWREG:
    s_mov_b32		s_restore_buf_rsrc2, 0x4												//NUM_RECORDS	in bytes
	if (SWIZZLE_EN)
		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
	end

    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_RESTORE_HWREG_WAVE64

    read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//M0
	read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//PC
	read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
	read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//EXEC
	read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
	read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//STATUS
	read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//TRAPSTS
    //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_LO
	//read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_HI
    read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK
	read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//MODE
    if(SAVE_RESTORE_HWID_DDID)
    read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//HW_ID1
    end
    s_branch        L_RESTORE_HWREG_FINISH

  L_RESTORE_HWREG_WAVE64:
	read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//M0
	read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//PC
	read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
	read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//EXEC
	read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
	read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//STATUS
	read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//TRAPSTS
    //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_LO
	//read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK_HI
    read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)					//XNACK_MASK
	read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//MODE
    if(SAVE_RESTORE_HWID_DDID)
    read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)				//HW_ID1
    end
  L_RESTORE_HWREG_FINISH:
	s_waitcnt		lgkmcnt(0)																						//from now on, it is safe to restore STATUS and IB_STS



    if(SAVE_RESTORE_HWID_DDID)
  L_RESTORE_DDID:
    s_mov_b32      m0, s_restore_hwid1                                                      //virtual ttrace support: the save handler recorded the SE/SA/WGP/SIMD/wave of the original wave,
    s_ttracedata                                                                            //so output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore

    s_mov_b32		s_restore_buf_rsrc2, 0x4												//NUM_RECORDS	in bytes
	if (SWIZZLE_EN)
		s_add_u32		s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0						//FIXME need to use swizzle to enable bounds checking?
	else
		s_mov_b32		s_restore_buf_rsrc2,  0x1000000										//NUM_RECORDS in bytes
	end

    s_and_b32       m0, s_restore_size, 1
    s_cmp_eq_u32    m0, 1
    s_cbranch_scc1  L_RESTORE_DDID_WAVE64

    read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
    s_branch        L_RESTORE_DDID_FINISH
  L_RESTORE_DDID_WAVE64:
    read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)

  L_RESTORE_DDID_FINISH:
    s_waitcnt		lgkmcnt(0)
    //s_mov_b32      m0, s_restore_ddid
    //s_ttracedata
    if (RESTORE_DDID_IN_SGPR18)
        s_mov_b32   s18, s_restore_ddid
	end

    end

	s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff    	//pc[47:32]        //Do it here in order not to affect STATUS

	//for normal save & restore, the saved PC points to the next inst to execute, so no adjustment needs to be made; otherwise:
	if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
		s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8	  //two back-to-back s_trap are used (first for save and second for restore)
		s_addc_u32	s_restore_pc_hi, s_restore_pc_hi, 0x0		 //carry bit over
	end
	if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
		s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hacked through s_trap but restore is normal
		s_addc_u32	s_restore_pc_hi, s_restore_pc_hi, 0x0		 //carry bit over
	end

	s_mov_b32 		m0, 		s_restore_m0
	s_mov_b32 		exec_lo, 	s_restore_exec_lo
	s_mov_b32 		exec_hi, 	s_restore_exec_hi

	s_and_b32		s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
    s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask         //restore xnack_mask
	s_and_b32		s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
	s_lshr_b32		s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
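	//TRAPSTS is restored in two pieces on purpose: PRE_SAVECTX covers bits [9:0]
	//(mask 0x3FF) and POST_SAVECTX covers bits [31:11] (mask 0xFFFFF800), so bit 10
	//(SAVECTX) is never written, per the note below.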
	//s_setreg_b32 	hwreg(HW_REG_TRAPSTS), 	s_restore_trapsts      //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
	s_setreg_b32 	hwreg(HW_REG_MODE), 	s_restore_mode
	//reuse s_restore_m0 as a temp register
	s_and_b32		s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
	s_lshr_b32		s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
	s_lshl_b32		s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
	s_mov_b32		s_restore_tmp, 0x0																				//IB_STS is zero
	s_or_b32		s_restore_tmp, s_restore_tmp, s_restore_m0
	s_and_b32		s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
	s_lshr_b32		s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
	s_lshl_b32		s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
	s_or_b32		s_restore_tmp, s_restore_tmp, s_restore_m0
    s_and_b32       s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
    s_lshr_b32		s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
	s_setreg_b32 	hwreg(HW_REG_IB_STS), 	s_restore_tmp
	s_setreg_b32 	hwreg(HW_REG_STATUS), 	s_restore_status

	s_barrier													//barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time


//	s_rfe_b64 s_restore_pc_lo                              		//Return to the main shader program and resume execution
    s_rfe_b64  s_restore_pc_lo            // s_restore_m0[0] is used to set STATUS.inst_atc


/**************************************************************************/
/*                              the END                                   */
/**************************************************************************/
L_END_PGM:
	s_endpgm

end


/**************************************************************************/
/*                        the helper functions                            */
/**************************************************************************/
//store one SGPR to the save area and advance s_mem_offset (wave32 variant)
function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
	if (use_sqc)
		s_mov_b32 exec_lo, m0					//assuming exec_lo is not needed anymore from this point on
		s_mov_b32 m0, s_mem_offset
		s_buffer_store_dword s, s_rsrc, m0		glc:1
		s_add_u32		s_mem_offset, s_mem_offset, 4
		s_mov_b32	m0, exec_lo
    elsif (use_mtbuf)
        v_mov_b32	v0,	s
        tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
		s_add_u32		s_mem_offset, s_mem_offset, 128
    else
        v_mov_b32	v0,	s
		buffer_store_dword	v0, v0, s_rsrc, s_mem_offset	slc:1 glc:1
        s_add_u32		s_mem_offset, s_mem_offset, 128
	end
end

//store one SGPR to the save area and advance s_mem_offset (wave64 variant)
function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
	if (use_sqc)
		s_mov_b32 exec_lo, m0					//assuming exec_lo is not needed anymore from this point on
		s_mov_b32 m0, s_mem_offset
		s_buffer_store_dword s, s_rsrc, m0		glc:1
		s_add_u32		s_mem_offset, s_mem_offset, 4
		s_mov_b32	m0, exec_lo
    elsif (use_mtbuf)
        v_mov_b32	v0,	s
        tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
		s_add_u32		s_mem_offset, s_mem_offset, 256
    else
        v_mov_b32	v0,	s
		buffer_store_dword	v0, v0, s_rsrc, s_mem_offset	slc:1 glc:1
        s_add_u32		s_mem_offset, s_mem_offset, 256
	end
end

//load one SGPR from the save area and advance s_mem_offset (wave32 variant)
function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc)
	s_buffer_load_dword s, s_rsrc, s_mem_offset		glc:1
	if (use_sqc)
		s_add_u32		s_mem_offset, s_mem_offset, 4
	else
        s_add_u32		s_mem_offset, s_mem_offset, 128
	end
end

//load one SGPR from the save area and advance s_mem_offset (wave64 variant)
function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc)
	s_buffer_load_dword s, s_rsrc, s_mem_offset		glc:1
	if (use_sqc)
		s_add_u32		s_mem_offset, s_mem_offset, 4
	else
        s_add_u32		s_mem_offset, s_mem_offset, 256
	end
end
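
//Stride summary for the helpers above: the SQC path advances 4 bytes per SGPR (one
//scalar dword), while the vmem paths advance lanes * 4 bytes per SGPR (128 bytes for
//wave32, 256 bytes for wave64), matching the NUM_RECORDS sizing at the call sites.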