/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/* To compile this assembly code:
 *
 * Navi1x:
 *   cpp -DASIC_FAMILY=CHIP_NAVI10 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3
 *   sp3 nv1x.sp3 -hex nv1x.hex
 *
 * gfx10:
 *   cpp -DASIC_FAMILY=CHIP_SIENNA_CICHLID cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3
 *   sp3 gfx10.sp3 -hex gfx10.hex
 *
 * gfx11:
 *   cpp -DASIC_FAMILY=CHIP_PLUM_BONITO cwsr_trap_handler_gfx10.asm -P -o gfx11.sp3
 *   sp3 gfx11.sp3 -hex gfx11.hex
 */

#define CHIP_NAVI10 26
#define CHIP_SIENNA_CICHLID 30
#define CHIP_PLUM_BONITO 36

#define NO_SQC_STORE (ASIC_FAMILY >= CHIP_SIENNA_CICHLID)
#define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID)
#define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO)
#define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO)
#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO)
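// Feature gates derived from ASIC_FAMILY (a summary of how they are used below):
//  NO_SQC_STORE         - scalar memory stores are unavailable; SGPR/HWREG
//                         state is staged through VGPR lanes and stored via TCP.
//  HAVE_XNACK           - XNACK replay state exists; IB_STS must be saved and
//                         cleared around the handler.
//  HAVE_SENDMSG_RTN     - s_sendmsg_rtn_b64 replaces getreg/sendmsg sequences.
//  HAVE_BUFFER_LDS_LOAD - buffer loads can write LDS directly (lds:1).
//  SW_SA_TRAP           - the trap may be entered before the SAVECTX exception
//                         is raised (see the ttmp1[31] spin-wait below).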

var SINGLE_STEP_MISSED_WORKAROUND		= 1	//workaround for lost MODE.DEBUG_EN exception when SAVECTX raised

var SQ_WAVE_STATUS_SPI_PRIO_MASK		= 0x00000006
var SQ_WAVE_STATUS_HALT_MASK			= 0x2000
var SQ_WAVE_STATUS_ECC_ERR_MASK			= 0x20000
var SQ_WAVE_STATUS_TRAP_EN_SHIFT		= 6

var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 8
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT	= 24
var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE	= 4
var SQ_WAVE_IB_STS2_WAVE64_SHIFT		= 11
var SQ_WAVE_IB_STS2_WAVE64_SIZE			= 1

#if ASIC_FAMILY < CHIP_PLUM_BONITO
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 8
#else
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 12
#endif

var SQ_WAVE_TRAPSTS_SAVECTX_MASK		= 0x400
var SQ_WAVE_TRAPSTS_EXCP_MASK			= 0x1FF
var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT		= 10
var SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK		= 0x80
var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT		= 7
var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK		= 0x100
var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT		= 8
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK		= 0x3FF
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT		= 0x0
var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE		= 10
var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK		= 0xFFFFF800
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT		= 11
var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE		= 21
var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK		= 0x800
var SQ_WAVE_TRAPSTS_EXCP_HI_MASK		= 0x7000

var SQ_WAVE_MODE_EXCP_EN_SHIFT			= 12
var SQ_WAVE_MODE_EXCP_EN_ADDR_WATCH_SHIFT	= 19

var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT		= 15
var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT		= 25
var SQ_WAVE_IB_STS_REPLAY_W64H_MASK		= 0x02000000
var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK	= 0x003F8000

var SQ_WAVE_MODE_DEBUG_EN_MASK			= 0x800

// bits [31:24] unused by SPI debug data
var TTMP11_SAVE_REPLAY_W64H_SHIFT		= 31
var TTMP11_SAVE_REPLAY_W64H_MASK		= 0x80000000
var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT		= 24
var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK		= 0x7F000000
var TTMP11_DEBUG_TRAP_ENABLED_SHIFT		= 23
var TTMP11_DEBUG_TRAP_ENABLED_MASK		= 0x800000

// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
var S_SAVE_BUF_RSRC_WORD1_STRIDE		= 0x00040000
var S_SAVE_BUF_RSRC_WORD3_MISC			= 0x10807FAC
var S_SAVE_PC_HI_TRAP_ID_MASK			= 0x00FF0000
var S_SAVE_PC_HI_HT_MASK			= 0x01000000
var S_SAVE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT		= 26

var S_SAVE_PC_HI_FIRST_WAVE_MASK		= 0x80000000
var S_SAVE_PC_HI_FIRST_WAVE_SHIFT		= 31

var s_sgpr_save_num				= 108
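// The save/restore sequences below cover s0-s107: 96 SGPRs in the 16-SGPR
// loop plus a final batch of 12. s108-s127 are not part of the saved context.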

var s_save_spi_init_lo				= exec_lo
var s_save_spi_init_hi				= exec_hi
var s_save_pc_lo				= ttmp0
var s_save_pc_hi				= ttmp1
var s_save_exec_lo				= ttmp2
var s_save_exec_hi				= ttmp3
var s_save_status				= ttmp12
var s_save_trapsts				= ttmp15
var s_save_xnack_mask				= s_save_trapsts
var s_wave_size					= ttmp7
var s_save_buf_rsrc0				= ttmp8
var s_save_buf_rsrc1				= ttmp9
var s_save_buf_rsrc2				= ttmp10
var s_save_buf_rsrc3				= ttmp11
var s_save_mem_offset				= ttmp4
var s_save_alloc_size				= s_save_trapsts
var s_save_tmp					= ttmp14
var s_save_m0					= ttmp5
var s_save_ttmps_lo				= s_save_tmp
var s_save_ttmps_hi				= s_save_trapsts

var S_RESTORE_BUF_RSRC_WORD1_STRIDE		= S_SAVE_BUF_RSRC_WORD1_STRIDE
var S_RESTORE_BUF_RSRC_WORD3_MISC		= S_SAVE_BUF_RSRC_WORD3_MISC

var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
var S_WAVE_SIZE					= 25

var s_restore_spi_init_lo			= exec_lo
var s_restore_spi_init_hi			= exec_hi
var s_restore_mem_offset			= ttmp12
var s_restore_alloc_size			= ttmp3
var s_restore_tmp				= ttmp2
var s_restore_mem_offset_save			= s_restore_tmp
var s_restore_m0				= s_restore_alloc_size
var s_restore_mode				= ttmp7
var s_restore_flat_scratch			= s_restore_tmp
var s_restore_pc_lo				= ttmp0
var s_restore_pc_hi				= ttmp1
var s_restore_exec_lo				= ttmp4
var s_restore_exec_hi				= ttmp5
var s_restore_status				= ttmp14
var s_restore_trapsts				= ttmp15
var s_restore_xnack_mask			= ttmp13
var s_restore_buf_rsrc0				= ttmp8
var s_restore_buf_rsrc1				= ttmp9
var s_restore_buf_rsrc2				= ttmp10
var s_restore_buf_rsrc3				= ttmp11
var s_restore_size				= ttmp6
var s_restore_ttmps_lo				= s_restore_tmp
var s_restore_ttmps_hi				= s_restore_alloc_size

shader main
	asic(DEFAULT)
	type(CS)
	wave_size(32)

	s_branch	L_SKIP_RESTORE						//NOT a restore; might be a regular trap or save

L_JUMP_TO_RESTORE:
	s_branch	L_RESTORE

L_SKIP_RESTORE:
	s_getreg_b32	s_save_status, hwreg(HW_REG_STATUS)			//save STATUS since we will change SCC

	// Clear SPI_PRIO: do not save with elevated priority.
	// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
	s_andn2_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK

	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)

#if SW_SA_TRAP
	// If ttmp1[31] is set then trap may occur early.
	// Spin wait until SAVECTX exception is raised.
	s_bitcmp1_b32	s_save_pc_hi, 31
	s_cbranch_scc1  L_CHECK_SAVE
#endif

	s_and_b32       ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
	s_cbranch_scc0	L_NOT_HALTED

L_HALTED:
	// Host trap may occur while wave is halted.
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_SAVE:
	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK
	s_cbranch_scc1	L_SAVE

	// Wave is halted but neither host trap nor SAVECTX is raised.
	// Caused by instruction fetch memory violation.
	// Spin wait until context saved to prevent interrupt storm.
	s_sleep		0x10
	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
	s_branch	L_CHECK_SAVE

L_NOT_HALTED:
	// Let second-level handle non-SAVECTX exception or trap.
	// Any concurrent SAVECTX will be handled upon re-entry once halted.

	// Check non-maskable exceptions. memory_violation, illegal_instruction
	// and xnack_error exceptions always cause the wave to enter the trap
	// handler.
	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

	// Check for maskable exceptions in trapsts.excp and trapsts.excp_hi.
	// Maskable exceptions only cause the wave to enter the trap handler if
	// their respective bit in mode.excp_en is set.
	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCP_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
	s_cbranch_scc0	L_CHECK_TRAP_ID

	s_and_b32	ttmp3, s_save_trapsts, SQ_WAVE_TRAPSTS_ADDR_WATCH_MASK|SQ_WAVE_TRAPSTS_EXCP_HI_MASK
	s_cbranch_scc0	L_NOT_ADDR_WATCH
	s_bitset1_b32	ttmp2, SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT // Check all addr_watch[123] exceptions against excp_en.addr_watch

L_NOT_ADDR_WATCH:
	s_getreg_b32	ttmp3, hwreg(HW_REG_MODE)
	s_lshl_b32	ttmp2, ttmp2, SQ_WAVE_MODE_EXCP_EN_SHIFT
	s_and_b32	ttmp2, ttmp2, ttmp3
	s_cbranch_scc1	L_FETCH_2ND_TRAP

L_CHECK_TRAP_ID:
	// Check trap_id != 0
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_TRAP_ID_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP

if SINGLE_STEP_MISSED_WORKAROUND
	// Prioritize single step exception over context save.
	// Second-level trap will halt wave and RFE, re-entering for SAVECTX.
	s_getreg_b32	ttmp2, hwreg(HW_REG_MODE)
	s_and_b32	ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
	s_cbranch_scc1	L_FETCH_2ND_TRAP
end

	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK
	s_cbranch_scc1	L_SAVE

L_FETCH_2ND_TRAP:
#if HAVE_XNACK
	save_and_clear_ib_sts(ttmp14, ttmp15)
#endif

	// Read second-level TBA/TMA from first-level TMA and jump if available.
	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
	// ttmp12 holds SQ_WAVE_STATUS
#if HAVE_SENDMSG_RTN
	s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
	s_waitcnt       lgkmcnt(0)
#else
	s_getreg_b32	ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
	s_getreg_b32	ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
#endif
	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
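	// The queried TMA value is assumed to be reported in 256-byte units;
	// the shift left by 8 forms the byte address of the TMA buffer.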

	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 glc:1			// debug trap enabled flag
	s_waitcnt       lgkmcnt(0)
	s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
	s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
	s_or_b32        ttmp11, ttmp11, ttmp2

	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1		// second-level TBA
	s_waitcnt	lgkmcnt(0)
	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1		// second-level TMA
	s_waitcnt	lgkmcnt(0)

	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler has not been set
	s_setpc_b64	[ttmp2, ttmp3]						// jump to second-level trap handler

L_NO_NEXT_TRAP:
	// If not caused by trap then halt wave to prevent re-entry.
	s_and_b32	ttmp2, s_save_pc_hi, (S_SAVE_PC_HI_TRAP_ID_MASK|S_SAVE_PC_HI_HT_MASK)
	s_cbranch_scc1	L_TRAP_CASE
	s_or_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

	// If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set.
	// Rewind the PC to prevent this from occurring.
	s_sub_u32	ttmp0, ttmp0, 0x8
	s_subb_u32	ttmp1, ttmp1, 0x0

	s_branch	L_EXIT_TRAP

L_TRAP_CASE:
	// Host trap will not cause trap re-entry.
	s_and_b32	ttmp2, s_save_pc_hi, S_SAVE_PC_HI_HT_MASK
	s_cbranch_scc1	L_EXIT_TRAP

	// Advance past trap instruction to prevent re-entry.
	s_add_u32	ttmp0, ttmp0, 0x4
	s_addc_u32	ttmp1, ttmp1, 0x0

L_EXIT_TRAP:
	s_and_b32	ttmp1, ttmp1, 0xFFFF

#if HAVE_XNACK
	restore_ib_sts(ttmp14, ttmp15)
#endif

	// Restore SQ_WAVE_STATUS.
	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
	s_setreg_b32	hwreg(HW_REG_STATUS), s_save_status

	s_rfe_b64	[ttmp0, ttmp1]

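/* Context save area layout, as implied by the offsets computed below
 * (a summary of this code, not an authoritative spec):
 *
 *   [ VGPRs | shared VGPRs | SGPRs (512B) | HWREGs + ttmps (128B) | LDS ]
 *
 * The ttmp block begins 0x40 bytes into the HWREG region.
 */
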
L_SAVE:
	s_and_b32	s_save_pc_hi, s_save_pc_hi, 0x0000ffff			//pc[47:32]
	s_mov_b32	s_save_tmp, 0
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp	//clear saveCtx bit

#if HAVE_XNACK
	save_and_clear_ib_sts(s_save_tmp, s_save_trapsts)
#endif

	/* inform SPI of our readiness and wait for SPI's go signal */
	s_mov_b32	s_save_exec_lo, exec_lo					//save EXEC and use EXEC for the go signal from SPI
	s_mov_b32	s_save_exec_hi, exec_hi
	s_mov_b64	exec, 0x0						//clear EXEC to get ready to receive

#if HAVE_SENDMSG_RTN
	s_sendmsg_rtn_b64       [exec_lo, exec_hi], sendmsg(MSG_RTN_SAVE_WAVE)
#else
	s_sendmsg	sendmsg(MSG_SAVEWAVE)					//send SPI a message and wait for SPI's write to EXEC
#endif

#if ASIC_FAMILY < CHIP_SIENNA_CICHLID
L_SLEEP:
	// sleep 1 (64clk) is not enough for 8 waves per SIMD and can hang SQ:
	// the 7th/8th waves cannot win arbitration to execute an instruction
	// while the other waves sit in the sleep loop waiting for wrexec != 0.
	s_sleep		0x2
	s_cbranch_execz	L_SLEEP
#else
	s_waitcnt	lgkmcnt(0)
#endif

	// Save first_wave flag so we can clear high bits of save address.
	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
	s_lshl_b32	s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT - S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp

#if NO_SQC_STORE
	// Trap temporaries must be saved via VGPR but all VGPRs are in use.
	// There is no ttmp space to hold the resource constant for VGPR save.
	// Save v0 by itself since it requires only two SGPRs.
	s_mov_b32	s_save_ttmps_lo, exec_lo
	s_and_b32	s_save_ttmps_hi, exec_hi, 0xFFFF
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0xFFFFFFFF
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] slc:1 glc:1
	v_mov_b32	v0, 0x0
	s_mov_b32	exec_lo, s_save_ttmps_lo
	s_mov_b32	exec_hi, s_save_ttmps_hi
#endif

	// Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
	get_wave_size(s_save_ttmps_hi)
	get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi)
	get_svgpr_size_bytes(s_save_ttmps_hi)
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
	s_and_b32	s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes()
	s_add_u32	s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
	s_addc_u32	s_save_ttmps_hi, s_save_ttmps_hi, 0x0

#if NO_SQC_STORE
	v_writelane_b32	v0, ttmp4, 0x4
	v_writelane_b32	v0, ttmp5, 0x5
	v_writelane_b32	v0, ttmp6, 0x6
	v_writelane_b32	v0, ttmp7, 0x7
	v_writelane_b32	v0, ttmp8, 0x8
	v_writelane_b32	v0, ttmp9, 0x9
	v_writelane_b32	v0, ttmp10, 0xA
	v_writelane_b32	v0, ttmp11, 0xB
	v_writelane_b32	v0, ttmp13, 0xD
	v_writelane_b32	v0, exec_lo, 0xE
	v_writelane_b32	v0, exec_hi, 0xF

	s_mov_b32	exec_lo, 0x3FFF
	s_mov_b32	exec_hi, 0x0
	global_store_dword_addtid	v0, [s_save_ttmps_lo, s_save_ttmps_hi] inst_offset:0x40 slc:1 glc:1
	v_readlane_b32	ttmp14, v0, 0xE
	v_readlane_b32	ttmp15, v0, 0xF
	s_mov_b32	exec_lo, ttmp14
	s_mov_b32	exec_hi, ttmp15
#else
	s_store_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
	s_store_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
	s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
#endif
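	// Lane n of the addtid store above writes at inst_offset + n*4, so
	// ttmp4 in lane 0x4 lands at 0x40 + 0x10 = 0x50, matching the
	// 0x50/0x60/0x74 offsets used by the s_store path.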

	/* setup Resource Constants */
	s_mov_b32	s_save_buf_rsrc0, s_save_spi_init_lo			//base_addr_lo
	s_and_b32	s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_save_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
	s_mov_b32	s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC

	s_mov_b32	s_save_m0, m0

	/* global mem offset */
	s_mov_b32	s_save_mem_offset, 0x0
	get_wave_size(s_wave_size)

#if HAVE_XNACK
	// Save and clear vector XNACK state late to free up SGPRs.
	s_getreg_b32	s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
	s_setreg_imm32_b32	hwreg(HW_REG_SHADER_XNACK_MASK), 0x0
#endif

	/* save first 4 VGPRs, needed for SGPR save */
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_4VGPR_WAVE32
L_ENABLE_SAVE_4VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
	s_branch	L_SAVE_4VGPR_WAVE64
L_SAVE_4VGPR_WAVE32:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

#if !NO_SQC_STORE
	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
	s_branch	L_SAVE_HWREG

L_SAVE_4VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

#if !NO_SQC_STORE
	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#endif
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
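	// The offset:128 / offset:256 strides are one row of dwords per wave:
	// 32 lanes * 4 bytes in wave32, 64 lanes * 4 bytes in wave64.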

	/* save HW registers */

L_SAVE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

#if NO_SQC_STORE
	v_mov_b32	v0, 0x0							//Offset[31:0] from buffer resource
	v_mov_b32	v1, 0x0							//Offset[63:32] from buffer resource
	v_mov_b32	v2, 0x0							//Set of SGPRs for TCP store
	s_mov_b32	m0, 0x0							//Next lane of v2 to write to
#endif

	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
	s_andn2_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
	write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_tmp, hwreg(HW_REG_TRAPSTS)
	write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset)

	// Not used on Sienna_Cichlid but keep layout same for debugger.
	write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_MODE)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

	s_getreg_b32	s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
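	// 11 dwords of HWREG state (m0, pc, exec, status, trapsts, xnack_mask,
	// mode, flat_scratch) now occupy the start of the 128-byte HWREG
	// region; the ttmps saved earlier sit at +0x40 within it.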

#if NO_SQC_STORE
	// Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
	s_mov_b32       exec_lo, 0xFFFF
	s_mov_b32	exec_hi, 0x0
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

	// Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
	s_mov_b32       exec_lo, 0xFFFFFFFF
#endif

	/* save SGPRs */
	// Save SGPRs before the LDS save, so that low SGPRs (e.g. s3) can be reused during it...

	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

#if NO_SQC_STORE
	s_mov_b32	ttmp13, 0x0						//next VGPR lane to copy SGPR into
#else
	// back up s_save_buf_rsrc0 into s_save_xnack_mask, since write_16sgpr_to_mem will advance rsrc0
	s_mov_b32	s_save_xnack_mask, s_save_buf_rsrc0
	s_add_u32	s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
	s_addc_u32	s_save_buf_rsrc1, s_save_buf_rsrc1, 0
#endif

	s_mov_b32	m0, 0x0							//SGPR initial index value =0
	s_nop		0x0							//Manually inserted wait states
L_SAVE_SGPR_LOOP:
	// SGPR is allocated in 16 SGPR granularity
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	s_movrels_b64	s12, s12						//s12 = s[12+m0], s13 = s[13+m0]
	s_movrels_b64	s14, s14						//s14 = s[14+m0], s15 = s[15+m0]

	write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
	s_cmp_eq_u32	ttmp13, 0x20						//have 32 VGPR lanes filled?
	s_cbranch_scc0	L_SAVE_SGPR_SKIP_TCP_STORE

	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 0x80
	s_mov_b32	ttmp13, 0x0
	v_mov_b32	v2, 0x0
L_SAVE_SGPR_SKIP_TCP_STORE:
#endif

	s_add_u32	m0, m0, 16						//next sgpr index
	s_cmp_lt_u32	m0, 96							//scc = (m0 < 96) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SGPR_LOOP					//first 96 SGPR save is complete?

	//save the remaining 12 SGPRs
	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
	write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)

#if NO_SQC_STORE
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
#else
	// restore s_save_buf_rsrc0,1
	s_mov_b32	s_save_buf_rsrc0, s_save_xnack_mask
#endif
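	// 96 SGPRs were written by the loop above plus 12 more here: 108 dwords
	// (432 bytes) of the 512-byte SGPR region (see get_sgpr_size_bytes).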

	/* save LDS */

L_SAVE_LDS:
	// Change EXEC to all threads...
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_LDS_NORMAL
L_ENABLE_SAVE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_LDS_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_SAVE_LDS_DONE						//no lds used? jump to L_SAVE_LDS_DONE

	s_barrier								//LDS is used? wait for other waves in the same TG
	s_and_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
	s_cbranch_scc0	L_SAVE_LDS_DONE

	// first wave do LDS save;

	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 6			//LDS size in dwords = lds_size * 64dw
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//LDS size in bytes
	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
	get_svgpr_size_bytes(s_save_tmp)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	//compute per-lane byte offsets (lane_id * 4) into v0
	v_mbcnt_lo_u32_b32	v0, -1, 0
	v_mbcnt_hi_u32_b32	v0, -1, v0
	v_mul_u32_u24	v0, 4, v0

	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_SAVE_LDS_W64

L_SAVE_LDS_W32:
	s_mov_b32	s3, 128
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W32:
	ds_read_b32	v1, v0
	s_waitcnt	0
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

	s_add_u32	m0, m0, s3						//every iteration saves 128 bytes (32 lanes x 4 bytes)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 128						//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32					//LDS save is complete?

	s_branch	L_SAVE_LDS_DONE

L_SAVE_LDS_W64:
	s_mov_b32	s3, 256
	s_nop		0
	s_nop		0
	s_nop		0
L_SAVE_LDS_LOOP_W64:
	ds_read_b32	v1, v0
	s_waitcnt	0
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1

	s_add_u32	m0, m0, s3						//every iteration saves 256 bytes (64 lanes x 4 bytes)
	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
	v_add_nc_u32	v0, v0, 256						//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64					//LDS save is complete?
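
	// In both loops m0 counts the LDS bytes already saved; each iteration
	// moves one dword per lane until m0 reaches the LDS size in bytes.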

L_SAVE_LDS_DONE:
	/* save VGPRs - save the rest of the VGPRs */
L_SAVE_VGPR:
	// VGPR SR memory offset: 0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
	s_mov_b32	s_save_mem_offset, (0+128*4)				// for the rest VGPRs
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_SAVE_VGPR_NORMAL
L_ENABLE_SAVE_VGPR_EXEC_HI:
	s_mov_b32	s_save_mem_offset, (0+256*4)				// for the rest VGPRs
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_SAVE_VGPR_NORMAL:
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_SAVE_VGPR_WAVE64

	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR Allocated in 4-GPR granularity

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_VGPR_END

L_SAVE_VGPR_W32_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4		//every buffer_store_dword does 128 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP					//VGPR save is complete?

	s_branch	L_SAVE_VGPR_END

L_SAVE_VGPR_WAVE64:
	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR store using dw burst
	s_mov_b32	m0, 0x4							//VGPR initial index value =4
	s_cmp_lt_u32	m0, s_save_alloc_size
	s_cbranch_scc0	L_SAVE_SHARED_VGPR

L_SAVE_VGPR_W64_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	v_movrels_b32	v1, v1							//v1 = v[1+m0]
	v_movrels_b32	v2, v2							//v2 = v[2+m0]
	v_movrels_b32	v3, v3							//v3 = v[3+m0]

	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3

	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4		//every buffer_store_dword does 256 bytes
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP					//VGPR save is complete?

L_SAVE_SHARED_VGPR:
	//Save the shared VGPRs (new for gfx10)
	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_SAVE_VGPR_END						//no shared_vgpr used? jump to L_SAVE_VGPR_END
	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 3			//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
	//m0 currently holds the normal VGPR count; add the shared VGPR count to
	//get the total count. The shared VGPR save starts at index m0.
	s_add_u32	s_save_alloc_size, s_save_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_SAVE_SHARED_VGPR_WAVE64_LOOP:
	v_movrels_b32	v0, v0							//v0 = v[0+m0]
	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
	s_add_u32	m0, m0, 1						//next vgpr index
	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128
	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_SAVE_SHARED_VGPR_WAVE64_LOOP				//SHARED_VGPR save is complete?

L_SAVE_VGPR_END:
	s_branch	L_END_PGM

L_RESTORE:
	/* Setup Resource Constants */
	s_mov_b32	s_restore_buf_rsrc0, s_restore_spi_init_lo		//base_addr_lo
	s_and_b32	s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	//base_addr_hi
	s_or_b32	s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
	s_mov_b32	s_restore_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes)
	s_mov_b32	s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC

	//determine it is wave32 or wave64
	get_wave_size(s_restore_size)

	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
	s_cbranch_scc0	L_RESTORE_VGPR

	/* restore LDS */
L_RESTORE_LDS:
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_LDS_NORMAL
L_ENABLE_RESTORE_LDS_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_LDS_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6		//LDS size in dwords = lds_size * 64dw
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//LDS size in bytes
	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes

	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
	//
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_mov_b32	m0, 0x0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64

L_RESTORE_LDS_LOOP_W32:
#if HAVE_BUFFER_LDS_LOAD
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
#else
	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
	s_waitcnt	vmcnt(0)
	ds_store_addtid_b32     v0
#endif
	s_add_u32	m0, m0, 128						//every iteration restores 128 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		//mem offset increased by 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32					//LDS restore is complete?
	s_branch	L_RESTORE_VGPR

L_RESTORE_LDS_LOOP_W64:
#if HAVE_BUFFER_LDS_LOAD
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
#else
	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
	s_waitcnt	vmcnt(0)
	ds_store_addtid_b32     v0
#endif
	s_add_u32	m0, m0, 256						//every iteration restores 256 bytes
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		//mem offset increased by 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64					//LDS restore is complete?

	/* restore VGPRs */
L_RESTORE_VGPR:
	// VGPR SR memory offset : 0
	s_mov_b32	s_restore_mem_offset, 0x0
	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
	s_mov_b32	exec_hi, 0x00000000
	s_branch	L_RESTORE_VGPR_NORMAL
L_ENABLE_RESTORE_VGPR_EXEC_HI:
	s_mov_b32	exec_hi, 0xFFFFFFFF
L_RESTORE_VGPR_NORMAL:
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
	//determine it is wave32 or wave64
	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
	s_and_b32	m0, m0, 1
	s_cmp_eq_u32	m0, 1
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4
	s_cmp_lt_u32	m0, s_restore_alloc_size
	s_cbranch_scc0	L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE32_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0) is complete?

	/* VGPR restore on v0 */
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
	s_waitcnt	vmcnt(0)

	s_branch	L_RESTORE_SGPR

L_RESTORE_VGPR_WAVE64:
	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	// VGPR load using dw burst
	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts with v4, v0 will be the last
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
	s_mov_b32	m0, 4							//VGPR initial index value = 4
	s_cmp_lt_u32	m0, s_restore_alloc_size
	s_cbranch_scc0	L_RESTORE_SHARED_VGPR

L_RESTORE_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	v_movreld_b32	v1, v1
	v_movreld_b32	v2, v2
	v_movreld_b32	v3, v3
	s_add_u32	m0, m0, 4						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0) is complete?

L_RESTORE_SHARED_VGPR:
	//Restore the shared VGPRs (new for gfx10)
	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
	s_cbranch_scc0	L_RESTORE_V0						//no shared_vgpr used?
	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 3		//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
	//m0 currently holds the normal VGPR count; add the shared VGPR count to
	//get the total count. The shared VGPR restore starts at index m0.
	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, m0
	s_mov_b32	exec_lo, 0xFFFFFFFF
	s_mov_b32	exec_hi, 0x00000000
L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
	s_waitcnt	vmcnt(0)
	v_movreld_b32	v0, v0							//v[0+m0] = v0
	s_add_u32	m0, m0, 1						//next vgpr index
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
	s_cbranch_scc1	L_RESTORE_SHARED_VGPR_WAVE64_LOOP			//SHARED_VGPR restore is complete?

	s_mov_b32	exec_hi, 0xFFFFFFFF					//restore exec_hi before restoring v0

	/* VGPR restore on v0 */
L_RESTORE_V0:
	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
	s_waitcnt	vmcnt(0)

	/* restore SGPRs */
	// 108 SGPRs are restored in batches of 4 + 8 + 16*6
	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
L_RESTORE_SGPR:
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 20*4	//s108-s127 are not saved

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	s_mov_b32	m0, s_sgpr_save_num

	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 4						// restore s[104:107] (m0 = 108 - 4)
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2

	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 8						// restore s[96:103] (m0 = 104 - 8)
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6

L_RESTORE_SGPR_LOOP:
	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_sub_u32	m0, m0, 16						// restore s[m0:m0+15] for m0 = 80, 64, ..., 0
	s_nop		0							// hazard SALU M0=> S_MOVREL

	s_movreld_b64	s0, s0							//s[0+m0] = s0
	s_movreld_b64	s2, s2
	s_movreld_b64	s4, s4
	s_movreld_b64	s6, s6
	s_movreld_b64	s8, s8
	s_movreld_b64	s10, s10
	s_movreld_b64	s12, s12
	s_movreld_b64	s14, s14

	s_cmp_eq_u32	m0, 0							//scc = (m0 == 0) ? 1 : 0
	s_cbranch_scc0	L_RESTORE_SGPR_LOOP

	// s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
	// Clear DEBUG_EN before and restore MODE after the barrier.
	s_setreg_imm32_b32	hwreg(HW_REG_MODE), 0
	s_barrier								//barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG

	/* restore HW registers */
L_RESTORE_HWREG:
	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
	get_svgpr_size_bytes(s_restore_tmp)
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()

	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes

	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)

	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch

	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
	s_waitcnt	lgkmcnt(0)						//from now on, it is safe to restore STATUS and IB_STS

	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch

	s_mov_b32	m0, s_restore_m0
	s_mov_b32	exec_lo, s_restore_exec_lo
	s_mov_b32	exec_hi, s_restore_exec_hi

	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0

#if HAVE_XNACK
	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
#endif

	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode

	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
	// ttmp SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)+0x40
	get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size)
	get_svgpr_size_bytes(s_restore_ttmps_hi)
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes()
	s_add_u32	s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
	s_addc_u32	s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
	s_and_b32	s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
	s_load_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
	s_load_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
	s_load_dword	ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
	s_waitcnt	lgkmcnt(0)

#if HAVE_XNACK
	restore_ib_sts(s_restore_tmp, s_restore_m0)
#endif

	s_and_b32	s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff		//pc[47:32] //Do it here in order not to affect STATUS
	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32

#if SW_SA_TRAP
	// If traps are enabled then return to the shader with PRIV=0.
	// Otherwise retain PRIV=1 for subsequent context save requests.
	s_getreg_b32	s_restore_tmp, hwreg(HW_REG_STATUS)
	s_bitcmp1_b32	s_restore_tmp, SQ_WAVE_STATUS_TRAP_EN_SHIFT
	s_cbranch_scc1	L_RETURN_WITHOUT_PRIV

	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status			// SCC is included; it was changed by the preceding SALU instructions
	s_setpc_b64	[s_restore_pc_lo, s_restore_pc_hi]
L_RETURN_WITHOUT_PRIV:
#endif

	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status			// SCC is included; it was changed by the preceding SALU instructions
	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution

L_END_PGM:
	s_endpgm
end

function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
	// Copy into VGPR for later TCP store.
	v_writelane_b32	v2, s, m0
	s_add_u32	m0, m0, 0x1
#else
	s_mov_b32	exec_lo, m0
	s_mov_b32	m0, s_mem_offset
	s_buffer_store_dword	s, s_rsrc, m0 glc:1
	s_add_u32	s_mem_offset, s_mem_offset, 4
	s_mov_b32	m0, exec_lo
#endif
end


function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++
		v_writelane_b32	v2, s[sgpr_idx], ttmp13
		s_add_u32	ttmp13, ttmp13, 0x1
	end
#else
	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
	s_buffer_store_dwordx4	s[12], s_rsrc, 48 glc:1
	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
#if NO_SQC_STORE
	// Copy into VGPR for later TCP store.
	for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++
		v_writelane_b32	v2, s[sgpr_idx], ttmp13
		s_add_u32	ttmp13, ttmp13, 0x1
	end
#else
	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
	s_add_u32	s_rsrc[0], s_rsrc[0], 4*12
	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
#endif
end

function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
	s_buffer_load_dword	s, s_rsrc, s_mem_offset glc:1
	s_add_u32	s_mem_offset, s_mem_offset, 4
end

function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
	s_buffer_load_dwordx16	s, s_rsrc, s_mem_offset glc:1
end

function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*8
	s_buffer_load_dwordx8	s, s_rsrc, s_mem_offset glc:1
end

function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
	s_sub_u32	s_mem_offset, s_mem_offset, 4*4
	s_buffer_load_dwordx4	s, s_rsrc, s_mem_offset glc:1
end


function get_lds_size_bytes(s_lds_size_byte)
	s_getreg_b32	s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8			//LDS size in bytes = lds_size * 64 DW * 4 bytes (granularity 64DW)
end

function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
	s_bitcmp1_b32	s_size, S_WAVE_SIZE
	s_cbranch_scc1	L_ENABLE_SHIFT_W64
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 32 lanes * 4 bytes (non-zero value)
	s_branch	L_SHIFT_DONE
L_ENABLE_SHIFT_W64:
	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)
L_SHIFT_DONE:
end
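
// Worked example: vgpr_size = 3 encodes (3+1)*4 = 16 VGPRs, so the save
// footprint is 16 * 32 lanes * 4 bytes = 2048 bytes in wave32
// ((3+1) << (2+7)) and 4096 bytes in wave64 ((3+1) << (2+8)).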

function get_svgpr_size_bytes(s_svgpr_size_byte)
	s_getreg_b32	s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
	s_lshl_b32	s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
end
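
// Shared VGPRs are allocated in units of 8 (see the *8 in the save/restore
// paths); the (3+7) shift scales shared_vgpr_size to bytes:
// 8 VGPRs * 32 lanes * 4 bytes = 1024 bytes per unit.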

function get_sgpr_size_bytes
	return 512
end

function get_hwreg_size_bytes
	return 128
end

function get_wave_size(s_reg)
	s_getreg_b32	s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
	s_lshl_b32	s_reg, s_reg, S_WAVE_SIZE
end

function save_and_clear_ib_sts(tmp1, tmp2)
	// Preserve and clear scalar XNACK state before issuing scalar loads.
	// Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
	// unused space ttmp11[31:24].
	s_andn2_b32	ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
	s_getreg_b32	tmp1, hwreg(HW_REG_IB_STS)
	s_and_b32	tmp2, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
	s_lshl_b32	tmp2, tmp2, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
	s_or_b32	ttmp11, ttmp11, tmp2
	s_and_b32	tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshl_b32	tmp2, tmp2, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_or_b32	ttmp11, ttmp11, tmp2
	s_andn2_b32	tmp1, tmp1, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
	s_setreg_b32	hwreg(HW_REG_IB_STS), tmp1
end
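
// Bit relocation performed above: IB_STS.REPLAY_W64H (bit 25) is parked in
// ttmp11[31], and IB_STS.RCNT/FIRST_REPLAY (bits 21:15) in ttmp11[30:24];
// restore_ib_sts reverses the shifts.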

function restore_ib_sts(tmp1, tmp2)
	s_lshr_b32	tmp1, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
	s_and_b32	tmp2, tmp1, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
	s_lshr_b32	tmp1, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
	s_and_b32	tmp1, tmp1, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
	s_or_b32	tmp1, tmp1, tmp2
	s_setreg_b32	hwreg(HW_REG_IB_STS), tmp1
end