/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#define pr_fmt(fmt) "kfd2kgd: " fmt

#include <linux/module.h>
#include <linux/fdtable.h>
#include <linux/uaccess.h>
#include <linux/firmware.h>
#include <drm/drmP.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_ucode.h"
#include "soc15_hw_ip.h"
#include "gc/gc_9_0_offset.h"
#include "gc/gc_9_0_sh_mask.h"
#include "vega10_enum.h"
#include "sdma0/sdma0_4_0_offset.h"
#include "sdma0/sdma0_4_0_sh_mask.h"
#include "sdma1/sdma1_4_0_offset.h"
#include "sdma1/sdma1_4_0_sh_mask.h"
#include "athub/athub_1_0_offset.h"
#include "athub/athub_1_0_sh_mask.h"
#include "oss/osssys_4_0_offset.h"
#include "oss/osssys_4_0_sh_mask.h"
#include "soc15_common.h"
#include "v9_structs.h"
#include "soc15.h"
#include "soc15d.h"
#include "mmhub_v1_0.h"
#include "gfxhub_v1_0.h"


#define V9_PIPE_PER_MEC		(4)
#define V9_QUEUES_PER_PIPE_MEC	(8)

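/*
 * Dequeue request types written verbatim to mmCP_HQD_DEQUEUE_REQUEST
 * by kgd_hqd_destroy() below.
 */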
enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES
};

/*
 * Register access functions
 */

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
		uint32_t sh_mem_config,
		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
		uint32_t sh_mem_bases);
static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
		unsigned int vmid);
static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm);
static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs);
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm);
static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs);
static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
		uint32_t pipe_id, uint32_t queue_id);
static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id);
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout);
static int kgd_address_watch_disable(struct kgd_dev *kgd);
static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo);
static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd);
static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset);

static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
		uint8_t vmid);
static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
		uint8_t vmid);
static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base);
static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
static void set_scratch_backing_va(struct kgd_dev *kgd,
					uint64_t va, uint32_t vmid);
static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid);

/* Because REG_GET_FIELD() is used, this function is kept in the
 * ASIC-specific file.
 */
static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
		struct tile_config *config)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	config->gb_addr_config = adev->gfx.config.gb_addr_config;

	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
	config->num_tile_configs =
			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
	config->macro_tile_config_ptr =
			adev->gfx.config.macrotile_mode_array;
	config->num_macro_tile_configs =
			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);

	return 0;
}

static const struct kfd2kgd_calls kfd2kgd = {
	.program_sh_mem_settings = kgd_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
	.init_interrupts = kgd_init_interrupts,
	.hqd_load = kgd_hqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.address_watch_disable = kgd_address_watch_disable,
	.address_watch_execute = kgd_address_watch_execute,
	.wave_control_execute = kgd_wave_control_execute,
	.address_watch_get_offset = kgd_address_watch_get_offset,
	.get_atc_vmid_pasid_mapping_pasid =
			get_atc_vmid_pasid_mapping_pasid,
	.get_atc_vmid_pasid_mapping_valid =
			get_atc_vmid_pasid_mapping_valid,
	.get_fw_version = get_fw_version,
	.set_scratch_backing_va = set_scratch_backing_va,
	.get_tile_config = amdgpu_amdkfd_get_tile_config,
	.set_vm_context_page_table_base = set_vm_context_page_table_base,
	.invalidate_tlbs = invalidate_tlbs,
	.invalidate_tlbs_vmid = invalidate_tlbs_vmid,
	.get_hive_id = amdgpu_amdkfd_get_hive_id,
};

struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
{
	return (struct kfd2kgd_calls *)&kfd2kgd;
}

static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
{
	return (struct amdgpu_device *)kgd;
}

static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
			uint32_t queue, uint32_t vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	mutex_lock(&adev->srbm_mutex);
	soc15_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	soc15_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, queue_id, 0);
}

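/*
 * Returns the single bit (within a 32-bit word) that corresponds to the
 * given pipe/queue pair. kgd_hqd_load() writes this mask to
 * mmCP_PQ_WPTR_POLL_CNTL1 to enable WPTR polling for just that queue.
 */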
static uint32_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe +
			    queue_id) & 31;

	return ((uint32_t)1) << bit;
}

static void release_queue(struct kgd_dev *kgd)
{
	unlock_srbm(kgd);
}

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	lock_srbm(kgd, 0, 0, 0, vmid);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9 */

	unlock_srbm(kgd);
}

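/*
 * Program the VMID-to-PASID mapping in both ATC register banks
 * (ATC_VMID0..15 and ATC_VMID16..31) and in the IH lookup tables,
 * waiting for the corresponding UPDATE_STATUS bit after each ATC write.
 */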
static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
					unsigned int vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
			ATC_VMID0_PASID_MAPPING__VALID_MASK;

	/*
	 * We need to program the mapping twice, once for GFX and once for
	 * MMHUB. For the ATC, add 16 to the VMID to reach the MMHUB bank;
	 * the IH block uses different registers. The ATC_VMID0..15
	 * registers are separate from ATC_VMID16..31.
	 */

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << (vmid + 16))))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << (vmid + 16));

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
	       pasid_mapping);
	return 0;
}

/* TODO: the RING0 form of the field names is obsolete (it seems to
 * date back to SI), but it still works.
 */

static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, 0, 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(kgd);

	return 0;
}

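/*
 * Per-queue SDMA register offset: start from the engine's RLC0 block
 * and add queue_id times the per-queue register stride
 * (RLC1 - RLC0). Callers add this value to mmSDMA0_RLC0_* offsets.
 */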
static uint32_t get_sdma_base_addr(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t base[2] = {
		SOC15_REG_OFFSET(SDMA0, 0,
				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
		SOC15_REG_OFFSET(SDMA1, 0,
				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL
	};
	uint32_t retval;

	retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL -
					       mmSDMA0_RLC0_RB_CNTL);

	pr_debug("sdma base address: 0x%x\n", retval);

	return retval;
}

static inline struct v9_mqd *get_mqd(void *mqd)
{
	return (struct v9_mqd *)mqd;
}

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}

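/*
 * Load an MQD into the HQD registers of the selected pipe/queue. If a
 * user-mode wptr is provided, set up a one-shot WPTR poll instead of
 * reading it directly (see the comment in the wptr branch below), then
 * activate the queue.
 */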
static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	acquire_queue(kgd, pipe_id, queue_id);

	/* HIQ is set during driver init period with vmid set to 0 */
	if (m->cp_hqd_vmid == 0) {
		uint32_t value, mec, pipe;

		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
			mec, pipe, queue_id);
		value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
			((mec << 5) | (pipe << 3) | queue_id | 0x80));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
	}

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);


	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. Set WPTR == RPTR before starting the
		 * poll so that the CP starts fetching new commands
		 * from the right place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
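		/*
		 * Illustrative example (hypothetical values): with a
		 * QUEUE_SIZE field of 9 the ring holds 2 << 9 = 1024
		 * dwords. If the saved RPTR is 100 and the saved
		 * WPTR_LO is 0x2300 (low bits 0x300, which did not
		 * wrap past the RPTR), the guess below becomes
		 * 100 + 0x2000 = 8292, plus WPTR_HI in the upper
		 * 32 bits.
		 */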
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uintptr_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uintptr_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
		       get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

	release_queue(kgd);

	return 0;
}

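/*
 * Dump the HQD registers from mmCP_MQD_BASE_ADDR through
 * mmCP_HQD_PQ_WPTR_HI as (byte offset, value) pairs into a
 * kmalloc'ed array handed back to the caller.
 */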
static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(kgd, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(kgd);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

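/*
 * Load an SDMA MQD: disable the ring buffer, wait (up to 2 seconds) for
 * the RLC context to go idle, program the doorbell, pointers and ring
 * base from the MQD, then re-enable the ring buffer.
 */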
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);
	sdmax_gfx_context_cntl = m->sdma_engine_id ?
		SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
		SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}
	data = RREG32(sdmax_gfx_context_cntl);
	data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
			     RESUME_CTX, 0);
	WREG32(sdmax_gfx_context_cntl, data);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_base_addr + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_base_addr + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
				uint32_t pipe_id, uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(kgd, pipe_id, queue_id);
	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
		   high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
			retval = true;
	}
	release_queue(kgd);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

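/*
 * Preempt the queue: issue the requested dequeue type through
 * mmCP_HQD_DEQUEUE_REQUEST and poll mmCP_HQD_ACTIVE until the queue
 * goes inactive or the timeout (in milliseconds) expires.
 */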
static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v9_mqd *m = get_mqd(mqd);

	if (adev->in_gpu_reset)
		return -EIO;

	acquire_queue(kgd, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			release_queue(kgd);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(kgd);
	return 0;
}

static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v9_sdma_mqd *m;
	uint32_t sdma_base_addr;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		usleep_range(500, 1000);
	}

	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
							uint8_t vmid)
{
	uint32_t reg;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
}

static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
								uint8_t vmid)
{
	uint32_t reg;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
}

static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	/* Use legacy mode TLB invalidation.
	 *
	 * Currently on Raven the code below is broken for anything but
	 * legacy mode due to an MMHUB power gating problem. A workaround
	 * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
	 * == PER_VMID_INVALIDATE_ACK is true instead of simply waiting for
	 * the ack bit.
	 *
	 * TODO 1: agree on the right set of invalidation registers for
	 * KFD use. Use the last one for now. Invalidate both GC and
	 * MMHUB.
	 *
	 * TODO 2: support range-based invalidation, which requires a
	 * kfd2kgd interface change.
	 */
	amdgpu_gmc_flush_gpu_tlb(adev, vmid, 0);
}

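/*
 * Flush TLBs for a PASID by submitting a PACKET3_INVALIDATE_TLBS packet
 * on the KIQ ring and polling for the fence that follows it.
 */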
static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
{
	signed long r;
	uint32_t seq;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	spin_lock(&adev->gfx.kiq.ring_lock);
	amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs packet */
	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
	amdgpu_ring_write(ring,
			PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
			PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
			PACKET3_INVALIDATE_TLBS_PASID(pasid) |
			PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */
	amdgpu_fence_emit_polling(ring, &seq);
	amdgpu_ring_commit(ring);
	spin_unlock(&adev->gfx.kiq.ring_lock);

	r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
	if (r < 1) {
		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
		return -ETIME;
	}

	return 0;
}

static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
	int vmid;
	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;

	if (adev->in_gpu_reset)
		return -EIO;

	if (ring->sched.ready)
		return invalidate_tlbs_with_kiq(adev, pasid);

	for (vmid = 0; vmid < 16; vmid++) {
		if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
			continue;
		if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
			if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
				== pasid) {
				write_vmid_invalidate_request(kgd, vmid);
				break;
			}
		}
	}

	return 0;
}

static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("non kfd vmid %d\n", vmid);
		return 0;
	}

	write_vmid_invalidate_request(kgd, vmid);
	return 0;
}

static int kgd_address_watch_disable(struct kgd_dev *kgd)
{
	return 0;
}

static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo)
{
	return 0;
}

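/*
 * Apply gfx_index_val to mmGRBM_GFX_INDEX, issue the SQ command, then
 * restore GRBM_GFX_INDEX to broadcast mode before releasing the lock.
 */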
static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SH_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset)
{
	return 0;
}

static void set_scratch_backing_va(struct kgd_dev *kgd,
					uint64_t va, uint32_t vmid)
{
	/* No longer needed on GFXv9. The scratch base address is
	 * passed to the shader by the CP. It's the user mode driver's
	 * responsibility.
	 */
}

/* FIXME: Does this need to be ASIC-specific code? */
static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
{
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
	const union amdgpu_firmware_header *hdr;

	switch (type) {
	case KGD_ENGINE_PFP:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
		break;

	case KGD_ENGINE_ME:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
		break;

	case KGD_ENGINE_CE:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
		break;

	case KGD_ENGINE_MEC1:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
		break;

	case KGD_ENGINE_MEC2:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
		break;

	case KGD_ENGINE_RLC:
		hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
		break;

	case KGD_ENGINE_SDMA1:
		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
		break;

	case KGD_ENGINE_SDMA2:
		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
		break;

	default:
		return 0;
	}

	if (hdr == NULL)
		return 0;

	/* Only 12 bits in use */
	return hdr->common.ucode_version;
}

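/*
 * Program the page table base for a KFD VMID in both the GFX and MM
 * hubs; non-KFD VMIDs are rejected with an error message.
 */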
static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* TODO: take advantage of per-process address space size. For
	 * now, all processes share the same address space size, like
	 * on GFX8 and older.
	 */
	mmhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base);

	gfxhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base);
}