xref: /openbmc/linux/drivers/gpu/drm/amd/amdkfd/kfd_debug.c (revision d32fd6bb9f2bc8178cdd65ebec1ad670a8bfa241)
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "kfd_debug.h"
24 #include "kfd_device_queue_manager.h"
25 #include "kfd_topology.h"
26 #include <linux/file.h>
27 #include <uapi/linux/kfd_ioctl.h>
28 
29 #define MAX_WATCH_ADDRESSES	4
30 
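/* Report the first pending exception that the debugger has subscribed to,
 * checking queue events first, then per-device events, then per-process
 * events. Bits selected by exception_clear_mask are cleared on report;
 * returns -EAGAIN when nothing is pending.
 */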
31 int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
32 		      unsigned int *queue_id,
33 		      unsigned int *gpu_id,
34 		      uint64_t exception_clear_mask,
35 		      uint64_t *event_status)
36 {
37 	struct process_queue_manager *pqm;
38 	struct process_queue_node *pqn;
39 	int i;
40 
41 	if (!(process && process->debug_trap_enabled))
42 		return -ENODATA;
43 
44 	mutex_lock(&process->event_mutex);
45 	*event_status = 0;
46 	*queue_id = 0;
47 	*gpu_id = 0;
48 
49 	/* find and report queue events */
50 	pqm = &process->pqm;
51 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
52 		uint64_t tmp = process->exception_enable_mask;
53 
54 		if (!pqn->q)
55 			continue;
56 
57 		tmp &= pqn->q->properties.exception_status;
58 
59 		if (!tmp)
60 			continue;
61 
62 		*event_status = pqn->q->properties.exception_status;
63 		*queue_id = pqn->q->properties.queue_id;
64 		*gpu_id = pqn->q->device->id;
65 		pqn->q->properties.exception_status &= ~exception_clear_mask;
66 		goto out;
67 	}
68 
69 	/* find and report device events */
70 	for (i = 0; i < process->n_pdds; i++) {
71 		struct kfd_process_device *pdd = process->pdds[i];
72 		uint64_t tmp = process->exception_enable_mask
73 						& pdd->exception_status;
74 
75 		if (!tmp)
76 			continue;
77 
78 		*event_status = pdd->exception_status;
79 		*gpu_id = pdd->dev->id;
80 		pdd->exception_status &= ~exception_clear_mask;
81 		goto out;
82 	}
83 
84 	/* report process events */
85 	if (process->exception_enable_mask & process->exception_status) {
86 		*event_status = process->exception_status;
87 		process->exception_status &= ~exception_clear_mask;
88 	}
89 
90 out:
91 	mutex_unlock(&process->event_mutex);
92 	return *event_status ? 0 : -EAGAIN;
93 }
94 
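/* Deferred notification work: wake the debugger by writing a single byte
 * to the debug event file descriptor from work queue context.
 */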
95 void debug_event_write_work_handler(struct work_struct *work)
96 {
97 	struct kfd_process *process;
98 
99 	static const char write_data = '.';
100 	loff_t pos = 0;
101 
102 	process = container_of(work,
103 			struct kfd_process,
104 			debug_event_workarea);
105 
106 	if (process->debug_trap_enabled && process->dbg_ev_file)
107 		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
108 }
109 
110 /* Update process/device/queue exception status and write to the event
111  * descriptor only if the exception is enabled in exception_enable_mask.
112  */
113 bool kfd_dbg_ev_raise(uint64_t event_mask,
114 			struct kfd_process *process, struct kfd_node *dev,
115 			unsigned int source_id, bool use_worker,
116 			void *exception_data, size_t exception_data_size)
117 {
118 	struct process_queue_manager *pqm;
119 	struct process_queue_node *pqn;
120 	int i;
121 	static const char write_data = '.';
122 	loff_t pos = 0;
123 	bool is_subscribed = true;
124 
125 	if (!(process && process->debug_trap_enabled))
126 		return false;
127 
128 	mutex_lock(&process->event_mutex);
129 
130 	if (event_mask & KFD_EC_MASK_DEVICE) {
131 		for (i = 0; i < process->n_pdds; i++) {
132 			struct kfd_process_device *pdd = process->pdds[i];
133 
134 			if (pdd->dev != dev)
135 				continue;
136 
137 			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
138 
139 			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
140 				if (!pdd->vm_fault_exc_data) {
141 					pdd->vm_fault_exc_data = kmemdup(
142 							exception_data,
143 							exception_data_size,
144 							GFP_KERNEL);
145 					if (!pdd->vm_fault_exc_data)
146 						pr_debug("Failed to allocate exception data memory\n");
147 				} else {
148 					pr_debug("Debugger exception data not saved\n");
149 					print_hex_dump_bytes("exception data: ",
150 							DUMP_PREFIX_OFFSET,
151 							exception_data,
152 							exception_data_size);
153 				}
154 			}
155 			break;
156 		}
157 	} else if (event_mask & KFD_EC_MASK_PROCESS) {
158 		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
159 	} else {
160 		pqm = &process->pqm;
161 		list_for_each_entry(pqn, &pqm->queues,
162 				process_queue_list) {
163 			int target_id;
164 
165 			if (!pqn->q)
166 				continue;
167 
168 			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
169 					pqn->q->properties.queue_id :
170 							pqn->q->doorbell_id;
171 
172 			if (pqn->q->device != dev || target_id != source_id)
173 				continue;
174 
175 			pqn->q->properties.exception_status |= event_mask;
176 			break;
177 		}
178 	}
179 
180 	if (process->exception_enable_mask & event_mask) {
181 		if (use_worker)
182 			schedule_work(&process->debug_event_workarea);
183 		else
184 			kernel_write(process->dbg_ev_file,
185 					&write_data,
186 					1,
187 					&pos);
188 	} else {
189 		is_subscribed = false;
190 	}
191 
192 	mutex_unlock(&process->event_mutex);
193 
194 	return is_subscribed;
195 }
196 
197 /* set pending event queue entry from ring entry  */
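/* When the debugger is not attached or has not subscribed to the exception,
 * queue exceptions are forwarded to the HSA runtime (if it is enabled) and
 * device memory violations are escalated to a GPU VM fault instead.
 */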
198 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
199 				   unsigned int pasid,
200 				   uint32_t doorbell_id,
201 				   uint64_t trap_mask,
202 				   void *exception_data,
203 				   size_t exception_data_size)
204 {
205 	struct kfd_process *p;
206 	bool signaled_to_debugger_or_runtime = false;
207 
208 	p = kfd_lookup_process_by_pasid(pasid);
209 
210 	if (!p)
211 		return false;
212 
213 	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
214 			      exception_data, exception_data_size)) {
215 		struct process_queue_manager *pqm;
216 		struct process_queue_node *pqn;
217 
218 		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
219 		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
220 			mutex_lock(&p->mutex);
221 
222 			pqm = &p->pqm;
223 			list_for_each_entry(pqn, &pqm->queues,
224 							process_queue_list) {
225 
226 				if (!(pqn->q && pqn->q->device == dev &&
227 				      pqn->q->doorbell_id == doorbell_id))
228 					continue;
229 
230 				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
231 							      trap_mask);
232 
233 				signaled_to_debugger_or_runtime = true;
234 
235 				break;
236 			}
237 
238 			mutex_unlock(&p->mutex);
239 		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
240 			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
241 			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
242 							exception_data);
243 
244 			signaled_to_debugger_or_runtime = true;
245 		}
246 	} else {
247 		signaled_to_debugger_or_runtime = true;
248 	}
249 
250 	kfd_unref_process(p);
251 
252 	return signaled_to_debugger_or_runtime;
253 }
254 
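/* Hand exceptions held by the debugger back to the runtime: replay a saved
 * memory violation as a VM fault, release the runtime enable semaphore for
 * EC_PROCESS_RUNTIME and forward any remaining reasons to the runtime via
 * kfd_send_exception_to_runtime().
 */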
255 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
256 					unsigned int dev_id,
257 					unsigned int queue_id,
258 					uint64_t error_reason)
259 {
260 	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
261 		struct kfd_process_device *pdd = NULL;
262 		struct kfd_hsa_memory_exception_data *data;
263 		int i;
264 
265 		for (i = 0; i < p->n_pdds; i++) {
266 			if (p->pdds[i]->dev->id == dev_id) {
267 				pdd = p->pdds[i];
268 				break;
269 			}
270 		}
271 
272 		if (!pdd)
273 			return -ENODEV;
274 
275 		data = (struct kfd_hsa_memory_exception_data *)
276 						pdd->vm_fault_exc_data;
277 
278 		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
279 		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
280 		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
281 	}
282 
283 	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
284 		/*
285 		 * Blocking here should only happen after the debugger has
286 		 * received the runtime enable notice.
287 		 */
288 		up(&p->runtime_enable_sema);
289 		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
290 	}
291 
292 	if (error_reason)
293 		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
294 
295 	return 0;
296 }
297 
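/* Toggle the CWSR debug workaround on a single queue through an MQD update.
 * Enabling fails with -EBUSY if the user has already applied a CU mask to
 * the queue.
 */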
298 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
299 {
300 	struct mqd_update_info minfo = {0};
301 	int err;
302 
303 	if (!q)
304 		return 0;
305 
306 	if (!kfd_dbg_has_cwsr_workaround(q->device))
307 		return 0;
308 
309 	if (enable && q->properties.is_user_cu_masked)
310 		return -EBUSY;
311 
312 	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
313 
314 	q->properties.is_dbg_wa = enable;
315 	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
316 	if (err)
317 		q->properties.is_dbg_wa = false;
318 
319 	return err;
320 }
321 
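/* Apply or remove the CWSR debug workaround on all queues of the target
 * process; a failed enable unwinds the queues already updated and records
 * a busy or error runtime state.
 */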
322 static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
323 {
324 	struct process_queue_manager *pqm = &target->pqm;
325 	struct process_queue_node *pqn;
326 	int r = 0;
327 
328 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
329 		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
330 		if (enable && r)
331 			goto unwind;
332 	}
333 
334 	return 0;
335 
336 unwind:
337 	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
338 		kfd_dbg_set_queue_workaround(pqn->q, false);
339 
340 	if (enable)
341 		target->runtime_info.runtime_state = r == -EBUSY ?
342 				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
343 				DEBUG_RUNTIME_STATE_ENABLED_ERROR;
344 
345 	return r;
346 }
347 
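/* Push the SPI debug control, watch points and debug flags for this process
 * device to the MES firmware, allocating the process context BO on first
 * use. A no-op on devices without per-VMID debug support.
 */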
348 int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
349 {
350 	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
351 	uint32_t flags = pdd->process->dbg_flags;
352 	struct amdgpu_device *adev = pdd->dev->adev;
353 	int r;
354 
355 	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
356 		return 0;
357 
358 	if (!pdd->proc_ctx_cpu_ptr) {
359 		r = amdgpu_amdkfd_alloc_gtt_mem(adev,
360 						AMDGPU_MES_PROC_CTX_SIZE,
361 						&pdd->proc_ctx_bo,
362 						&pdd->proc_ctx_gpu_addr,
363 						&pdd->proc_ctx_cpu_ptr,
364 						false);
365 		if (r) {
366 			dev_err(adev->dev,
367 				"failed to allocate process context bo\n");
368 			return r;
369 		}
370 		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
371 	}
372 
373 	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
374 						pdd->watch_points, flags, sq_trap_en);
375 }
376 
377 #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
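/* Reserve a free device watch point ID for this process; fails with
 * -ENOMEM once all MAX_WATCH_ADDRESSES slots are in use.
 */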
378 static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
379 {
380 	int i;
381 
382 	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
383 
384 	spin_lock(&pdd->dev->kfd->watch_points_lock);
385 
386 	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
387 		/* device watchpoint in use so skip */
388 		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
389 			continue;
390 
391 		pdd->alloc_watch_ids |= 0x1 << i;
392 		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
393 		*watch_id = i;
394 		spin_unlock(&pdd->dev->kfd->watch_points_lock);
395 		return 0;
396 	}
397 
398 	spin_unlock(&pdd->dev->kfd->watch_points_lock);
399 
400 	return -ENOMEM;
401 }
402 
403 static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
404 {
405 	spin_lock(&pdd->dev->kfd->watch_points_lock);
406 
407 	/* process owns device watch point so safe to clear */
408 	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
409 		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
410 		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
411 	}
412 
413 	spin_unlock(&pdd->dev->kfd->watch_points_lock);
414 }
415 
416 static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
417 {
418 	bool owns_watch_id = false;
419 
420 	spin_lock(&pdd->dev->kfd->watch_points_lock);
421 	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
422 			((pdd->alloc_watch_ids >> watch_id) & 0x1);
423 
424 	spin_unlock(&pdd->dev->kfd->watch_points_lock);
425 
426 	return owns_watch_id;
427 }
428 
429 int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
430 					uint32_t watch_id)
431 {
432 	int r;
433 
434 	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
435 		return -EINVAL;
436 
437 	if (!pdd->dev->kfd->shared_resources.enable_mes) {
438 		r = debug_lock_and_unmap(pdd->dev->dqm);
439 		if (r)
440 			return r;
441 	}
442 
443 	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
444 	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
445 							pdd->dev->adev,
446 							watch_id);
447 	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
448 
449 	if (!pdd->dev->kfd->shared_resources.enable_mes)
450 		r = debug_map_and_unlock(pdd->dev->dqm);
451 	else
452 		r = kfd_dbg_set_mes_debug_mode(pdd, true);
453 
454 	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
455 
456 	return r;
457 }
458 
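/* Program a newly reserved address watch point on every XCC instance.
 * Without MES, queues are unmapped and remapped around the register
 * writes, and GFX OFF is held off while the watch registers are set.
 */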
459 int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
460 					uint64_t watch_address,
461 					uint32_t watch_address_mask,
462 					uint32_t *watch_id,
463 					uint32_t watch_mode)
464 {
465 	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
466 	uint32_t xcc_mask = pdd->dev->xcc_mask;
467 
468 	if (r)
469 		return r;
470 
471 	if (!pdd->dev->kfd->shared_resources.enable_mes) {
472 		r = debug_lock_and_unmap(pdd->dev->dqm);
473 		if (r) {
474 			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
475 			return r;
476 		}
477 	}
478 
479 	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
480 	for_each_inst(xcc_id, xcc_mask)
481 		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
482 				pdd->dev->adev,
483 				watch_address,
484 				watch_address_mask,
485 				*watch_id,
486 				watch_mode,
487 				pdd->dev->vm_info.last_vmid_kfd,
488 				xcc_id);
489 	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
490 
491 	if (!pdd->dev->kfd->shared_resources.enable_mes)
492 		r = debug_map_and_unlock(pdd->dev->dqm);
493 	else
494 		r = kfd_dbg_set_mes_debug_mode(pdd, true);
495 
496 	/* HWS is broken, so there is no point in a HW rollback, but release the watchpoint anyway */
497 	if (r)
498 		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
499 
500 	return 0;
501 }
502 
503 static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
504 {
505 	int i, j;
506 
507 	for (i = 0; i < target->n_pdds; i++)
508 		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
509 			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
510 }
511 
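/* Update the process debug flags (currently only SINGLE_MEM_OP) and push
 * them to every per-VMID capable device, returning the previous flags in
 * *flags. Devices already refreshed are rewound if any update fails.
 */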
512 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
513 {
514 	uint32_t prev_flags = target->dbg_flags;
515 	int i, r = 0, rewind_count = 0;
516 
517 	for (i = 0; i < target->n_pdds; i++) {
518 		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
519 			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
520 			*flags = prev_flags;
521 			return -EACCES;
522 		}
523 	}
524 
525 	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
526 	*flags = prev_flags;
527 	for (i = 0; i < target->n_pdds; i++) {
528 		struct kfd_process_device *pdd = target->pdds[i];
529 
530 		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
531 			continue;
532 
533 		if (!pdd->dev->kfd->shared_resources.enable_mes)
534 			r = debug_refresh_runlist(pdd->dev->dqm);
535 		else
536 			r = kfd_dbg_set_mes_debug_mode(pdd, true);
537 
538 		if (r) {
539 			target->dbg_flags = prev_flags;
540 			break;
541 		}
542 
543 		rewind_count++;
544 	}
545 
546 	/* Rewind flags */
547 	if (r) {
548 		target->dbg_flags = prev_flags;
549 
550 		for (i = 0; i < rewind_count; i++) {
551 			struct kfd_process_device *pdd = target->pdds[i];
552 
553 			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
554 				continue;
555 
556 			if (!pdd->dev->kfd->shared_resources.enable_mes)
557 				debug_refresh_runlist(pdd->dev->dqm);
558 			else
559 				kfd_dbg_set_mes_debug_mode(pdd, true);
560 		}
561 	}
562 
563 	return r;
564 }
565 
566 /* kfd_dbg_trap_deactivate:
567  *	target: target process
568  *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
569  *	unwind_count:
570  *		If unwind == true, how far down the pdd list we need
571  *				to unwind
572  *		else: ignored
573  */
574 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
575 {
576 	int i;
577 
578 	if (!unwind) {
579 		uint32_t flags = 0;
580 		int resume_count = resume_queues(target, 0, NULL);
581 
582 		if (resume_count)
583 			pr_debug("Resumed %d queues\n", resume_count);
584 
585 		cancel_work_sync(&target->debug_event_workarea);
586 		kfd_dbg_clear_process_address_watch(target);
587 		kfd_dbg_trap_set_wave_launch_mode(target, 0);
588 
589 		kfd_dbg_trap_set_flags(target, &flags);
590 	}
591 
592 	for (i = 0; i < target->n_pdds; i++) {
593 		struct kfd_process_device *pdd = target->pdds[i];
594 
595 		/* If this is an unwind, and we have unwound the required
596 		 * enable calls on the pdd list, we need to stop now
597 		 * otherwise we may mess up another debugger session.
598 		 */
599 		if (unwind && i == unwind_count)
600 			break;
601 
602 		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
603 
604 		/* GFX off was already disabled by debug activate if RLC restore is not supported. */
605 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
606 			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
607 		pdd->spi_dbg_override =
608 				pdd->dev->kfd2kgd->disable_debug_trap(
609 				pdd->dev->adev,
610 				target->runtime_info.ttmp_setup,
611 				pdd->dev->vm_info.last_vmid_kfd);
612 		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
613 
614 		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
615 				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
616 			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
617 
618 		if (!pdd->dev->kfd->shared_resources.enable_mes)
619 			debug_refresh_runlist(pdd->dev->dqm);
620 		else
621 			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
622 	}
623 
624 	kfd_dbg_set_workaround(target, false);
625 }
626 
627 static void kfd_dbg_clean_exception_status(struct kfd_process *target)
628 {
629 	struct process_queue_manager *pqm;
630 	struct process_queue_node *pqn;
631 	int i;
632 
633 	for (i = 0; i < target->n_pdds; i++) {
634 		struct kfd_process_device *pdd = target->pdds[i];
635 
636 		kfd_process_drain_interrupts(pdd);
637 
638 		pdd->exception_status = 0;
639 	}
640 
641 	pqm = &target->pqm;
642 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
643 		if (!pqn->q)
644 			continue;
645 
646 		pqn->q->properties.exception_status = 0;
647 	}
648 
649 	target->exception_status = 0;
650 }
651 
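/* End the debug session: deactivate trap handling if the runtime is
 * enabled, release the event file descriptor, clear outstanding exception
 * status and drop the extra process reference taken on enable.
 */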
652 int kfd_dbg_trap_disable(struct kfd_process *target)
653 {
654 	if (!target->debug_trap_enabled)
655 		return 0;
656 
657 	/*
658 	 * Defer deactivation to runtime enable if the runtime is not enabled;
659 	 * otherwise reset the attached running target's runtime state to enabled for re-attach.
660 	 */
661 	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
662 		kfd_dbg_trap_deactivate(target, false, 0);
663 	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
664 		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
665 
666 	cancel_work_sync(&target->debug_event_workarea);
667 	fput(target->dbg_ev_file);
668 	target->dbg_ev_file = NULL;
669 
670 	if (target->debugger_process) {
671 		atomic_dec(&target->debugger_process->debugged_process_count);
672 		target->debugger_process = NULL;
673 	}
674 
675 	target->debug_trap_enabled = false;
676 	kfd_dbg_clean_exception_status(target);
677 	kfd_unref_process(target);
678 
679 	return 0;
680 }
681 
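/* Activate trap handling on every device of the target process: apply the
 * CWSR workaround, reserve a debug VMID where per-VMID debugging is not
 * supported, enable the SPI debug trap override and refresh the runlist or
 * MES debug mode. Unwinds already-enabled devices on failure.
 */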
682 int kfd_dbg_trap_activate(struct kfd_process *target)
683 {
684 	int i, r = 0;
685 
686 	r = kfd_dbg_set_workaround(target, true);
687 	if (r)
688 		return r;
689 
690 	for (i = 0; i < target->n_pdds; i++) {
691 		struct kfd_process_device *pdd = target->pdds[i];
692 
693 		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
694 			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
695 
696 			if (r) {
697 				target->runtime_info.runtime_state = (r == -EBUSY) ?
698 							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
699 							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
700 
701 				goto unwind_err;
702 			}
703 		}
704 
705 		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
706 		 * If RLC restore of debug registers is not supported and runtime enable
707 		 * hasn't done so already on ttmp setup request, restore the trap config registers.
708 		 *
709 		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
710 		 * the debug session.
711 		 */
712 		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
713 		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
714 						target->runtime_info.ttmp_setup))
715 			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
716 								pdd->dev->vm_info.last_vmid_kfd);
717 
718 		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
719 					pdd->dev->adev,
720 					false,
721 					pdd->dev->vm_info.last_vmid_kfd);
722 
723 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
724 			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
725 
726 		/*
727 		 * Setting the debug flag in the trap handler requires that the TMA has been
728 		 * allocated, which occurs during CWSR initialization.
729 		 * In the event that CWSR has not been initialized at this point, setting the
730 		 * flag will be called again during CWSR initialization if the target process
731 		 * is still debug enabled.
732 		 */
733 		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
734 
735 		if (!pdd->dev->kfd->shared_resources.enable_mes)
736 			r = debug_refresh_runlist(pdd->dev->dqm);
737 		else
738 			r = kfd_dbg_set_mes_debug_mode(pdd, true);
739 
740 		if (r) {
741 			target->runtime_info.runtime_state =
742 					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
743 			goto unwind_err;
744 		}
745 	}
746 
747 	return 0;
748 
749 unwind_err:
750 	/* Enabling debug failed, so disable it on
751 	 * all GPUs to keep the enable all or nothing.
752 	 */
753 	kfd_dbg_trap_deactivate(target, true, i);
754 	return r;
755 }
756 
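/* Attach the debugger to the target process: validate the attached devices,
 * take a reference on the event file descriptor, activate trap handling if
 * the runtime is already enabled and copy the runtime info back to user
 * space.
 */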
757 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
758 			void __user *runtime_info, uint32_t *runtime_size)
759 {
760 	struct file *f;
761 	uint32_t copy_size;
762 	int i, r = 0;
763 
764 	if (target->debug_trap_enabled)
765 		return -EALREADY;
766 
767 	/* Enable pre-checks */
768 	for (i = 0; i < target->n_pdds; i++) {
769 		struct kfd_process_device *pdd = target->pdds[i];
770 
771 		if (!KFD_IS_SOC15(pdd->dev))
772 			return -ENODEV;
773 
774 		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
775 					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
776 			return -EBUSY;
777 	}
778 
779 	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
780 
781 	f = fget(fd);
782 	if (!f) {
783 		pr_err("Failed to get file for (%i)\n", fd);
784 		return -EBADF;
785 	}
786 
787 	target->dbg_ev_file = f;
788 
789 	/* defer activation to runtime if not runtime enabled */
790 	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
791 		kfd_dbg_trap_activate(target);
792 
793 	/* We already hold the process reference but hold another one for the
794 	 * debug session.
795 	 */
796 	kref_get(&target->ref);
797 	target->debug_trap_enabled = true;
798 
799 	if (target->debugger_process)
800 		atomic_inc(&target->debugger_process->debugged_process_count);
801 
802 	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
803 		kfd_dbg_trap_deactivate(target, false, 0);
804 		r = -EFAULT;
805 	}
806 
807 	*runtime_size = sizeof(target->runtime_info);
808 
809 	return r;
810 }
811 
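/* Check a requested wave launch trap override against what every device of
 * the process supports; -EACCES if any requested mask bit is unsupported.
 */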
812 static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
813 						uint32_t trap_override,
814 						uint32_t trap_mask_request,
815 						uint32_t *trap_mask_supported)
816 {
817 	int i = 0;
818 
819 	*trap_mask_supported = 0xffffffff;
820 
821 	for (i = 0; i < p->n_pdds; i++) {
822 		struct kfd_process_device *pdd = p->pdds[i];
823 		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
824 								pdd->dev->adev,
825 								trap_override,
826 								trap_mask_supported);
827 
828 		if (err)
829 			return err;
830 	}
831 
832 	if (trap_mask_request & ~*trap_mask_supported)
833 		return -EACCES;
834 
835 	return 0;
836 }
837 
838 int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
839 					uint32_t trap_override,
840 					uint32_t trap_mask_bits,
841 					uint32_t trap_mask_request,
842 					uint32_t *trap_mask_prev,
843 					uint32_t *trap_mask_supported)
844 {
845 	int r = 0, i;
846 
847 	r = kfd_dbg_validate_trap_override_request(target,
848 						trap_override,
849 						trap_mask_request,
850 						trap_mask_supported);
851 
852 	if (r)
853 		return r;
854 
855 	for (i = 0; i < target->n_pdds; i++) {
856 		struct kfd_process_device *pdd = target->pdds[i];
857 
858 		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
859 		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
860 				pdd->dev->adev,
861 				pdd->dev->vm_info.last_vmid_kfd,
862 				trap_override,
863 				trap_mask_bits,
864 				trap_mask_request,
865 				trap_mask_prev,
866 				pdd->spi_dbg_override);
867 		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
868 
869 		if (!pdd->dev->kfd->shared_resources.enable_mes)
870 			r = debug_refresh_runlist(pdd->dev->dqm);
871 		else
872 			r = kfd_dbg_set_mes_debug_mode(pdd, true);
873 
874 		if (r)
875 			break;
876 	}
877 
878 	return r;
879 }
880 
881 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
882 					uint8_t wave_launch_mode)
883 {
884 	int r = 0, i;
885 
886 	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
887 			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
888 			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
889 		return -EINVAL;
890 
891 	for (i = 0; i < target->n_pdds; i++) {
892 		struct kfd_process_device *pdd = target->pdds[i];
893 
894 		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
895 		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
896 				pdd->dev->adev,
897 				wave_launch_mode,
898 				pdd->dev->vm_info.last_vmid_kfd);
899 		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
900 
901 		if (!pdd->dev->kfd->shared_resources.enable_mes)
902 			r = debug_refresh_runlist(pdd->dev->dqm);
903 		else
904 			r = kfd_dbg_set_mes_debug_mode(pdd, true);
905 
906 		if (r)
907 			break;
908 	}
909 
910 	return r;
911 }
912 
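/* Copy detailed information for a single raised exception (queue, device
 * or process scope) to user space and optionally clear its status bit.
 */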
913 int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
914 		uint32_t source_id,
915 		uint32_t exception_code,
916 		bool clear_exception,
917 		void __user *info,
918 		uint32_t *info_size)
919 {
920 	bool found = false;
921 	int r = 0;
922 	uint32_t copy_size, actual_info_size = 0;
923 	uint64_t *exception_status_ptr = NULL;
924 
925 	if (!target)
926 		return -EINVAL;
927 
928 	if (!info || !info_size)
929 		return -EINVAL;
930 
931 	mutex_lock(&target->event_mutex);
932 
933 	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
934 		/* Per queue exceptions */
935 		struct queue *queue = NULL;
936 		int i;
937 
938 		for (i = 0; i < target->n_pdds; i++) {
939 			struct kfd_process_device *pdd = target->pdds[i];
940 			struct qcm_process_device *qpd = &pdd->qpd;
941 
942 			list_for_each_entry(queue, &qpd->queues_list, list) {
943 				if (!found && queue->properties.queue_id == source_id) {
944 					found = true;
945 					break;
946 				}
947 			}
948 			if (found)
949 				break;
950 		}
951 
952 		if (!found) {
953 			r = -EINVAL;
954 			goto out;
955 		}
956 
957 		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
958 			r = -ENODATA;
959 			goto out;
960 		}
961 		exception_status_ptr = &queue->properties.exception_status;
962 	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
963 		/* Per device exceptions */
964 		struct kfd_process_device *pdd = NULL;
965 		int i;
966 
967 		for (i = 0; i < target->n_pdds; i++) {
968 			pdd = target->pdds[i];
969 			if (pdd->dev->id == source_id) {
970 				found = true;
971 				break;
972 			}
973 		}
974 
975 		if (!found) {
976 			r = -EINVAL;
977 			goto out;
978 		}
979 
980 		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
981 			r = -ENODATA;
982 			goto out;
983 		}
984 
985 		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
986 			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
987 
988 			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
989 				r = -EFAULT;
990 				goto out;
991 			}
992 			actual_info_size = pdd->vm_fault_exc_data_size;
993 			if (clear_exception) {
994 				kfree(pdd->vm_fault_exc_data);
995 				pdd->vm_fault_exc_data = NULL;
996 				pdd->vm_fault_exc_data_size = 0;
997 			}
998 		}
999 		exception_status_ptr = &pdd->exception_status;
1000 	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
1001 		/* Per process exceptions */
1002 		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
1003 			r = -ENODATA;
1004 			goto out;
1005 		}
1006 
1007 		if (exception_code == EC_PROCESS_RUNTIME) {
1008 			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
1009 
1010 			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
1011 				r = -EFAULT;
1012 				goto out;
1013 			}
1014 
1015 			actual_info_size = sizeof(target->runtime_info);
1016 		}
1017 
1018 		exception_status_ptr = &target->exception_status;
1019 	} else {
1020 		pr_debug("Bad exception type [%i]\n", exception_code);
1021 		r = -EINVAL;
1022 		goto out;
1023 	}
1024 
1025 	*info_size = actual_info_size;
1026 	if (clear_exception)
1027 		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
1028 out:
1029 	mutex_unlock(&target->event_mutex);
1030 	return r;
1031 }
1032 
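/* Fill one snapshot entry per GPU attached to the target process and copy
 * the entries to user space, truncated to the caller's entry size, while
 * optionally clearing the reported per-device exception bits.
 */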
1033 int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
1034 		uint64_t exception_clear_mask,
1035 		void __user *user_info,
1036 		uint32_t *number_of_device_infos,
1037 		uint32_t *entry_size)
1038 {
1039 	struct kfd_dbg_device_info_entry device_info;
1040 	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
1041 	int i, r = 0;
1042 
1043 	if (!(target && user_info && number_of_device_infos && entry_size))
1044 		return -EINVAL;
1045 
1046 	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
1047 	*number_of_device_infos = target->n_pdds;
1048 	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
1049 
1050 	if (!tmp_num_devices)
1051 		return 0;
1052 
1053 	memset(&device_info, 0, sizeof(device_info));
1054 
1055 	mutex_lock(&target->event_mutex);
1056 
1057 	/* Iterate over all pdds of the process */
1058 	for (i = 0; i < tmp_num_devices; i++) {
1059 		struct kfd_process_device *pdd = target->pdds[i];
1060 		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
1061 
1062 		device_info.gpu_id = pdd->dev->id;
1063 		device_info.exception_status = pdd->exception_status;
1064 		device_info.lds_base = pdd->lds_base;
1065 		device_info.lds_limit = pdd->lds_limit;
1066 		device_info.scratch_base = pdd->scratch_base;
1067 		device_info.scratch_limit = pdd->scratch_limit;
1068 		device_info.gpuvm_base = pdd->gpuvm_base;
1069 		device_info.gpuvm_limit = pdd->gpuvm_limit;
1070 		device_info.location_id = topo_dev->node_props.location_id;
1071 		device_info.vendor_id = topo_dev->node_props.vendor_id;
1072 		device_info.device_id = topo_dev->node_props.device_id;
1073 		device_info.revision_id = pdd->dev->adev->pdev->revision;
1074 		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
1075 		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
1076 		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
1077 		device_info.gfx_target_version =
1078 			topo_dev->node_props.gfx_target_version;
1079 		device_info.simd_count = topo_dev->node_props.simd_count;
1080 		device_info.max_waves_per_simd =
1081 			topo_dev->node_props.max_waves_per_simd;
1082 		device_info.array_count = topo_dev->node_props.array_count;
1083 		device_info.simd_arrays_per_engine =
1084 			topo_dev->node_props.simd_arrays_per_engine;
1085 		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
1086 		device_info.capability = topo_dev->node_props.capability;
1087 		device_info.debug_prop = topo_dev->node_props.debug_prop;
1088 
1089 		if (exception_clear_mask)
1090 			pdd->exception_status &= ~exception_clear_mask;
1091 
1092 		if (copy_to_user(user_info, &device_info, *entry_size)) {
1093 			r = -EFAULT;
1094 			break;
1095 		}
1096 
1097 		user_info += tmp_entry_size;
1098 	}
1099 
1100 	mutex_unlock(&target->event_mutex);
1101 
1102 	return r;
1103 }
1104 
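/* Update the mask of exceptions the debugger is subscribed to and notify
 * it immediately if any already-raised exception matches the new mask.
 */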
1105 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
1106 					uint64_t exception_set_mask)
1107 {
1108 	uint64_t found_mask = 0;
1109 	struct process_queue_manager *pqm;
1110 	struct process_queue_node *pqn;
1111 	static const char write_data = '.';
1112 	loff_t pos = 0;
1113 	int i;
1114 
1115 	mutex_lock(&target->event_mutex);
1116 
1117 	found_mask |= target->exception_status;
1118 
1119 	pqm = &target->pqm;
1120 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
1121 		if (!pqn->q)
1122 			continue;
1123 
1124 		found_mask |= pqn->q->properties.exception_status;
1125 	}
1126 
1127 	for (i = 0; i < target->n_pdds; i++) {
1128 		struct kfd_process_device *pdd = target->pdds[i];
1129 
1130 		found_mask |= pdd->exception_status;
1131 	}
1132 
1133 	if (exception_set_mask & found_mask)
1134 		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
1135 
1136 	target->exception_enable_mask = exception_set_mask;
1137 
1138 	mutex_unlock(&target->event_mutex);
1139 }
1140