/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES	4

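/* Find the first queue, device or process (in that order) with a pending
 * exception that the debugger has subscribed to, report its exception status
 * and clear the bits selected by exception_clear_mask.  Returns -ENODATA if
 * the process is not debug enabled and -EAGAIN if no subscribed event is
 * pending.
 */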
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

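/* Deferred work handler that pokes the debugger's polling descriptor by
 * writing a single byte to the debug event file.
 */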
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update the process/device/queue exception status and write to the event
 * descriptor only if the raised exception is enabled in the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* Set a pending debug exception event from an interrupt ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
							process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
							exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

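/* Forward debugger-requested exceptions to the HSA runtime: replay a saved
 * memory violation, unblock a pending runtime enable, and send any remaining
 * exception bits through the runtime event path.
 */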
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * Unblock the runtime enable path; this should only happen
		 * after the debugger has received the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

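/* Toggle the GFX11 CWSR debug workaround on a single queue via an MQD
 * update.  Fails with -EBUSY if the user has already applied a CU mask.
 */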
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

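/* Apply or remove the per-queue debug workaround across all of the target's
 * queues, unwinding all queues on failure when enabling.
 */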
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

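/* Push the current per-process debug settings (SPI debug control, watch
 * points and debug flags) to the MES scheduler.  No-op on devices without
 * per-VMID debug support.
 */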
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;
	bool sq_trap_en = !!spi_dbg_cntl;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
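/* Allocate a free device watch point ID for the process, or return -ENOMEM
 * if all MAX_WATCH_ADDRESSES slots are in use.
 */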
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

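/* Clear a device address watch point owned by the process and release its
 * watch ID.  Queues are unmapped around the register update when MES is not
 * in use; otherwise the MES debug state is refreshed.
 */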
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

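/* Program a device address watch point: allocate a watch ID, update the
 * watch registers with GFX OFF disabled, then refresh the runlist or MES
 * debug state so the change takes effect.
 */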
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

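/* Update the process debug flags (currently only KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)
 * across all per-VMID capable devices, returning the previous flags in *flags
 * and rewinding the devices already updated on failure.
 */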
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *				to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

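/* Drain pending interrupts and clear all device, queue and process exception
 * status bits when the debug session ends.
 */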
static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

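/* Tear down a debug session: deactivate if the runtime is enabled, drop the
 * event file and debugger bookkeeping, clear exception state and release the
 * extra process reference taken at enable time.
 */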
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to the runtime if the runtime is not enabled;
	 * otherwise reset an attached running target's runtime state back to
	 * enabled to allow re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

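/* Activate debug trapping on every device of the target process: apply the
 * queue workaround, reserve a debug VMID where per-VMID state is not
 * supported, enable the SPI debug trap and refresh the runlist or MES state.
 * On failure everything enabled so far is unwound.
 */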
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA
		 * has been allocated, which occurs during CWSR initialization.
		 * If CWSR has not been initialized at this point, the flag will be
		 * set again during CWSR initialization if the target process is
		 * still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

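/* Enable debug trapping for the target process.  Activation is deferred to
 * runtime enable if the runtime is not yet enabled.  The caller's event file
 * descriptor is retained for notification, an extra process reference is
 * taken for the debug session, and the current runtime info is copied back
 * to user space.
 */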
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

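/* Validate and apply a wave launch trap override on all devices of the
 * target process, reporting the previous and supported trap masks to the
 * caller.
 */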
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r)
			break;
	}

	return r;
}

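/* Set the wave launch mode (normal, halt or debug) on all devices of the
 * target process and refresh the scheduler state.
 */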
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r)
			break;
	}

	return r;
}

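/* Copy exception info for a single queue, device or process exception code
 * to user space, optionally clearing the exception.  Memory violation and
 * process runtime exceptions carry additional payload data.
 */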
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

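/* Copy a snapshot of per-device debug info entries to user space, reporting
 * the total number of devices and the entry size actually used, and
 * optionally clearing device exception status bits.
 */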
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Iterate over all pdds of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

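/* Update the set of exceptions the debugger is subscribed to and notify the
 * debugger immediately if any newly subscribed exception is already pending.
 */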
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}