/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES	4

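/* Report the next pending debug event for @process.
 *
 * Queue, device and process level exceptions are scanned in that order; the
 * first pending exception that the debugger has enabled is reported through
 * @queue_id, @gpu_id and @event_status, and any bits in @exception_clear_mask
 * are cleared from its status.
 *
 * Returns 0 if an event was reported, -EAGAIN if nothing is pending and
 * -ENODATA if the process is not debug enabled.
 */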
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

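/* Deferred-work variant of the debugger notification: poke the debugger's
 * event file descriptor from process context so it can poll for new events.
 */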
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status; write to the event file
 * descriptor only if the exception is enabled (subscribed) by the debugger.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* Set a pending debug event from an interrupt ring entry.  If the debugger
 * is not subscribed to the raised exception, forward it to the HSA runtime
 * where possible.
 */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
							process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
							exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

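/* Handle an exception that the debugger forwards to the runtime: signal a
 * saved memory violation, unblock a pending runtime-enable request, and send
 * any remaining reason bits to the runtime exception handler.
 */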
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * The runtime is blocked on this semaphore; it should only be
		 * released after the debugger has received the runtime enable
		 * notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

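/* Apply or remove the debug MQD workaround on a single queue (GFX11 only).
 * Fails with -EBUSY on enable if the user has already applied a CU mask,
 * which the workaround would otherwise overwrite.
 */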
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

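/* Apply or remove the debug queue workaround on every queue of the target
 * process.  On an enable failure, roll back the queues already updated and
 * record the resulting runtime state for the debugger.
 */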
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

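/* Push the current per-VMID debug settings (SPI debug control, watch points
 * and debug flags) to the MES scheduler for devices that support per-VMID
 * debugging.
 */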
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
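/* Allocate a free device watch point ID for this process device.
 * Returns -ENOMEM if all watch points on the device are in use.
 */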
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

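/* Clear a device address watch point owned by this process device and
 * propagate the change through HWS runlist remapping or the MES debug path.
 */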
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

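/* Program a device address watch point: allocate a watch ID, write the watch
 * registers with GFX OFF disabled, then refresh the scheduler state through
 * HWS runlist remapping or the MES debug path.
 */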
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd);

	/* HWS is broken at this point, so a HW rollback is pointless; still
	 * release the watch point so the ID can be reused.
	 */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

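/* Update the debug trap flags for the target process.  The previous flags
 * are returned through @flags.  If refreshing any device fails, the devices
 * already updated are rewound to the previous flags.
 */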
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: whether this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count: if unwind is true, how far down the pdd list to unwind;
 *		      ignored otherwise
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind and we have unwound the required
		 * enable calls on the pdd list, we need to stop now,
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX OFF is already disabled by debug activate if RLC restore
		 * is not supported.
		 */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);

	if (!unwind) {
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);
	}
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

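/* Detach the debugger from the target process: deactivate debug traps if the
 * runtime is enabled, release the event file descriptor and the extra process
 * reference, and clear any stale exception status.
 */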
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * If the runtime is not enabled, defer deactivation to runtime enable;
	 * otherwise reset an attached running target's runtime state back to
	 * enabled so that a debugger can re-attach later.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

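/* Enable debug trapping on every device of the target process.  Activation
 * is all or nothing: a failure on any device unwinds the devices already
 * enabled and reports the resulting runtime state.
 */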
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage reads/writes of debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep GFX OFF disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed; disable it on all GPUs so that enabling
	 * remains all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

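/* Attach a debugger to the target process.  The debugger's event file
 * descriptor is recorded for notifications and a snapshot of the runtime
 * info is copied back to user space.  Activation is deferred if the runtime
 * has not been enabled yet.
 */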
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* Defer activation to runtime enable if the runtime is not enabled yet */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

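/* Apply a wave launch trap override on every device of the target process
 * after validating that all devices support the requested trap mask.
 */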
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r)
			break;
	}

	return r;
}

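/* Set the wave launch mode (normal, halt or debug) on every device of the
 * target process and refresh the scheduler state.
 */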
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r)
			break;
	}

	return r;
}

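/* Copy exception information for a queue, device or process level exception
 * to the debugger and optionally clear the exception status.  @info_size is
 * updated with the actual size of the exception data.
 */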
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

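/* Update the set of exceptions the debugger is subscribed to.  If any
 * already-raised exception matches the new mask, notify the debugger
 * immediately through its event file descriptor.
 */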
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}