/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES	4

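/*
 * Report the first pending exception that the debugger has subscribed to via
 * exception_enable_mask, checking queues first, then devices, then the
 * process itself; the bits in exception_clear_mask are cleared on the
 * reporting source.  Returns 0 if an event was reported, -EAGAIN if nothing
 * is pending and -ENODATA if debug trap is not enabled on the process.
 */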
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

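/*
 * Deferred work handler that notifies the polling debugger by writing a
 * single byte to the debug event file descriptor.  Scheduled from
 * kfd_dbg_ev_raise() when the caller asks for the write to be deferred
 * (use_worker).
 */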
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status; write to the event file
 * descriptor only if the raised exception is enabled in the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
						  exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

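/*
 * Forward debugger-acknowledged exceptions to the runtime: a device memory
 * violation replays the saved VM fault data after evicting the PASID's
 * queues, EC_PROCESS_RUNTIME releases the blocked runtime enable wait, and
 * any remaining reasons are sent to the runtime as a queue exception event.
 * Returns -ENODEV if dev_id does not match any device of the process.
 */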
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * Unblocking the runtime enable wait should only happen after
		 * the debugger receives the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

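/*
 * Apply or remove the CWSR debug workaround on every queue of the target
 * process.  A failed enable rolls the workaround back on all queues and
 * records an ENABLED_BUSY/ENABLED_ERROR runtime state for the debugger.
 */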
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

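/*
 * Push the per-process debug state (SPI debug control, watch points and
 * debug flags) to the MES shader debugger on devices with per-VMID debug
 * support, allocating the process context BO on first use.  No-op on
 * devices without per-VMID support.
 */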
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;
	struct amdgpu_device *adev = pdd->dev->adev;
	int r;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	if (!pdd->proc_ctx_cpu_ptr) {
		r = amdgpu_amdkfd_alloc_gtt_mem(adev,
				AMDGPU_MES_PROC_CTX_SIZE,
				&pdd->proc_ctx_bo,
				&pdd->proc_ctx_gpu_addr,
				&pdd->proc_ctx_cpu_ptr,
				false);
		if (r) {
			dev_err(adev->dev,
				"failed to allocate process context bo\n");
			return r;
		}
		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
	}

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
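/*
 * Allocate a free device watch point ID (0..MAX_WATCH_ADDRESSES-1) for this
 * process, marking it in both the per-process and per-device bitmaps under
 * watch_points_lock.  Returns -ENOMEM when all watch points are in use.
 */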
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

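/*
 * Clear a hardware address watch point owned by the calling process.  On
 * non-MES devices the queues are unmapped around the register write;
 * otherwise the MES debug state is refreshed.  The watch ID is released in
 * either case.
 */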
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

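/*
 * Allocate a watch point ID and program the address watch registers on every
 * XCC instance of the device.  On non-MES devices the queues are unmapped
 * around the register writes; on MES devices the debug state is refreshed
 * instead.  On failure the watch ID is released (see note on HWS below).
 */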
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so there is no point in HW rollback, but release the watch point anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

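/*
 * Validate and apply the requested debug flags (currently only
 * KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP) on all per-VMID capable devices and
 * return the previous flags through *flags.  If refreshing any device
 * fails, the previous flags are restored on the devices already updated.
 */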
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

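/*
 * Tear down a debug session: deactivate immediately if the runtime is
 * enabled (otherwise deactivation is deferred to runtime), release the
 * event file descriptor and the extra process reference, and clear all
 * pending exception state.
 */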
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime is not enabled; otherwise
	 * reset the attached, running target's runtime state back to enabled
	 * so that it can be re-attached.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

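/*
 * Enable the debug trap on every device of the target process, reserving a
 * debug VMID where per-VMID debugging is not supported and applying the
 * CWSR workaround first.  Any failure unwinds the devices that were already
 * enabled so that activation is all or nothing.
 */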
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

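/*
 * Enable debugging on the target process: validate the attached devices,
 * install the debugger's polled event file descriptor, activate immediately
 * if the runtime is already enabled (otherwise activation is deferred to
 * runtime enable), and copy the runtime info back to user space.
 */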
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
				pdd->dev->adev,
				trap_override,
				trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

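/*
 * Copy the optional payload of a pending queue, device or process exception
 * to user space (VM fault data for devices, runtime info for the process)
 * and optionally clear the exception.  Returns -ENODATA if the exception is
 * not pending on the given source.
 */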
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

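/*
 * Fill one kfd_dbg_device_info_entry per device for the debugger, clamped
 * to the user-supplied entry size and count; the actual device count and
 * entry size are returned through the pointers.  Bits in
 * exception_clear_mask are cleared from each device as it is reported.
 */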
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdds of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

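/*
 * Update the mask of exceptions the debugger is subscribed to.  If any
 * already-pending queue, device or process exception matches the new mask,
 * the debugger is notified immediately through the event file descriptor.
 */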
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}