1 /* 2 * Copyright 2023 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "kfd_debug.h" 24 #include "kfd_device_queue_manager.h" 25 #include <linux/file.h> 26 #include <uapi/linux/kfd_ioctl.h> 27 28 #define MAX_WATCH_ADDRESSES 4 29 30 int kfd_dbg_ev_query_debug_event(struct kfd_process *process, 31 unsigned int *queue_id, 32 unsigned int *gpu_id, 33 uint64_t exception_clear_mask, 34 uint64_t *event_status) 35 { 36 struct process_queue_manager *pqm; 37 struct process_queue_node *pqn; 38 int i; 39 40 if (!(process && process->debug_trap_enabled)) 41 return -ENODATA; 42 43 mutex_lock(&process->event_mutex); 44 *event_status = 0; 45 *queue_id = 0; 46 *gpu_id = 0; 47 48 /* find and report queue events */ 49 pqm = &process->pqm; 50 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 51 uint64_t tmp = process->exception_enable_mask; 52 53 if (!pqn->q) 54 continue; 55 56 tmp &= pqn->q->properties.exception_status; 57 58 if (!tmp) 59 continue; 60 61 *event_status = pqn->q->properties.exception_status; 62 *queue_id = pqn->q->properties.queue_id; 63 *gpu_id = pqn->q->device->id; 64 pqn->q->properties.exception_status &= ~exception_clear_mask; 65 goto out; 66 } 67 68 /* find and report device events */ 69 for (i = 0; i < process->n_pdds; i++) { 70 struct kfd_process_device *pdd = process->pdds[i]; 71 uint64_t tmp = process->exception_enable_mask 72 & pdd->exception_status; 73 74 if (!tmp) 75 continue; 76 77 *event_status = pdd->exception_status; 78 *gpu_id = pdd->dev->id; 79 pdd->exception_status &= ~exception_clear_mask; 80 goto out; 81 } 82 83 /* report process events */ 84 if (process->exception_enable_mask & process->exception_status) { 85 *event_status = process->exception_status; 86 process->exception_status &= ~exception_clear_mask; 87 } 88 89 out: 90 mutex_unlock(&process->event_mutex); 91 return *event_status ? 0 : -EAGAIN; 92 } 93 94 void debug_event_write_work_handler(struct work_struct *work) 95 { 96 struct kfd_process *process; 97 98 static const char write_data = '.'; 99 loff_t pos = 0; 100 101 process = container_of(work, 102 struct kfd_process, 103 debug_event_workarea); 104 105 kernel_write(process->dbg_ev_file, &write_data, 1, &pos); 106 } 107 108 /* update process/device/queue exception status, write to descriptor 109 * only if exception_status is enabled. 110 */ 111 bool kfd_dbg_ev_raise(uint64_t event_mask, 112 struct kfd_process *process, struct kfd_node *dev, 113 unsigned int source_id, bool use_worker, 114 void *exception_data, size_t exception_data_size) 115 { 116 struct process_queue_manager *pqm; 117 struct process_queue_node *pqn; 118 int i; 119 static const char write_data = '.'; 120 loff_t pos = 0; 121 bool is_subscribed = true; 122 123 if (!(process && process->debug_trap_enabled)) 124 return false; 125 126 mutex_lock(&process->event_mutex); 127 128 if (event_mask & KFD_EC_MASK_DEVICE) { 129 for (i = 0; i < process->n_pdds; i++) { 130 struct kfd_process_device *pdd = process->pdds[i]; 131 132 if (pdd->dev != dev) 133 continue; 134 135 pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE; 136 137 if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 138 if (!pdd->vm_fault_exc_data) { 139 pdd->vm_fault_exc_data = kmemdup( 140 exception_data, 141 exception_data_size, 142 GFP_KERNEL); 143 if (!pdd->vm_fault_exc_data) 144 pr_debug("Failed to allocate exception data memory"); 145 } else { 146 pr_debug("Debugger exception data not saved\n"); 147 print_hex_dump_bytes("exception data: ", 148 DUMP_PREFIX_OFFSET, 149 exception_data, 150 exception_data_size); 151 } 152 } 153 break; 154 } 155 } else if (event_mask & KFD_EC_MASK_PROCESS) { 156 process->exception_status |= event_mask & KFD_EC_MASK_PROCESS; 157 } else { 158 pqm = &process->pqm; 159 list_for_each_entry(pqn, &pqm->queues, 160 process_queue_list) { 161 int target_id; 162 163 if (!pqn->q) 164 continue; 165 166 target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ? 167 pqn->q->properties.queue_id : 168 pqn->q->doorbell_id; 169 170 if (pqn->q->device != dev || target_id != source_id) 171 continue; 172 173 pqn->q->properties.exception_status |= event_mask; 174 break; 175 } 176 } 177 178 if (process->exception_enable_mask & event_mask) { 179 if (use_worker) 180 schedule_work(&process->debug_event_workarea); 181 else 182 kernel_write(process->dbg_ev_file, 183 &write_data, 184 1, 185 &pos); 186 } else { 187 is_subscribed = false; 188 } 189 190 mutex_unlock(&process->event_mutex); 191 192 return is_subscribed; 193 } 194 195 /* set pending event queue entry from ring entry */ 196 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, 197 unsigned int pasid, 198 uint32_t doorbell_id, 199 uint64_t trap_mask, 200 void *exception_data, 201 size_t exception_data_size) 202 { 203 struct kfd_process *p; 204 bool signaled_to_debugger_or_runtime = false; 205 206 p = kfd_lookup_process_by_pasid(pasid); 207 208 if (!p) 209 return false; 210 211 if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true, 212 exception_data, exception_data_size)) { 213 struct process_queue_manager *pqm; 214 struct process_queue_node *pqn; 215 216 if (!!(trap_mask & KFD_EC_MASK_QUEUE) && 217 p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) { 218 mutex_lock(&p->mutex); 219 220 pqm = &p->pqm; 221 list_for_each_entry(pqn, &pqm->queues, 222 process_queue_list) { 223 224 if (!(pqn->q && pqn->q->device == dev && 225 pqn->q->doorbell_id == doorbell_id)) 226 continue; 227 228 kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id, 229 trap_mask); 230 231 signaled_to_debugger_or_runtime = true; 232 233 break; 234 } 235 236 mutex_unlock(&p->mutex); 237 } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 238 kfd_dqm_evict_pasid(dev->dqm, p->pasid); 239 kfd_signal_vm_fault_event(dev, p->pasid, NULL, 240 exception_data); 241 242 signaled_to_debugger_or_runtime = true; 243 } 244 } else { 245 signaled_to_debugger_or_runtime = true; 246 } 247 248 kfd_unref_process(p); 249 250 return signaled_to_debugger_or_runtime; 251 } 252 253 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, 254 unsigned int dev_id, 255 unsigned int queue_id, 256 uint64_t error_reason) 257 { 258 if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 259 struct kfd_process_device *pdd = NULL; 260 struct kfd_hsa_memory_exception_data *data; 261 int i; 262 263 for (i = 0; i < p->n_pdds; i++) { 264 if (p->pdds[i]->dev->id == dev_id) { 265 pdd = p->pdds[i]; 266 break; 267 } 268 } 269 270 if (!pdd) 271 return -ENODEV; 272 273 data = (struct kfd_hsa_memory_exception_data *) 274 pdd->vm_fault_exc_data; 275 276 kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid); 277 kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data); 278 error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); 279 } 280 281 if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) { 282 /* 283 * block should only happen after the debugger receives runtime 284 * enable notice. 285 */ 286 up(&p->runtime_enable_sema); 287 error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME); 288 } 289 290 if (error_reason) 291 return kfd_send_exception_to_runtime(p, queue_id, error_reason); 292 293 return 0; 294 } 295 296 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable) 297 { 298 struct mqd_update_info minfo = {0}; 299 int err; 300 301 if (!q) 302 return 0; 303 304 if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) || 305 KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0)) 306 return 0; 307 308 if (enable && q->properties.is_user_cu_masked) 309 return -EBUSY; 310 311 minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE; 312 313 q->properties.is_dbg_wa = enable; 314 err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo); 315 if (err) 316 q->properties.is_dbg_wa = false; 317 318 return err; 319 } 320 321 static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable) 322 { 323 struct process_queue_manager *pqm = &target->pqm; 324 struct process_queue_node *pqn; 325 int r = 0; 326 327 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 328 r = kfd_dbg_set_queue_workaround(pqn->q, enable); 329 if (enable && r) 330 goto unwind; 331 } 332 333 return 0; 334 335 unwind: 336 list_for_each_entry(pqn, &pqm->queues, process_queue_list) 337 kfd_dbg_set_queue_workaround(pqn->q, false); 338 339 if (enable) 340 target->runtime_info.runtime_state = r == -EBUSY ? 341 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 342 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 343 344 return r; 345 } 346 347 int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd) 348 { 349 uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; 350 uint32_t flags = pdd->process->dbg_flags; 351 352 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 353 return 0; 354 355 return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, 356 pdd->watch_points, flags); 357 } 358 359 #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1 360 static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id) 361 { 362 int i; 363 364 *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID; 365 366 spin_lock(&pdd->dev->kfd->watch_points_lock); 367 368 for (i = 0; i < MAX_WATCH_ADDRESSES; i++) { 369 /* device watchpoint in use so skip */ 370 if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1) 371 continue; 372 373 pdd->alloc_watch_ids |= 0x1 << i; 374 pdd->dev->kfd->alloc_watch_ids |= 0x1 << i; 375 *watch_id = i; 376 spin_unlock(&pdd->dev->kfd->watch_points_lock); 377 return 0; 378 } 379 380 spin_unlock(&pdd->dev->kfd->watch_points_lock); 381 382 return -ENOMEM; 383 } 384 385 static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id) 386 { 387 spin_lock(&pdd->dev->kfd->watch_points_lock); 388 389 /* process owns device watch point so safe to clear */ 390 if ((pdd->alloc_watch_ids >> watch_id) & 0x1) { 391 pdd->alloc_watch_ids &= ~(0x1 << watch_id); 392 pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id); 393 } 394 395 spin_unlock(&pdd->dev->kfd->watch_points_lock); 396 } 397 398 static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id) 399 { 400 bool owns_watch_id = false; 401 402 spin_lock(&pdd->dev->kfd->watch_points_lock); 403 owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && 404 ((pdd->alloc_watch_ids >> watch_id) & 0x1); 405 406 spin_unlock(&pdd->dev->kfd->watch_points_lock); 407 408 return owns_watch_id; 409 } 410 411 int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd, 412 uint32_t watch_id) 413 { 414 int r; 415 416 if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id)) 417 return -EINVAL; 418 419 if (!pdd->dev->kfd->shared_resources.enable_mes) { 420 r = debug_lock_and_unmap(pdd->dev->dqm); 421 if (r) 422 return r; 423 } 424 425 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 426 pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch( 427 pdd->dev->adev, 428 watch_id); 429 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 430 431 if (!pdd->dev->kfd->shared_resources.enable_mes) 432 r = debug_map_and_unlock(pdd->dev->dqm); 433 else 434 r = kfd_dbg_set_mes_debug_mode(pdd); 435 436 kfd_dbg_clear_dev_watch_id(pdd, watch_id); 437 438 return r; 439 } 440 441 int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd, 442 uint64_t watch_address, 443 uint32_t watch_address_mask, 444 uint32_t *watch_id, 445 uint32_t watch_mode) 446 { 447 int r = kfd_dbg_get_dev_watch_id(pdd, watch_id); 448 449 if (r) 450 return r; 451 452 if (!pdd->dev->kfd->shared_resources.enable_mes) { 453 r = debug_lock_and_unmap(pdd->dev->dqm); 454 if (r) { 455 kfd_dbg_clear_dev_watch_id(pdd, *watch_id); 456 return r; 457 } 458 } 459 460 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 461 pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch( 462 pdd->dev->adev, 463 watch_address, 464 watch_address_mask, 465 *watch_id, 466 watch_mode, 467 pdd->dev->vm_info.last_vmid_kfd); 468 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 469 470 if (!pdd->dev->kfd->shared_resources.enable_mes) 471 r = debug_map_and_unlock(pdd->dev->dqm); 472 else 473 r = kfd_dbg_set_mes_debug_mode(pdd); 474 475 /* HWS is broken so no point in HW rollback but release the watchpoint anyways */ 476 if (r) 477 kfd_dbg_clear_dev_watch_id(pdd, *watch_id); 478 479 return 0; 480 } 481 482 static void kfd_dbg_clear_process_address_watch(struct kfd_process *target) 483 { 484 int i, j; 485 486 for (i = 0; i < target->n_pdds; i++) 487 for (j = 0; j < MAX_WATCH_ADDRESSES; j++) 488 kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j); 489 } 490 491 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) 492 { 493 uint32_t prev_flags = target->dbg_flags; 494 int i, r = 0, rewind_count = 0; 495 496 for (i = 0; i < target->n_pdds; i++) { 497 if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) && 498 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { 499 *flags = prev_flags; 500 return -EACCES; 501 } 502 } 503 504 target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP; 505 *flags = prev_flags; 506 for (i = 0; i < target->n_pdds; i++) { 507 struct kfd_process_device *pdd = target->pdds[i]; 508 509 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 510 continue; 511 512 if (!pdd->dev->kfd->shared_resources.enable_mes) 513 r = debug_refresh_runlist(pdd->dev->dqm); 514 else 515 r = kfd_dbg_set_mes_debug_mode(pdd); 516 517 if (r) { 518 target->dbg_flags = prev_flags; 519 break; 520 } 521 522 rewind_count++; 523 } 524 525 /* Rewind flags */ 526 if (r) { 527 target->dbg_flags = prev_flags; 528 529 for (i = 0; i < rewind_count; i++) { 530 struct kfd_process_device *pdd = target->pdds[i]; 531 532 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 533 continue; 534 535 if (!pdd->dev->kfd->shared_resources.enable_mes) 536 debug_refresh_runlist(pdd->dev->dqm); 537 else 538 kfd_dbg_set_mes_debug_mode(pdd); 539 } 540 } 541 542 return r; 543 } 544 545 /* kfd_dbg_trap_deactivate: 546 * target: target process 547 * unwind: If this is unwinding a failed kfd_dbg_trap_enable() 548 * unwind_count: 549 * If unwind == true, how far down the pdd list we need 550 * to unwind 551 * else: ignored 552 */ 553 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count) 554 { 555 int i; 556 557 if (!unwind) { 558 uint32_t flags = 0; 559 560 cancel_work_sync(&target->debug_event_workarea); 561 kfd_dbg_clear_process_address_watch(target); 562 kfd_dbg_trap_set_wave_launch_mode(target, 0); 563 564 kfd_dbg_trap_set_flags(target, &flags); 565 } 566 567 for (i = 0; i < target->n_pdds; i++) { 568 struct kfd_process_device *pdd = target->pdds[i]; 569 570 /* If this is an unwind, and we have unwound the required 571 * enable calls on the pdd list, we need to stop now 572 * otherwise we may mess up another debugger session. 573 */ 574 if (unwind && i == unwind_count) 575 break; 576 577 kfd_process_set_trap_debug_flag(&pdd->qpd, false); 578 579 /* GFX off is already disabled by debug activate if not RLC restore supported. */ 580 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 581 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 582 pdd->spi_dbg_override = 583 pdd->dev->kfd2kgd->disable_debug_trap( 584 pdd->dev->adev, 585 target->runtime_info.ttmp_setup, 586 pdd->dev->vm_info.last_vmid_kfd); 587 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 588 589 if (!kfd_dbg_is_per_vmid_supported(pdd->dev) && 590 release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd)) 591 pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id); 592 593 if (!pdd->dev->kfd->shared_resources.enable_mes) 594 debug_refresh_runlist(pdd->dev->dqm); 595 else 596 kfd_dbg_set_mes_debug_mode(pdd); 597 } 598 599 kfd_dbg_set_workaround(target, false); 600 601 if (!unwind) { 602 int resume_count = resume_queues(target, 0, NULL); 603 604 if (resume_count) 605 pr_debug("Resumed %d queues\n", resume_count); 606 } 607 } 608 609 static void kfd_dbg_clean_exception_status(struct kfd_process *target) 610 { 611 struct process_queue_manager *pqm; 612 struct process_queue_node *pqn; 613 int i; 614 615 for (i = 0; i < target->n_pdds; i++) { 616 struct kfd_process_device *pdd = target->pdds[i]; 617 618 kfd_process_drain_interrupts(pdd); 619 620 pdd->exception_status = 0; 621 } 622 623 pqm = &target->pqm; 624 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 625 if (!pqn->q) 626 continue; 627 628 pqn->q->properties.exception_status = 0; 629 } 630 631 target->exception_status = 0; 632 } 633 634 int kfd_dbg_trap_disable(struct kfd_process *target) 635 { 636 if (!target->debug_trap_enabled) 637 return 0; 638 639 /* 640 * Defer deactivation to runtime if runtime not enabled otherwise reset 641 * attached running target runtime state to enable for re-attach. 642 */ 643 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 644 kfd_dbg_trap_deactivate(target, false, 0); 645 else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) 646 target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; 647 648 fput(target->dbg_ev_file); 649 target->dbg_ev_file = NULL; 650 651 if (target->debugger_process) { 652 atomic_dec(&target->debugger_process->debugged_process_count); 653 target->debugger_process = NULL; 654 } 655 656 target->debug_trap_enabled = false; 657 kfd_dbg_clean_exception_status(target); 658 kfd_unref_process(target); 659 660 return 0; 661 } 662 663 int kfd_dbg_trap_activate(struct kfd_process *target) 664 { 665 int i, r = 0; 666 667 r = kfd_dbg_set_workaround(target, true); 668 if (r) 669 return r; 670 671 for (i = 0; i < target->n_pdds; i++) { 672 struct kfd_process_device *pdd = target->pdds[i]; 673 674 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) { 675 r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd); 676 677 if (r) { 678 target->runtime_info.runtime_state = (r == -EBUSY) ? 679 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 680 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 681 682 goto unwind_err; 683 } 684 } 685 686 /* Disable GFX OFF to prevent garbage read/writes to debug registers. 687 * If RLC restore of debug registers is not supported and runtime enable 688 * hasn't done so already on ttmp setup request, restore the trap config registers. 689 * 690 * If RLC restore of debug registers is not supported, keep gfx off disabled for 691 * the debug session. 692 */ 693 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 694 if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) || 695 target->runtime_info.ttmp_setup)) 696 pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true, 697 pdd->dev->vm_info.last_vmid_kfd); 698 699 pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap( 700 pdd->dev->adev, 701 false, 702 pdd->dev->vm_info.last_vmid_kfd); 703 704 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 705 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 706 707 /* 708 * Setting the debug flag in the trap handler requires that the TMA has been 709 * allocated, which occurs during CWSR initialization. 710 * In the event that CWSR has not been initialized at this point, setting the 711 * flag will be called again during CWSR initialization if the target process 712 * is still debug enabled. 713 */ 714 kfd_process_set_trap_debug_flag(&pdd->qpd, true); 715 716 if (!pdd->dev->kfd->shared_resources.enable_mes) 717 r = debug_refresh_runlist(pdd->dev->dqm); 718 else 719 r = kfd_dbg_set_mes_debug_mode(pdd); 720 721 if (r) { 722 target->runtime_info.runtime_state = 723 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 724 goto unwind_err; 725 } 726 } 727 728 return 0; 729 730 unwind_err: 731 /* Enabling debug failed, we need to disable on 732 * all GPUs so the enable is all or nothing. 733 */ 734 kfd_dbg_trap_deactivate(target, true, i); 735 return r; 736 } 737 738 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, 739 void __user *runtime_info, uint32_t *runtime_size) 740 { 741 struct file *f; 742 uint32_t copy_size; 743 int i, r = 0; 744 745 if (target->debug_trap_enabled) 746 return -EALREADY; 747 748 /* Enable pre-checks */ 749 for (i = 0; i < target->n_pdds; i++) { 750 struct kfd_process_device *pdd = target->pdds[i]; 751 752 if (!KFD_IS_SOC15(pdd->dev)) 753 return -ENODEV; 754 755 if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws) 756 return -EBUSY; 757 } 758 759 copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info)); 760 761 f = fget(fd); 762 if (!f) { 763 pr_err("Failed to get file for (%i)\n", fd); 764 return -EBADF; 765 } 766 767 target->dbg_ev_file = f; 768 769 /* defer activation to runtime if not runtime enabled */ 770 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 771 kfd_dbg_trap_activate(target); 772 773 /* We already hold the process reference but hold another one for the 774 * debug session. 775 */ 776 kref_get(&target->ref); 777 target->debug_trap_enabled = true; 778 779 if (target->debugger_process) 780 atomic_inc(&target->debugger_process->debugged_process_count); 781 782 if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) { 783 kfd_dbg_trap_deactivate(target, false, 0); 784 r = -EFAULT; 785 } 786 787 *runtime_size = sizeof(target->runtime_info); 788 789 return r; 790 } 791 792 static int kfd_dbg_validate_trap_override_request(struct kfd_process *p, 793 uint32_t trap_override, 794 uint32_t trap_mask_request, 795 uint32_t *trap_mask_supported) 796 { 797 int i = 0; 798 799 *trap_mask_supported = 0xffffffff; 800 801 for (i = 0; i < p->n_pdds; i++) { 802 struct kfd_process_device *pdd = p->pdds[i]; 803 int err = pdd->dev->kfd2kgd->validate_trap_override_request( 804 pdd->dev->adev, 805 trap_override, 806 trap_mask_supported); 807 808 if (err) 809 return err; 810 } 811 812 if (trap_mask_request & ~*trap_mask_supported) 813 return -EACCES; 814 815 return 0; 816 } 817 818 int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target, 819 uint32_t trap_override, 820 uint32_t trap_mask_bits, 821 uint32_t trap_mask_request, 822 uint32_t *trap_mask_prev, 823 uint32_t *trap_mask_supported) 824 { 825 int r = 0, i; 826 827 r = kfd_dbg_validate_trap_override_request(target, 828 trap_override, 829 trap_mask_request, 830 trap_mask_supported); 831 832 if (r) 833 return r; 834 835 for (i = 0; i < target->n_pdds; i++) { 836 struct kfd_process_device *pdd = target->pdds[i]; 837 838 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 839 pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override( 840 pdd->dev->adev, 841 pdd->dev->vm_info.last_vmid_kfd, 842 trap_override, 843 trap_mask_bits, 844 trap_mask_request, 845 trap_mask_prev, 846 pdd->spi_dbg_override); 847 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 848 849 if (!pdd->dev->kfd->shared_resources.enable_mes) 850 r = debug_refresh_runlist(pdd->dev->dqm); 851 else 852 r = kfd_dbg_set_mes_debug_mode(pdd); 853 854 if (r) 855 break; 856 } 857 858 return r; 859 } 860 861 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target, 862 uint8_t wave_launch_mode) 863 { 864 int r = 0, i; 865 866 if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL && 867 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT && 868 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG) 869 return -EINVAL; 870 871 for (i = 0; i < target->n_pdds; i++) { 872 struct kfd_process_device *pdd = target->pdds[i]; 873 874 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 875 pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode( 876 pdd->dev->adev, 877 wave_launch_mode, 878 pdd->dev->vm_info.last_vmid_kfd); 879 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 880 881 if (!pdd->dev->kfd->shared_resources.enable_mes) 882 r = debug_refresh_runlist(pdd->dev->dqm); 883 else 884 r = kfd_dbg_set_mes_debug_mode(pdd); 885 886 if (r) 887 break; 888 } 889 890 return r; 891 } 892 893 int kfd_dbg_trap_query_exception_info(struct kfd_process *target, 894 uint32_t source_id, 895 uint32_t exception_code, 896 bool clear_exception, 897 void __user *info, 898 uint32_t *info_size) 899 { 900 bool found = false; 901 int r = 0; 902 uint32_t copy_size, actual_info_size = 0; 903 uint64_t *exception_status_ptr = NULL; 904 905 if (!target) 906 return -EINVAL; 907 908 if (!info || !info_size) 909 return -EINVAL; 910 911 mutex_lock(&target->event_mutex); 912 913 if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) { 914 /* Per queue exceptions */ 915 struct queue *queue = NULL; 916 int i; 917 918 for (i = 0; i < target->n_pdds; i++) { 919 struct kfd_process_device *pdd = target->pdds[i]; 920 struct qcm_process_device *qpd = &pdd->qpd; 921 922 list_for_each_entry(queue, &qpd->queues_list, list) { 923 if (!found && queue->properties.queue_id == source_id) { 924 found = true; 925 break; 926 } 927 } 928 if (found) 929 break; 930 } 931 932 if (!found) { 933 r = -EINVAL; 934 goto out; 935 } 936 937 if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) { 938 r = -ENODATA; 939 goto out; 940 } 941 exception_status_ptr = &queue->properties.exception_status; 942 } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) { 943 /* Per device exceptions */ 944 struct kfd_process_device *pdd = NULL; 945 int i; 946 947 for (i = 0; i < target->n_pdds; i++) { 948 pdd = target->pdds[i]; 949 if (pdd->dev->id == source_id) { 950 found = true; 951 break; 952 } 953 } 954 955 if (!found) { 956 r = -EINVAL; 957 goto out; 958 } 959 960 if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) { 961 r = -ENODATA; 962 goto out; 963 } 964 965 if (exception_code == EC_DEVICE_MEMORY_VIOLATION) { 966 copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size); 967 968 if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) { 969 r = -EFAULT; 970 goto out; 971 } 972 actual_info_size = pdd->vm_fault_exc_data_size; 973 if (clear_exception) { 974 kfree(pdd->vm_fault_exc_data); 975 pdd->vm_fault_exc_data = NULL; 976 pdd->vm_fault_exc_data_size = 0; 977 } 978 } 979 exception_status_ptr = &pdd->exception_status; 980 } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) { 981 /* Per process exceptions */ 982 if (!(target->exception_status & KFD_EC_MASK(exception_code))) { 983 r = -ENODATA; 984 goto out; 985 } 986 987 if (exception_code == EC_PROCESS_RUNTIME) { 988 copy_size = min((size_t)(*info_size), sizeof(target->runtime_info)); 989 990 if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) { 991 r = -EFAULT; 992 goto out; 993 } 994 995 actual_info_size = sizeof(target->runtime_info); 996 } 997 998 exception_status_ptr = &target->exception_status; 999 } else { 1000 pr_debug("Bad exception type [%i]\n", exception_code); 1001 r = -EINVAL; 1002 goto out; 1003 } 1004 1005 *info_size = actual_info_size; 1006 if (clear_exception) 1007 *exception_status_ptr &= ~KFD_EC_MASK(exception_code); 1008 out: 1009 mutex_unlock(&target->event_mutex); 1010 return r; 1011 } 1012 1013 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target, 1014 uint64_t exception_set_mask) 1015 { 1016 uint64_t found_mask = 0; 1017 struct process_queue_manager *pqm; 1018 struct process_queue_node *pqn; 1019 static const char write_data = '.'; 1020 loff_t pos = 0; 1021 int i; 1022 1023 mutex_lock(&target->event_mutex); 1024 1025 found_mask |= target->exception_status; 1026 1027 pqm = &target->pqm; 1028 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 1029 if (!pqn) 1030 continue; 1031 1032 found_mask |= pqn->q->properties.exception_status; 1033 } 1034 1035 for (i = 0; i < target->n_pdds; i++) { 1036 struct kfd_process_device *pdd = target->pdds[i]; 1037 1038 found_mask |= pdd->exception_status; 1039 } 1040 1041 if (exception_set_mask & found_mask) 1042 kernel_write(target->dbg_ev_file, &write_data, 1, &pos); 1043 1044 target->exception_enable_mask = exception_set_mask; 1045 1046 mutex_unlock(&target->event_mutex); 1047 } 1048