1 /* 2 * Copyright 2023 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "kfd_debug.h" 24 #include "kfd_device_queue_manager.h" 25 #include "kfd_topology.h" 26 #include <linux/file.h> 27 #include <uapi/linux/kfd_ioctl.h> 28 29 #define MAX_WATCH_ADDRESSES 4 30 31 int kfd_dbg_ev_query_debug_event(struct kfd_process *process, 32 unsigned int *queue_id, 33 unsigned int *gpu_id, 34 uint64_t exception_clear_mask, 35 uint64_t *event_status) 36 { 37 struct process_queue_manager *pqm; 38 struct process_queue_node *pqn; 39 int i; 40 41 if (!(process && process->debug_trap_enabled)) 42 return -ENODATA; 43 44 mutex_lock(&process->event_mutex); 45 *event_status = 0; 46 *queue_id = 0; 47 *gpu_id = 0; 48 49 /* find and report queue events */ 50 pqm = &process->pqm; 51 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 52 uint64_t tmp = process->exception_enable_mask; 53 54 if (!pqn->q) 55 continue; 56 57 tmp &= pqn->q->properties.exception_status; 58 59 if (!tmp) 60 continue; 61 62 *event_status = pqn->q->properties.exception_status; 63 *queue_id = pqn->q->properties.queue_id; 64 *gpu_id = pqn->q->device->id; 65 pqn->q->properties.exception_status &= ~exception_clear_mask; 66 goto out; 67 } 68 69 /* find and report device events */ 70 for (i = 0; i < process->n_pdds; i++) { 71 struct kfd_process_device *pdd = process->pdds[i]; 72 uint64_t tmp = process->exception_enable_mask 73 & pdd->exception_status; 74 75 if (!tmp) 76 continue; 77 78 *event_status = pdd->exception_status; 79 *gpu_id = pdd->dev->id; 80 pdd->exception_status &= ~exception_clear_mask; 81 goto out; 82 } 83 84 /* report process events */ 85 if (process->exception_enable_mask & process->exception_status) { 86 *event_status = process->exception_status; 87 process->exception_status &= ~exception_clear_mask; 88 } 89 90 out: 91 mutex_unlock(&process->event_mutex); 92 return *event_status ? 0 : -EAGAIN; 93 } 94 95 void debug_event_write_work_handler(struct work_struct *work) 96 { 97 struct kfd_process *process; 98 99 static const char write_data = '.'; 100 loff_t pos = 0; 101 102 process = container_of(work, 103 struct kfd_process, 104 debug_event_workarea); 105 106 kernel_write(process->dbg_ev_file, &write_data, 1, &pos); 107 } 108 109 /* update process/device/queue exception status, write to descriptor 110 * only if exception_status is enabled. 111 */ 112 bool kfd_dbg_ev_raise(uint64_t event_mask, 113 struct kfd_process *process, struct kfd_node *dev, 114 unsigned int source_id, bool use_worker, 115 void *exception_data, size_t exception_data_size) 116 { 117 struct process_queue_manager *pqm; 118 struct process_queue_node *pqn; 119 int i; 120 static const char write_data = '.'; 121 loff_t pos = 0; 122 bool is_subscribed = true; 123 124 if (!(process && process->debug_trap_enabled)) 125 return false; 126 127 mutex_lock(&process->event_mutex); 128 129 if (event_mask & KFD_EC_MASK_DEVICE) { 130 for (i = 0; i < process->n_pdds; i++) { 131 struct kfd_process_device *pdd = process->pdds[i]; 132 133 if (pdd->dev != dev) 134 continue; 135 136 pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE; 137 138 if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 139 if (!pdd->vm_fault_exc_data) { 140 pdd->vm_fault_exc_data = kmemdup( 141 exception_data, 142 exception_data_size, 143 GFP_KERNEL); 144 if (!pdd->vm_fault_exc_data) 145 pr_debug("Failed to allocate exception data memory"); 146 } else { 147 pr_debug("Debugger exception data not saved\n"); 148 print_hex_dump_bytes("exception data: ", 149 DUMP_PREFIX_OFFSET, 150 exception_data, 151 exception_data_size); 152 } 153 } 154 break; 155 } 156 } else if (event_mask & KFD_EC_MASK_PROCESS) { 157 process->exception_status |= event_mask & KFD_EC_MASK_PROCESS; 158 } else { 159 pqm = &process->pqm; 160 list_for_each_entry(pqn, &pqm->queues, 161 process_queue_list) { 162 int target_id; 163 164 if (!pqn->q) 165 continue; 166 167 target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ? 168 pqn->q->properties.queue_id : 169 pqn->q->doorbell_id; 170 171 if (pqn->q->device != dev || target_id != source_id) 172 continue; 173 174 pqn->q->properties.exception_status |= event_mask; 175 break; 176 } 177 } 178 179 if (process->exception_enable_mask & event_mask) { 180 if (use_worker) 181 schedule_work(&process->debug_event_workarea); 182 else 183 kernel_write(process->dbg_ev_file, 184 &write_data, 185 1, 186 &pos); 187 } else { 188 is_subscribed = false; 189 } 190 191 mutex_unlock(&process->event_mutex); 192 193 return is_subscribed; 194 } 195 196 /* set pending event queue entry from ring entry */ 197 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, 198 unsigned int pasid, 199 uint32_t doorbell_id, 200 uint64_t trap_mask, 201 void *exception_data, 202 size_t exception_data_size) 203 { 204 struct kfd_process *p; 205 bool signaled_to_debugger_or_runtime = false; 206 207 p = kfd_lookup_process_by_pasid(pasid); 208 209 if (!p) 210 return false; 211 212 if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true, 213 exception_data, exception_data_size)) { 214 struct process_queue_manager *pqm; 215 struct process_queue_node *pqn; 216 217 if (!!(trap_mask & KFD_EC_MASK_QUEUE) && 218 p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) { 219 mutex_lock(&p->mutex); 220 221 pqm = &p->pqm; 222 list_for_each_entry(pqn, &pqm->queues, 223 process_queue_list) { 224 225 if (!(pqn->q && pqn->q->device == dev && 226 pqn->q->doorbell_id == doorbell_id)) 227 continue; 228 229 kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id, 230 trap_mask); 231 232 signaled_to_debugger_or_runtime = true; 233 234 break; 235 } 236 237 mutex_unlock(&p->mutex); 238 } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 239 kfd_dqm_evict_pasid(dev->dqm, p->pasid); 240 kfd_signal_vm_fault_event(dev, p->pasid, NULL, 241 exception_data); 242 243 signaled_to_debugger_or_runtime = true; 244 } 245 } else { 246 signaled_to_debugger_or_runtime = true; 247 } 248 249 kfd_unref_process(p); 250 251 return signaled_to_debugger_or_runtime; 252 } 253 254 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, 255 unsigned int dev_id, 256 unsigned int queue_id, 257 uint64_t error_reason) 258 { 259 if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 260 struct kfd_process_device *pdd = NULL; 261 struct kfd_hsa_memory_exception_data *data; 262 int i; 263 264 for (i = 0; i < p->n_pdds; i++) { 265 if (p->pdds[i]->dev->id == dev_id) { 266 pdd = p->pdds[i]; 267 break; 268 } 269 } 270 271 if (!pdd) 272 return -ENODEV; 273 274 data = (struct kfd_hsa_memory_exception_data *) 275 pdd->vm_fault_exc_data; 276 277 kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid); 278 kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data); 279 error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); 280 } 281 282 if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) { 283 /* 284 * block should only happen after the debugger receives runtime 285 * enable notice. 286 */ 287 up(&p->runtime_enable_sema); 288 error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME); 289 } 290 291 if (error_reason) 292 return kfd_send_exception_to_runtime(p, queue_id, error_reason); 293 294 return 0; 295 } 296 297 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable) 298 { 299 struct mqd_update_info minfo = {0}; 300 int err; 301 302 if (!q) 303 return 0; 304 305 if (!kfd_dbg_has_cwsr_workaround(q->device)) 306 return 0; 307 308 if (enable && q->properties.is_user_cu_masked) 309 return -EBUSY; 310 311 minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE; 312 313 q->properties.is_dbg_wa = enable; 314 err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo); 315 if (err) 316 q->properties.is_dbg_wa = false; 317 318 return err; 319 } 320 321 static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable) 322 { 323 struct process_queue_manager *pqm = &target->pqm; 324 struct process_queue_node *pqn; 325 int r = 0; 326 327 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 328 r = kfd_dbg_set_queue_workaround(pqn->q, enable); 329 if (enable && r) 330 goto unwind; 331 } 332 333 return 0; 334 335 unwind: 336 list_for_each_entry(pqn, &pqm->queues, process_queue_list) 337 kfd_dbg_set_queue_workaround(pqn->q, false); 338 339 if (enable) 340 target->runtime_info.runtime_state = r == -EBUSY ? 341 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 342 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 343 344 return r; 345 } 346 347 int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) 348 { 349 uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; 350 uint32_t flags = pdd->process->dbg_flags; 351 352 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 353 return 0; 354 355 return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, 356 pdd->watch_points, flags, sq_trap_en); 357 } 358 359 #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1 360 static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id) 361 { 362 int i; 363 364 *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID; 365 366 spin_lock(&pdd->dev->kfd->watch_points_lock); 367 368 for (i = 0; i < MAX_WATCH_ADDRESSES; i++) { 369 /* device watchpoint in use so skip */ 370 if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1) 371 continue; 372 373 pdd->alloc_watch_ids |= 0x1 << i; 374 pdd->dev->kfd->alloc_watch_ids |= 0x1 << i; 375 *watch_id = i; 376 spin_unlock(&pdd->dev->kfd->watch_points_lock); 377 return 0; 378 } 379 380 spin_unlock(&pdd->dev->kfd->watch_points_lock); 381 382 return -ENOMEM; 383 } 384 385 static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id) 386 { 387 spin_lock(&pdd->dev->kfd->watch_points_lock); 388 389 /* process owns device watch point so safe to clear */ 390 if ((pdd->alloc_watch_ids >> watch_id) & 0x1) { 391 pdd->alloc_watch_ids &= ~(0x1 << watch_id); 392 pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id); 393 } 394 395 spin_unlock(&pdd->dev->kfd->watch_points_lock); 396 } 397 398 static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id) 399 { 400 bool owns_watch_id = false; 401 402 spin_lock(&pdd->dev->kfd->watch_points_lock); 403 owns_watch_id = watch_id < MAX_WATCH_ADDRESSES && 404 ((pdd->alloc_watch_ids >> watch_id) & 0x1); 405 406 spin_unlock(&pdd->dev->kfd->watch_points_lock); 407 408 return owns_watch_id; 409 } 410 411 int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd, 412 uint32_t watch_id) 413 { 414 int r; 415 416 if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id)) 417 return -EINVAL; 418 419 if (!pdd->dev->kfd->shared_resources.enable_mes) { 420 r = debug_lock_and_unmap(pdd->dev->dqm); 421 if (r) 422 return r; 423 } 424 425 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 426 pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch( 427 pdd->dev->adev, 428 watch_id); 429 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 430 431 if (!pdd->dev->kfd->shared_resources.enable_mes) 432 r = debug_map_and_unlock(pdd->dev->dqm); 433 else 434 r = kfd_dbg_set_mes_debug_mode(pdd, true); 435 436 kfd_dbg_clear_dev_watch_id(pdd, watch_id); 437 438 return r; 439 } 440 441 int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd, 442 uint64_t watch_address, 443 uint32_t watch_address_mask, 444 uint32_t *watch_id, 445 uint32_t watch_mode) 446 { 447 int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id); 448 uint32_t xcc_mask = pdd->dev->xcc_mask; 449 450 if (r) 451 return r; 452 453 if (!pdd->dev->kfd->shared_resources.enable_mes) { 454 r = debug_lock_and_unmap(pdd->dev->dqm); 455 if (r) { 456 kfd_dbg_clear_dev_watch_id(pdd, *watch_id); 457 return r; 458 } 459 } 460 461 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 462 for_each_inst(xcc_id, xcc_mask) 463 pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch( 464 pdd->dev->adev, 465 watch_address, 466 watch_address_mask, 467 *watch_id, 468 watch_mode, 469 pdd->dev->vm_info.last_vmid_kfd, 470 xcc_id); 471 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 472 473 if (!pdd->dev->kfd->shared_resources.enable_mes) 474 r = debug_map_and_unlock(pdd->dev->dqm); 475 else 476 r = kfd_dbg_set_mes_debug_mode(pdd, true); 477 478 /* HWS is broken so no point in HW rollback but release the watchpoint anyways */ 479 if (r) 480 kfd_dbg_clear_dev_watch_id(pdd, *watch_id); 481 482 return 0; 483 } 484 485 static void kfd_dbg_clear_process_address_watch(struct kfd_process *target) 486 { 487 int i, j; 488 489 for (i = 0; i < target->n_pdds; i++) 490 for (j = 0; j < MAX_WATCH_ADDRESSES; j++) 491 kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j); 492 } 493 494 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) 495 { 496 uint32_t prev_flags = target->dbg_flags; 497 int i, r = 0, rewind_count = 0; 498 499 for (i = 0; i < target->n_pdds; i++) { 500 if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) && 501 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { 502 *flags = prev_flags; 503 return -EACCES; 504 } 505 } 506 507 target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP; 508 *flags = prev_flags; 509 for (i = 0; i < target->n_pdds; i++) { 510 struct kfd_process_device *pdd = target->pdds[i]; 511 512 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 513 continue; 514 515 if (!pdd->dev->kfd->shared_resources.enable_mes) 516 r = debug_refresh_runlist(pdd->dev->dqm); 517 else 518 r = kfd_dbg_set_mes_debug_mode(pdd, true); 519 520 if (r) { 521 target->dbg_flags = prev_flags; 522 break; 523 } 524 525 rewind_count++; 526 } 527 528 /* Rewind flags */ 529 if (r) { 530 target->dbg_flags = prev_flags; 531 532 for (i = 0; i < rewind_count; i++) { 533 struct kfd_process_device *pdd = target->pdds[i]; 534 535 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 536 continue; 537 538 if (!pdd->dev->kfd->shared_resources.enable_mes) 539 debug_refresh_runlist(pdd->dev->dqm); 540 else 541 kfd_dbg_set_mes_debug_mode(pdd, true); 542 } 543 } 544 545 return r; 546 } 547 548 /* kfd_dbg_trap_deactivate: 549 * target: target process 550 * unwind: If this is unwinding a failed kfd_dbg_trap_enable() 551 * unwind_count: 552 * If unwind == true, how far down the pdd list we need 553 * to unwind 554 * else: ignored 555 */ 556 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count) 557 { 558 int i; 559 560 if (!unwind) { 561 uint32_t flags = 0; 562 int resume_count = resume_queues(target, 0, NULL); 563 564 if (resume_count) 565 pr_debug("Resumed %d queues\n", resume_count); 566 567 cancel_work_sync(&target->debug_event_workarea); 568 kfd_dbg_clear_process_address_watch(target); 569 kfd_dbg_trap_set_wave_launch_mode(target, 0); 570 571 kfd_dbg_trap_set_flags(target, &flags); 572 } 573 574 for (i = 0; i < target->n_pdds; i++) { 575 struct kfd_process_device *pdd = target->pdds[i]; 576 577 /* If this is an unwind, and we have unwound the required 578 * enable calls on the pdd list, we need to stop now 579 * otherwise we may mess up another debugger session. 580 */ 581 if (unwind && i == unwind_count) 582 break; 583 584 kfd_process_set_trap_debug_flag(&pdd->qpd, false); 585 586 /* GFX off is already disabled by debug activate if not RLC restore supported. */ 587 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 588 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 589 pdd->spi_dbg_override = 590 pdd->dev->kfd2kgd->disable_debug_trap( 591 pdd->dev->adev, 592 target->runtime_info.ttmp_setup, 593 pdd->dev->vm_info.last_vmid_kfd); 594 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 595 596 if (!kfd_dbg_is_per_vmid_supported(pdd->dev) && 597 release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd)) 598 pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id); 599 600 if (!pdd->dev->kfd->shared_resources.enable_mes) 601 debug_refresh_runlist(pdd->dev->dqm); 602 else 603 kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev)); 604 } 605 606 kfd_dbg_set_workaround(target, false); 607 } 608 609 static void kfd_dbg_clean_exception_status(struct kfd_process *target) 610 { 611 struct process_queue_manager *pqm; 612 struct process_queue_node *pqn; 613 int i; 614 615 for (i = 0; i < target->n_pdds; i++) { 616 struct kfd_process_device *pdd = target->pdds[i]; 617 618 kfd_process_drain_interrupts(pdd); 619 620 pdd->exception_status = 0; 621 } 622 623 pqm = &target->pqm; 624 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 625 if (!pqn->q) 626 continue; 627 628 pqn->q->properties.exception_status = 0; 629 } 630 631 target->exception_status = 0; 632 } 633 634 int kfd_dbg_trap_disable(struct kfd_process *target) 635 { 636 if (!target->debug_trap_enabled) 637 return 0; 638 639 /* 640 * Defer deactivation to runtime if runtime not enabled otherwise reset 641 * attached running target runtime state to enable for re-attach. 642 */ 643 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 644 kfd_dbg_trap_deactivate(target, false, 0); 645 else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) 646 target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; 647 648 fput(target->dbg_ev_file); 649 target->dbg_ev_file = NULL; 650 651 if (target->debugger_process) { 652 atomic_dec(&target->debugger_process->debugged_process_count); 653 target->debugger_process = NULL; 654 } 655 656 target->debug_trap_enabled = false; 657 kfd_dbg_clean_exception_status(target); 658 kfd_unref_process(target); 659 660 return 0; 661 } 662 663 int kfd_dbg_trap_activate(struct kfd_process *target) 664 { 665 int i, r = 0; 666 667 r = kfd_dbg_set_workaround(target, true); 668 if (r) 669 return r; 670 671 for (i = 0; i < target->n_pdds; i++) { 672 struct kfd_process_device *pdd = target->pdds[i]; 673 674 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) { 675 r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd); 676 677 if (r) { 678 target->runtime_info.runtime_state = (r == -EBUSY) ? 679 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 680 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 681 682 goto unwind_err; 683 } 684 } 685 686 /* Disable GFX OFF to prevent garbage read/writes to debug registers. 687 * If RLC restore of debug registers is not supported and runtime enable 688 * hasn't done so already on ttmp setup request, restore the trap config registers. 689 * 690 * If RLC restore of debug registers is not supported, keep gfx off disabled for 691 * the debug session. 692 */ 693 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 694 if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) || 695 target->runtime_info.ttmp_setup)) 696 pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true, 697 pdd->dev->vm_info.last_vmid_kfd); 698 699 pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap( 700 pdd->dev->adev, 701 false, 702 pdd->dev->vm_info.last_vmid_kfd); 703 704 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 705 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 706 707 /* 708 * Setting the debug flag in the trap handler requires that the TMA has been 709 * allocated, which occurs during CWSR initialization. 710 * In the event that CWSR has not been initialized at this point, setting the 711 * flag will be called again during CWSR initialization if the target process 712 * is still debug enabled. 713 */ 714 kfd_process_set_trap_debug_flag(&pdd->qpd, true); 715 716 if (!pdd->dev->kfd->shared_resources.enable_mes) 717 r = debug_refresh_runlist(pdd->dev->dqm); 718 else 719 r = kfd_dbg_set_mes_debug_mode(pdd, true); 720 721 if (r) { 722 target->runtime_info.runtime_state = 723 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 724 goto unwind_err; 725 } 726 } 727 728 return 0; 729 730 unwind_err: 731 /* Enabling debug failed, we need to disable on 732 * all GPUs so the enable is all or nothing. 733 */ 734 kfd_dbg_trap_deactivate(target, true, i); 735 return r; 736 } 737 738 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, 739 void __user *runtime_info, uint32_t *runtime_size) 740 { 741 struct file *f; 742 uint32_t copy_size; 743 int i, r = 0; 744 745 if (target->debug_trap_enabled) 746 return -EALREADY; 747 748 /* Enable pre-checks */ 749 for (i = 0; i < target->n_pdds; i++) { 750 struct kfd_process_device *pdd = target->pdds[i]; 751 752 if (!KFD_IS_SOC15(pdd->dev)) 753 return -ENODEV; 754 755 if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) || 756 kfd_dbg_has_cwsr_workaround(pdd->dev))) 757 return -EBUSY; 758 } 759 760 copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info)); 761 762 f = fget(fd); 763 if (!f) { 764 pr_err("Failed to get file for (%i)\n", fd); 765 return -EBADF; 766 } 767 768 target->dbg_ev_file = f; 769 770 /* defer activation to runtime if not runtime enabled */ 771 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 772 kfd_dbg_trap_activate(target); 773 774 /* We already hold the process reference but hold another one for the 775 * debug session. 776 */ 777 kref_get(&target->ref); 778 target->debug_trap_enabled = true; 779 780 if (target->debugger_process) 781 atomic_inc(&target->debugger_process->debugged_process_count); 782 783 if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) { 784 kfd_dbg_trap_deactivate(target, false, 0); 785 r = -EFAULT; 786 } 787 788 *runtime_size = sizeof(target->runtime_info); 789 790 return r; 791 } 792 793 static int kfd_dbg_validate_trap_override_request(struct kfd_process *p, 794 uint32_t trap_override, 795 uint32_t trap_mask_request, 796 uint32_t *trap_mask_supported) 797 { 798 int i = 0; 799 800 *trap_mask_supported = 0xffffffff; 801 802 for (i = 0; i < p->n_pdds; i++) { 803 struct kfd_process_device *pdd = p->pdds[i]; 804 int err = pdd->dev->kfd2kgd->validate_trap_override_request( 805 pdd->dev->adev, 806 trap_override, 807 trap_mask_supported); 808 809 if (err) 810 return err; 811 } 812 813 if (trap_mask_request & ~*trap_mask_supported) 814 return -EACCES; 815 816 return 0; 817 } 818 819 int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target, 820 uint32_t trap_override, 821 uint32_t trap_mask_bits, 822 uint32_t trap_mask_request, 823 uint32_t *trap_mask_prev, 824 uint32_t *trap_mask_supported) 825 { 826 int r = 0, i; 827 828 r = kfd_dbg_validate_trap_override_request(target, 829 trap_override, 830 trap_mask_request, 831 trap_mask_supported); 832 833 if (r) 834 return r; 835 836 for (i = 0; i < target->n_pdds; i++) { 837 struct kfd_process_device *pdd = target->pdds[i]; 838 839 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 840 pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override( 841 pdd->dev->adev, 842 pdd->dev->vm_info.last_vmid_kfd, 843 trap_override, 844 trap_mask_bits, 845 trap_mask_request, 846 trap_mask_prev, 847 pdd->spi_dbg_override); 848 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 849 850 if (!pdd->dev->kfd->shared_resources.enable_mes) 851 r = debug_refresh_runlist(pdd->dev->dqm); 852 else 853 r = kfd_dbg_set_mes_debug_mode(pdd, true); 854 855 if (r) 856 break; 857 } 858 859 return r; 860 } 861 862 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target, 863 uint8_t wave_launch_mode) 864 { 865 int r = 0, i; 866 867 if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL && 868 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT && 869 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG) 870 return -EINVAL; 871 872 for (i = 0; i < target->n_pdds; i++) { 873 struct kfd_process_device *pdd = target->pdds[i]; 874 875 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 876 pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode( 877 pdd->dev->adev, 878 wave_launch_mode, 879 pdd->dev->vm_info.last_vmid_kfd); 880 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 881 882 if (!pdd->dev->kfd->shared_resources.enable_mes) 883 r = debug_refresh_runlist(pdd->dev->dqm); 884 else 885 r = kfd_dbg_set_mes_debug_mode(pdd, true); 886 887 if (r) 888 break; 889 } 890 891 return r; 892 } 893 894 int kfd_dbg_trap_query_exception_info(struct kfd_process *target, 895 uint32_t source_id, 896 uint32_t exception_code, 897 bool clear_exception, 898 void __user *info, 899 uint32_t *info_size) 900 { 901 bool found = false; 902 int r = 0; 903 uint32_t copy_size, actual_info_size = 0; 904 uint64_t *exception_status_ptr = NULL; 905 906 if (!target) 907 return -EINVAL; 908 909 if (!info || !info_size) 910 return -EINVAL; 911 912 mutex_lock(&target->event_mutex); 913 914 if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) { 915 /* Per queue exceptions */ 916 struct queue *queue = NULL; 917 int i; 918 919 for (i = 0; i < target->n_pdds; i++) { 920 struct kfd_process_device *pdd = target->pdds[i]; 921 struct qcm_process_device *qpd = &pdd->qpd; 922 923 list_for_each_entry(queue, &qpd->queues_list, list) { 924 if (!found && queue->properties.queue_id == source_id) { 925 found = true; 926 break; 927 } 928 } 929 if (found) 930 break; 931 } 932 933 if (!found) { 934 r = -EINVAL; 935 goto out; 936 } 937 938 if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) { 939 r = -ENODATA; 940 goto out; 941 } 942 exception_status_ptr = &queue->properties.exception_status; 943 } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) { 944 /* Per device exceptions */ 945 struct kfd_process_device *pdd = NULL; 946 int i; 947 948 for (i = 0; i < target->n_pdds; i++) { 949 pdd = target->pdds[i]; 950 if (pdd->dev->id == source_id) { 951 found = true; 952 break; 953 } 954 } 955 956 if (!found) { 957 r = -EINVAL; 958 goto out; 959 } 960 961 if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) { 962 r = -ENODATA; 963 goto out; 964 } 965 966 if (exception_code == EC_DEVICE_MEMORY_VIOLATION) { 967 copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size); 968 969 if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) { 970 r = -EFAULT; 971 goto out; 972 } 973 actual_info_size = pdd->vm_fault_exc_data_size; 974 if (clear_exception) { 975 kfree(pdd->vm_fault_exc_data); 976 pdd->vm_fault_exc_data = NULL; 977 pdd->vm_fault_exc_data_size = 0; 978 } 979 } 980 exception_status_ptr = &pdd->exception_status; 981 } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) { 982 /* Per process exceptions */ 983 if (!(target->exception_status & KFD_EC_MASK(exception_code))) { 984 r = -ENODATA; 985 goto out; 986 } 987 988 if (exception_code == EC_PROCESS_RUNTIME) { 989 copy_size = min((size_t)(*info_size), sizeof(target->runtime_info)); 990 991 if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) { 992 r = -EFAULT; 993 goto out; 994 } 995 996 actual_info_size = sizeof(target->runtime_info); 997 } 998 999 exception_status_ptr = &target->exception_status; 1000 } else { 1001 pr_debug("Bad exception type [%i]\n", exception_code); 1002 r = -EINVAL; 1003 goto out; 1004 } 1005 1006 *info_size = actual_info_size; 1007 if (clear_exception) 1008 *exception_status_ptr &= ~KFD_EC_MASK(exception_code); 1009 out: 1010 mutex_unlock(&target->event_mutex); 1011 return r; 1012 } 1013 1014 int kfd_dbg_trap_device_snapshot(struct kfd_process *target, 1015 uint64_t exception_clear_mask, 1016 void __user *user_info, 1017 uint32_t *number_of_device_infos, 1018 uint32_t *entry_size) 1019 { 1020 struct kfd_dbg_device_info_entry device_info; 1021 uint32_t tmp_entry_size = *entry_size, tmp_num_devices; 1022 int i, r = 0; 1023 1024 if (!(target && user_info && number_of_device_infos && entry_size)) 1025 return -EINVAL; 1026 1027 tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds); 1028 *number_of_device_infos = target->n_pdds; 1029 *entry_size = min_t(size_t, *entry_size, sizeof(device_info)); 1030 1031 if (!tmp_num_devices) 1032 return 0; 1033 1034 memset(&device_info, 0, sizeof(device_info)); 1035 1036 mutex_lock(&target->event_mutex); 1037 1038 /* Run over all pdd of the process */ 1039 for (i = 0; i < tmp_num_devices; i++) { 1040 struct kfd_process_device *pdd = target->pdds[i]; 1041 struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id); 1042 1043 device_info.gpu_id = pdd->dev->id; 1044 device_info.exception_status = pdd->exception_status; 1045 device_info.lds_base = pdd->lds_base; 1046 device_info.lds_limit = pdd->lds_limit; 1047 device_info.scratch_base = pdd->scratch_base; 1048 device_info.scratch_limit = pdd->scratch_limit; 1049 device_info.gpuvm_base = pdd->gpuvm_base; 1050 device_info.gpuvm_limit = pdd->gpuvm_limit; 1051 device_info.location_id = topo_dev->node_props.location_id; 1052 device_info.vendor_id = topo_dev->node_props.vendor_id; 1053 device_info.device_id = topo_dev->node_props.device_id; 1054 device_info.revision_id = pdd->dev->adev->pdev->revision; 1055 device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor; 1056 device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device; 1057 device_info.fw_version = pdd->dev->kfd->mec_fw_version; 1058 device_info.gfx_target_version = 1059 topo_dev->node_props.gfx_target_version; 1060 device_info.simd_count = topo_dev->node_props.simd_count; 1061 device_info.max_waves_per_simd = 1062 topo_dev->node_props.max_waves_per_simd; 1063 device_info.array_count = topo_dev->node_props.array_count; 1064 device_info.simd_arrays_per_engine = 1065 topo_dev->node_props.simd_arrays_per_engine; 1066 device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask); 1067 device_info.capability = topo_dev->node_props.capability; 1068 device_info.debug_prop = topo_dev->node_props.debug_prop; 1069 1070 if (exception_clear_mask) 1071 pdd->exception_status &= ~exception_clear_mask; 1072 1073 if (copy_to_user(user_info, &device_info, *entry_size)) { 1074 r = -EFAULT; 1075 break; 1076 } 1077 1078 user_info += tmp_entry_size; 1079 } 1080 1081 mutex_unlock(&target->event_mutex); 1082 1083 return r; 1084 } 1085 1086 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target, 1087 uint64_t exception_set_mask) 1088 { 1089 uint64_t found_mask = 0; 1090 struct process_queue_manager *pqm; 1091 struct process_queue_node *pqn; 1092 static const char write_data = '.'; 1093 loff_t pos = 0; 1094 int i; 1095 1096 mutex_lock(&target->event_mutex); 1097 1098 found_mask |= target->exception_status; 1099 1100 pqm = &target->pqm; 1101 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 1102 if (!pqn->q) 1103 continue; 1104 1105 found_mask |= pqn->q->properties.exception_status; 1106 } 1107 1108 for (i = 0; i < target->n_pdds; i++) { 1109 struct kfd_process_device *pdd = target->pdds[i]; 1110 1111 found_mask |= pdd->exception_status; 1112 } 1113 1114 if (exception_set_mask & found_mask) 1115 kernel_write(target->dbg_ev_file, &write_data, 1, &pos); 1116 1117 target->exception_enable_mask = exception_set_mask; 1118 1119 mutex_unlock(&target->event_mutex); 1120 } 1121