1 /* 2 * Copyright 2023 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "kfd_debug.h" 24 #include "kfd_device_queue_manager.h" 25 #include <linux/file.h> 26 27 void debug_event_write_work_handler(struct work_struct *work) 28 { 29 struct kfd_process *process; 30 31 static const char write_data = '.'; 32 loff_t pos = 0; 33 34 process = container_of(work, 35 struct kfd_process, 36 debug_event_workarea); 37 38 kernel_write(process->dbg_ev_file, &write_data, 1, &pos); 39 } 40 41 /* update process/device/queue exception status, write to descriptor 42 * only if exception_status is enabled. 43 */ 44 bool kfd_dbg_ev_raise(uint64_t event_mask, 45 struct kfd_process *process, struct kfd_node *dev, 46 unsigned int source_id, bool use_worker, 47 void *exception_data, size_t exception_data_size) 48 { 49 struct process_queue_manager *pqm; 50 struct process_queue_node *pqn; 51 int i; 52 static const char write_data = '.'; 53 loff_t pos = 0; 54 bool is_subscribed = true; 55 56 if (!(process && process->debug_trap_enabled)) 57 return false; 58 59 mutex_lock(&process->event_mutex); 60 61 if (event_mask & KFD_EC_MASK_DEVICE) { 62 for (i = 0; i < process->n_pdds; i++) { 63 struct kfd_process_device *pdd = process->pdds[i]; 64 65 if (pdd->dev != dev) 66 continue; 67 68 pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE; 69 70 if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 71 if (!pdd->vm_fault_exc_data) { 72 pdd->vm_fault_exc_data = kmemdup( 73 exception_data, 74 exception_data_size, 75 GFP_KERNEL); 76 if (!pdd->vm_fault_exc_data) 77 pr_debug("Failed to allocate exception data memory"); 78 } else { 79 pr_debug("Debugger exception data not saved\n"); 80 print_hex_dump_bytes("exception data: ", 81 DUMP_PREFIX_OFFSET, 82 exception_data, 83 exception_data_size); 84 } 85 } 86 break; 87 } 88 } else if (event_mask & KFD_EC_MASK_PROCESS) { 89 process->exception_status |= event_mask & KFD_EC_MASK_PROCESS; 90 } else { 91 pqm = &process->pqm; 92 list_for_each_entry(pqn, &pqm->queues, 93 process_queue_list) { 94 int target_id; 95 96 if (!pqn->q) 97 continue; 98 99 target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ? 100 pqn->q->properties.queue_id : 101 pqn->q->doorbell_id; 102 103 if (pqn->q->device != dev || target_id != source_id) 104 continue; 105 106 pqn->q->properties.exception_status |= event_mask; 107 break; 108 } 109 } 110 111 if (process->exception_enable_mask & event_mask) { 112 if (use_worker) 113 schedule_work(&process->debug_event_workarea); 114 else 115 kernel_write(process->dbg_ev_file, 116 &write_data, 117 1, 118 &pos); 119 } else { 120 is_subscribed = false; 121 } 122 123 mutex_unlock(&process->event_mutex); 124 125 return is_subscribed; 126 } 127 128 /* set pending event queue entry from ring entry */ 129 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, 130 unsigned int pasid, 131 uint32_t doorbell_id, 132 uint64_t trap_mask, 133 void *exception_data, 134 size_t exception_data_size) 135 { 136 struct kfd_process *p; 137 bool signaled_to_debugger_or_runtime = false; 138 139 p = kfd_lookup_process_by_pasid(pasid); 140 141 if (!p) 142 return false; 143 144 if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true, 145 exception_data, exception_data_size)) { 146 struct process_queue_manager *pqm; 147 struct process_queue_node *pqn; 148 149 if (!!(trap_mask & KFD_EC_MASK_QUEUE) && 150 p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) { 151 mutex_lock(&p->mutex); 152 153 pqm = &p->pqm; 154 list_for_each_entry(pqn, &pqm->queues, 155 process_queue_list) { 156 157 if (!(pqn->q && pqn->q->device == dev && 158 pqn->q->doorbell_id == doorbell_id)) 159 continue; 160 161 kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id, 162 trap_mask); 163 164 signaled_to_debugger_or_runtime = true; 165 166 break; 167 } 168 169 mutex_unlock(&p->mutex); 170 } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 171 kfd_dqm_evict_pasid(dev->dqm, p->pasid); 172 kfd_signal_vm_fault_event(dev, p->pasid, NULL, 173 exception_data); 174 175 signaled_to_debugger_or_runtime = true; 176 } 177 } else { 178 signaled_to_debugger_or_runtime = true; 179 } 180 181 kfd_unref_process(p); 182 183 return signaled_to_debugger_or_runtime; 184 } 185 186 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, 187 unsigned int dev_id, 188 unsigned int queue_id, 189 uint64_t error_reason) 190 { 191 if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 192 struct kfd_process_device *pdd = NULL; 193 struct kfd_hsa_memory_exception_data *data; 194 int i; 195 196 for (i = 0; i < p->n_pdds; i++) { 197 if (p->pdds[i]->dev->id == dev_id) { 198 pdd = p->pdds[i]; 199 break; 200 } 201 } 202 203 if (!pdd) 204 return -ENODEV; 205 206 data = (struct kfd_hsa_memory_exception_data *) 207 pdd->vm_fault_exc_data; 208 209 kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid); 210 kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data); 211 error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); 212 } 213 214 if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) { 215 /* 216 * block should only happen after the debugger receives runtime 217 * enable notice. 218 */ 219 up(&p->runtime_enable_sema); 220 error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME); 221 } 222 223 if (error_reason) 224 return kfd_send_exception_to_runtime(p, queue_id, error_reason); 225 226 return 0; 227 } 228 229 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable) 230 { 231 struct mqd_update_info minfo = {0}; 232 int err; 233 234 if (!q) 235 return 0; 236 237 if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) || 238 KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0)) 239 return 0; 240 241 if (enable && q->properties.is_user_cu_masked) 242 return -EBUSY; 243 244 minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE; 245 246 q->properties.is_dbg_wa = enable; 247 err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo); 248 if (err) 249 q->properties.is_dbg_wa = false; 250 251 return err; 252 } 253 254 static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable) 255 { 256 struct process_queue_manager *pqm = &target->pqm; 257 struct process_queue_node *pqn; 258 int r = 0; 259 260 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 261 r = kfd_dbg_set_queue_workaround(pqn->q, enable); 262 if (enable && r) 263 goto unwind; 264 } 265 266 return 0; 267 268 unwind: 269 list_for_each_entry(pqn, &pqm->queues, process_queue_list) 270 kfd_dbg_set_queue_workaround(pqn->q, false); 271 272 if (enable) 273 target->runtime_info.runtime_state = r == -EBUSY ? 274 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 275 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 276 277 return r; 278 } 279 280 int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd) 281 { 282 uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; 283 uint32_t flags = pdd->process->dbg_flags; 284 285 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 286 return 0; 287 288 return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl, 289 pdd->watch_points, flags); 290 } 291 292 /* kfd_dbg_trap_deactivate: 293 * target: target process 294 * unwind: If this is unwinding a failed kfd_dbg_trap_enable() 295 * unwind_count: 296 * If unwind == true, how far down the pdd list we need 297 * to unwind 298 * else: ignored 299 */ 300 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count) 301 { 302 int i; 303 304 if (!unwind) { 305 cancel_work_sync(&target->debug_event_workarea); 306 kfd_dbg_trap_set_wave_launch_mode(target, 0); 307 } 308 309 for (i = 0; i < target->n_pdds; i++) { 310 struct kfd_process_device *pdd = target->pdds[i]; 311 312 /* If this is an unwind, and we have unwound the required 313 * enable calls on the pdd list, we need to stop now 314 * otherwise we may mess up another debugger session. 315 */ 316 if (unwind && i == unwind_count) 317 break; 318 319 kfd_process_set_trap_debug_flag(&pdd->qpd, false); 320 321 /* GFX off is already disabled by debug activate if not RLC restore supported. */ 322 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 323 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 324 pdd->spi_dbg_override = 325 pdd->dev->kfd2kgd->disable_debug_trap( 326 pdd->dev->adev, 327 target->runtime_info.ttmp_setup, 328 pdd->dev->vm_info.last_vmid_kfd); 329 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 330 331 if (!kfd_dbg_is_per_vmid_supported(pdd->dev) && 332 release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd)) 333 pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id); 334 335 if (!pdd->dev->kfd->shared_resources.enable_mes) 336 debug_refresh_runlist(pdd->dev->dqm); 337 else 338 kfd_dbg_set_mes_debug_mode(pdd); 339 } 340 341 kfd_dbg_set_workaround(target, false); 342 343 if (!unwind) { 344 int resume_count = resume_queues(target, 0, NULL); 345 346 if (resume_count) 347 pr_debug("Resumed %d queues\n", resume_count); 348 } 349 } 350 351 static void kfd_dbg_clean_exception_status(struct kfd_process *target) 352 { 353 struct process_queue_manager *pqm; 354 struct process_queue_node *pqn; 355 int i; 356 357 for (i = 0; i < target->n_pdds; i++) { 358 struct kfd_process_device *pdd = target->pdds[i]; 359 360 kfd_process_drain_interrupts(pdd); 361 362 pdd->exception_status = 0; 363 } 364 365 pqm = &target->pqm; 366 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 367 if (!pqn->q) 368 continue; 369 370 pqn->q->properties.exception_status = 0; 371 } 372 373 target->exception_status = 0; 374 } 375 376 int kfd_dbg_trap_disable(struct kfd_process *target) 377 { 378 if (!target->debug_trap_enabled) 379 return 0; 380 381 /* 382 * Defer deactivation to runtime if runtime not enabled otherwise reset 383 * attached running target runtime state to enable for re-attach. 384 */ 385 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 386 kfd_dbg_trap_deactivate(target, false, 0); 387 else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) 388 target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; 389 390 fput(target->dbg_ev_file); 391 target->dbg_ev_file = NULL; 392 393 if (target->debugger_process) { 394 atomic_dec(&target->debugger_process->debugged_process_count); 395 target->debugger_process = NULL; 396 } 397 398 target->debug_trap_enabled = false; 399 kfd_dbg_clean_exception_status(target); 400 kfd_unref_process(target); 401 402 return 0; 403 } 404 405 int kfd_dbg_trap_activate(struct kfd_process *target) 406 { 407 int i, r = 0; 408 409 r = kfd_dbg_set_workaround(target, true); 410 if (r) 411 return r; 412 413 for (i = 0; i < target->n_pdds; i++) { 414 struct kfd_process_device *pdd = target->pdds[i]; 415 416 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) { 417 r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd); 418 419 if (r) { 420 target->runtime_info.runtime_state = (r == -EBUSY) ? 421 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 422 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 423 424 goto unwind_err; 425 } 426 } 427 428 /* Disable GFX OFF to prevent garbage read/writes to debug registers. 429 * If RLC restore of debug registers is not supported and runtime enable 430 * hasn't done so already on ttmp setup request, restore the trap config registers. 431 * 432 * If RLC restore of debug registers is not supported, keep gfx off disabled for 433 * the debug session. 434 */ 435 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 436 if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) || 437 target->runtime_info.ttmp_setup)) 438 pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true, 439 pdd->dev->vm_info.last_vmid_kfd); 440 441 pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap( 442 pdd->dev->adev, 443 false, 444 pdd->dev->vm_info.last_vmid_kfd); 445 446 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 447 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 448 449 /* 450 * Setting the debug flag in the trap handler requires that the TMA has been 451 * allocated, which occurs during CWSR initialization. 452 * In the event that CWSR has not been initialized at this point, setting the 453 * flag will be called again during CWSR initialization if the target process 454 * is still debug enabled. 455 */ 456 kfd_process_set_trap_debug_flag(&pdd->qpd, true); 457 458 if (!pdd->dev->kfd->shared_resources.enable_mes) 459 r = debug_refresh_runlist(pdd->dev->dqm); 460 else 461 r = kfd_dbg_set_mes_debug_mode(pdd); 462 463 if (r) { 464 target->runtime_info.runtime_state = 465 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 466 goto unwind_err; 467 } 468 } 469 470 return 0; 471 472 unwind_err: 473 /* Enabling debug failed, we need to disable on 474 * all GPUs so the enable is all or nothing. 475 */ 476 kfd_dbg_trap_deactivate(target, true, i); 477 return r; 478 } 479 480 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, 481 void __user *runtime_info, uint32_t *runtime_size) 482 { 483 struct file *f; 484 uint32_t copy_size; 485 int i, r = 0; 486 487 if (target->debug_trap_enabled) 488 return -EALREADY; 489 490 /* Enable pre-checks */ 491 for (i = 0; i < target->n_pdds; i++) { 492 struct kfd_process_device *pdd = target->pdds[i]; 493 494 if (!KFD_IS_SOC15(pdd->dev)) 495 return -ENODEV; 496 497 if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws) 498 return -EBUSY; 499 } 500 501 copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info)); 502 503 f = fget(fd); 504 if (!f) { 505 pr_err("Failed to get file for (%i)\n", fd); 506 return -EBADF; 507 } 508 509 target->dbg_ev_file = f; 510 511 /* defer activation to runtime if not runtime enabled */ 512 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 513 kfd_dbg_trap_activate(target); 514 515 /* We already hold the process reference but hold another one for the 516 * debug session. 517 */ 518 kref_get(&target->ref); 519 target->debug_trap_enabled = true; 520 521 if (target->debugger_process) 522 atomic_inc(&target->debugger_process->debugged_process_count); 523 524 if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) { 525 kfd_dbg_trap_deactivate(target, false, 0); 526 r = -EFAULT; 527 } 528 529 *runtime_size = sizeof(target->runtime_info); 530 531 return r; 532 } 533 534 static int kfd_dbg_validate_trap_override_request(struct kfd_process *p, 535 uint32_t trap_override, 536 uint32_t trap_mask_request, 537 uint32_t *trap_mask_supported) 538 { 539 int i = 0; 540 541 *trap_mask_supported = 0xffffffff; 542 543 for (i = 0; i < p->n_pdds; i++) { 544 struct kfd_process_device *pdd = p->pdds[i]; 545 int err = pdd->dev->kfd2kgd->validate_trap_override_request( 546 pdd->dev->adev, 547 trap_override, 548 trap_mask_supported); 549 550 if (err) 551 return err; 552 } 553 554 if (trap_mask_request & ~*trap_mask_supported) 555 return -EACCES; 556 557 return 0; 558 } 559 560 int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target, 561 uint32_t trap_override, 562 uint32_t trap_mask_bits, 563 uint32_t trap_mask_request, 564 uint32_t *trap_mask_prev, 565 uint32_t *trap_mask_supported) 566 { 567 int r = 0, i; 568 569 r = kfd_dbg_validate_trap_override_request(target, 570 trap_override, 571 trap_mask_request, 572 trap_mask_supported); 573 574 if (r) 575 return r; 576 577 for (i = 0; i < target->n_pdds; i++) { 578 struct kfd_process_device *pdd = target->pdds[i]; 579 580 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 581 pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override( 582 pdd->dev->adev, 583 pdd->dev->vm_info.last_vmid_kfd, 584 trap_override, 585 trap_mask_bits, 586 trap_mask_request, 587 trap_mask_prev, 588 pdd->spi_dbg_override); 589 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 590 591 if (!pdd->dev->kfd->shared_resources.enable_mes) 592 r = debug_refresh_runlist(pdd->dev->dqm); 593 else 594 r = kfd_dbg_set_mes_debug_mode(pdd); 595 596 if (r) 597 break; 598 } 599 600 return r; 601 } 602 603 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target, 604 uint8_t wave_launch_mode) 605 { 606 int r = 0, i; 607 608 if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL && 609 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT && 610 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG) 611 return -EINVAL; 612 613 for (i = 0; i < target->n_pdds; i++) { 614 struct kfd_process_device *pdd = target->pdds[i]; 615 616 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 617 pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode( 618 pdd->dev->adev, 619 wave_launch_mode, 620 pdd->dev->vm_info.last_vmid_kfd); 621 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 622 623 if (!pdd->dev->kfd->shared_resources.enable_mes) 624 r = debug_refresh_runlist(pdd->dev->dqm); 625 else 626 r = kfd_dbg_set_mes_debug_mode(pdd); 627 628 if (r) 629 break; 630 } 631 632 return r; 633 } 634 635 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target, 636 uint64_t exception_set_mask) 637 { 638 uint64_t found_mask = 0; 639 struct process_queue_manager *pqm; 640 struct process_queue_node *pqn; 641 static const char write_data = '.'; 642 loff_t pos = 0; 643 int i; 644 645 mutex_lock(&target->event_mutex); 646 647 found_mask |= target->exception_status; 648 649 pqm = &target->pqm; 650 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 651 if (!pqn) 652 continue; 653 654 found_mask |= pqn->q->properties.exception_status; 655 } 656 657 for (i = 0; i < target->n_pdds; i++) { 658 struct kfd_process_device *pdd = target->pdds[i]; 659 660 found_mask |= pdd->exception_status; 661 } 662 663 if (exception_set_mask & found_mask) 664 kernel_write(target->dbg_ev_file, &write_data, 1, &pos); 665 666 target->exception_enable_mask = exception_set_mask; 667 668 mutex_unlock(&target->event_mutex); 669 } 670