/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

/* Deferred variant of the debug event notification: writes a single byte
 * to the debug event file descriptor to wake a polling debugger.
 */
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update the process/device/queue exception status and write to the event
 * descriptor, but only if the process has subscribed to the event through
 * its exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
		struct kfd_process *process, struct kfd_node *dev,
		unsigned int source_id, bool use_worker,
		void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			/* New-queue events are identified by queue id, all
			 * other queue events by doorbell id.
			 */
			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}
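
/* kfd_dbg_send_exception_to_runtime:
 *	p: target process
 *	dev_id: id of the device that raised the exception
 *	queue_id: id of the queue that raised the exception
 *	error_reason: mask of exception codes to forward
 *
 * On a memory violation, evicts the target's queues and signals the saved
 * VM fault data as an event; on a runtime exception, unblocks the target
 * waiting on runtime enable. Any remaining reasons are sent to the runtime
 * as a generic exception.
 */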
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
		unsigned int dev_id,
		unsigned int queue_id,
		uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
				pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * Unblocking the target should only happen after the debugger
		 * has received the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	/* The debug workaround is only needed on GFX11 (GC 11.x.x). */
	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	/* The workaround cannot coexist with a user-set CU mask. */
	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}
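
/* kfd_dbg_set_mes_debug_mode:
 *	pdd: target process device
 *
 * Pushes the process's shader debugger state (SPI debug override and launch
 * mode, trap watch points and debug flags) to the MES scheduler firmware.
 * Returns 0 without doing anything on devices without per-VMID debug support.
 */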
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX OFF is already disabled by debug activate if RLC restore
		 * is not supported.
		 */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Deactivate now if the runtime is enabled. Otherwise deactivation
	 * was deferred to runtime enable; reset an attached running target's
	 * runtime state back to enabled so a debugger can re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_unref_process(target);

	return 0;
}

int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
					DEBUG_RUNTIME_STATE_ENABLED_BUSY :
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage reads/writes to debug
		 * registers. If RLC restore of debug registers is not supported
		 * and runtime enable hasn't done so already on ttmp setup
		 * request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep
		 * GFX OFF disabled for the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
				target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
					pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the
		 * TMA has been allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point,
		 * setting the flag will be called again during CWSR initialization
		 * if the target process is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed; disable on all GPUs so that enabling is
	 * all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}
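
/* kfd_dbg_trap_enable:
 *	target: target process
 *	fd: file descriptor the debugger polls for event notifications
 *	runtime_info: user pointer that receives the target's runtime info
 *	runtime_size: in/out size of runtime_info; set to the full size on return
 *
 * Takes an extra reference on the target process for the debug session.
 * Activation is deferred to runtime enable if the target's runtime is not
 * yet enabled.
 */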
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}