/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

/* Workqueue handler: write a single '.' byte to the debugger's polled event
 * file descriptor to wake it up.  Scheduled (rather than written inline) by
 * kfd_dbg_ev_raise() when the caller cannot safely call kernel_write() from
 * its own context (use_worker == true).
 */
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	/* NOTE(review): no return-value check on kernel_write(); presumably a
	 * dropped wakeup byte is acceptable best-effort here — confirm.
	 */
	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 *
 * event_mask selects the target level:
 *   - KFD_EC_MASK_DEVICE bits  -> per-device (pdd) exception_status
 *   - KFD_EC_MASK_PROCESS bits -> process-wide exception_status
 *   - otherwise                -> per-queue, matched by queue_id (for
 *     EC_QUEUE_NEW) or doorbell_id against source_id
 * exception_data/exception_data_size carry the VM-fault payload and are only
 * consumed for EC_DEVICE_MEMORY_VIOLATION (first fault per device is kept;
 * later ones are dumped to the log and discarded).
 *
 * Returns true if the debugger subscribed to any of the raised events (and
 * was therefore signalled through dbg_ev_file), false otherwise or when the
 * process has no debug session.  Runs under process->event_mutex.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	/* No process or no attached debugger: nothing to record or signal. */
	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				/* Keep only the first fault's payload; freed
				 * when the debugger fetches it elsewhere.
				 */
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					/* NOTE(review): message lacks a
					 * trailing '\n'; also the allocation
					 * failure is only logged — status bit
					 * stays set with no payload. Confirm
					 * intended.
					 */
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		/* Queue-level event: find the queue on this device whose id
		 * matches source_id.  New-queue events are identified by
		 * queue_id (no doorbell assigned yet); everything else by
		 * doorbell_id.
		 */
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			/* NOTE(review): int target_id compared against
			 * unsigned int source_id — fine for the small id
			 * ranges presumably in use, but verify neither can
			 * go negative/huge.
			 */
			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	/* Only poke the debugger if it subscribed to at least one of the
	 * raised events; unsubscribed events are still latched above.
	 */
	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* Forward exceptions the debugger chose not to handle back to the HSA
 * runtime of process @p.
 *
 * - EC_DEVICE_MEMORY_VIOLATION: evict the process's queues on the faulting
 *   device (dev_id) and deliver the saved vm_fault_exc_data as a VM-fault
 *   event, then clear the bit.
 * - EC_PROCESS_RUNTIME: release the runtime-enable semaphore the runtime is
 *   blocked on (see comment below), then clear the bit.
 * - Any remaining bits are sent to the runtime's queue-level exception path.
 *
 * Returns 0 on success, -ENODEV if dev_id matches no pdd of @p, or the
 * result of kfd_send_exception_to_runtime().
 */
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				unsigned int dev_id,
				unsigned int queue_id,
				uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		/* Payload captured earlier by kfd_dbg_ev_raise(); may be NULL
		 * if that allocation failed — presumably
		 * kfd_signal_vm_fault_event() tolerates that. TODO confirm.
		 */
		data = (struct kfd_hsa_memory_exception_data *)
				pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

/* Apply or remove the per-queue debug workaround (MQD update) for @q.
 *
 * Only relevant on GFX11 (IP 11.x.x); other generations return 0 untouched.
 * Refuses to enable while the user has a CU mask applied (-EBUSY), since the
 * workaround and user CU masking conflict.
 *
 * NOTE(review): on update_queue() failure is_dbg_wa is forced to false even
 * on the disable path — harmless today (it was being disabled anyway), but
 * worth confirming that is the intent.
 */
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

/* Apply (enable=true) or remove (enable=false) the debug workaround on every
 * queue of @target.  Enabling is all-or-nothing: on the first failure all
 * queues are unwound to the disabled state and the process's runtime_state is
 * marked ENABLED_BUSY (-EBUSY, i.e. a user CU mask is present) or
 * ENABLED_ERROR.  Disable failures are ignored (r is overwritten each
 * iteration and the final return is 0 via the loop falling through).
 */
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	/* Roll every queue back to "workaround off" so a partial enable
	 * never persists.
	 */
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

/* Push the process-device's current debug configuration (SPI debug control,
 * watch points, debug flags) to the MES firmware scheduler.  No-op (returns
 * 0) on devices without per-VMID debug support.
 */
static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *			to unwind
 *		else: ignored
 *
 * Mirrors kfd_dbg_trap_activate(): disables the HW trap on each pdd,
 * releases the reserved debug VMID where per-VMID debugging is unsupported,
 * refreshes the runlist (HWS) or reprograms MES, and finally removes the
 * queue workaround.  Each kfd2kgd register access is bracketed by a gfx-off
 * disable/enable pair.
 */
static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	/* A real disable must not race the deferred event-write worker; on
	 * unwind the worker was never relevant (enable failed part-way).
	 */
	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		/* NOTE(review): refresh/MES errors are ignored on the
		 * deactivate path — presumably deliberate (teardown is
		 * best-effort); confirm.
		 */
		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

/* Tear down the debug session on @target: deactivate HW state (or, if the
 * runtime isn't currently enabled, just reset runtime_state so a later
 * re-attach works), drop the event file reference, detach from the debugger
 * process, and release the session's process reference.  Always returns 0.
 */
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime not enabled otherwise reset
	 * attached running target runtime state to enable for re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	/* Drops the reference taken by kfd_dbg_trap_enable(). */
	kfd_unref_process(target);

	return 0;
}

/* Activate debug trapping on every device of @target: apply the queue
 * workaround, reserve a debug VMID where per-VMID debugging is unsupported,
 * enable the HW debug trap, and propagate the config via runlist refresh
 * (HWS) or MES.  All-or-nothing: any failure unwinds the i pdds already
 * activated via kfd_dbg_trap_deactivate(target, true, i) and returns the
 * error, with runtime_state set to ENABLED_BUSY or ENABLED_ERROR.
 */
static int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

/* Enable a debug session on @target.
 *
 * @fd:           file descriptor the debugger polls; kfd_dbg_ev_raise()
 *                writes a '.' to it per subscribed event.
 * @runtime_info: user buffer receiving a copy of target->runtime_info.
 * @runtime_size: in: user buffer size; out: sizeof(target->runtime_info).
 *
 * Pre-checks every device (SOC15 required; GWS in use without debug-GWS
 * support -> -EBUSY), takes a reference on the event file and on the
 * process, and activates HW state now only if the runtime is already
 * enabled (otherwise activation is deferred to runtime enable).
 *
 * Returns 0, -EALREADY (already being debugged), -ENODEV, -EBUSY, -EBADF,
 * or -EFAULT if the copy-out fails.
 *
 * NOTE(review): the return value of kfd_dbg_trap_activate() is discarded —
 * a failed activation still reports success to the caller (runtime_state
 * carries the error).  Also, on copy_to_user failure the session is
 * deactivated but debug_trap_enabled, the file reference and the process
 * reference are left in place — presumably the caller is expected to call
 * kfd_dbg_trap_disable() on error; confirm both behaviors are intended.
 */
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	/* Always report the full struct size so userspace can detect a
	 * truncated copy.
	 */
	*runtime_size = sizeof(target->runtime_info);

	return r;
}