/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

/* Deferred write of a single notification byte to the debugger's event
 * descriptor, for callers that cannot perform the write from their own
 * context.
 */
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status and write to the event
 * descriptor, but only if the exception is enabled in the process's
 * exception_enable_mask. Returns true if the debugger is subscribed to
 * the exception, false otherwise.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}
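/* Apply or remove the debug trap workaround on a single queue via an MQD
 * update. Only GC 11.x needs the workaround; enabling fails with -EBUSY
 * if user space has already applied a CU mask to the queue.
 */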
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

/* Apply the queue workaround to every queue of the target process. If
 * enabling fails on any queue, unwind by disabling the workaround on all
 * queues and record the reason in the runtime state.
 */
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}
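/* Push the per-VMID debug settings (SPI debug override and launch mode,
 * watch points, debug flags) to the MES scheduler firmware. No-op on
 * devices without per-VMID debugging support.
 */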
static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		/* GFX OFF is already disabled by debug activate if RLC restore
		 * is not supported.
		 */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Deactivate immediately only if the target's runtime is enabled;
	 * otherwise deactivation was deferred and there is nothing to undo.
	 * A target left in an intermediate (busy/error) state is reset to
	 * ENABLED so a debugger can re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_unref_process(target);

	return 0;
}

/* Enable the debug trap on all devices of the target process. Called on
 * attach when the target's runtime is already enabled; otherwise activation
 * is deferred to runtime enable. Failure unwinds so enabling stays all or
 * nothing.
 */
static int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage reads/writes to debug
		 * registers. If RLC restore of debug registers is not supported
		 * and runtime enable hasn't done so already on ttmp setup
		 * request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep
		 * GFX OFF disabled for the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
				target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
					pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}
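/* kfd_dbg_trap_enable:
 *	target: target process to attach the debugger to
 *	fd: file descriptor exception notifications are written to
 *	runtime_info: user buffer that receives a copy of target->runtime_info
 *	runtime_size: in/out size of the runtime_info buffer; set to the
 *		kernel's runtime_info size on return
 */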
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}