/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

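/* Deferred write to the debugger's event file descriptor.
 * Scheduled by kfd_dbg_ev_raise() when use_worker is true; writes a single
 * byte to signal a pending event to the debugger.
 */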
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status and write to the event
 * descriptor, but only if the process has subscribed to the exception in
 * its exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

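/* Forward pending exceptions to the runtime.
 * Memory violations evict the process queues and raise a VM fault event;
 * EC_PROCESS_RUNTIME releases the runtime enable semaphore; any remaining
 * reasons are delivered via kfd_send_exception_to_runtime().
 */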
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * Unblocking should only happen after the debugger receives
		 * the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

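/* Apply or remove the debug trap workaround on a single queue.
 * Only GC 11.x devices need the MQD update; enabling fails with -EBUSY if
 * the user has already applied a CU mask to the queue.
 */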
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

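/* Apply or remove the debug trap workaround on every queue of the target
 * process. On an enable failure, unwind all queues and record a busy/error
 * runtime state for reporting back to the debugger.
 */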
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

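/* Push the per-process debug configuration (SPI debug control, watch points
 * and debug flags) to the MES firmware. A no-op on devices without per-VMID
 * debug support.
 */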
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: true if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count: if unwind is true, how far down the pdd list we need
 *		to unwind; otherwise ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind and we have already unwound the required
		 * number of enable calls on the pdd list, stop now; otherwise
		 * we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX OFF is already disabled by debug activate if RLC restore is not supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

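/* Tear down the debug session for the target process: deactivate trap
 * handling if the runtime is enabled, drop the event file reference, detach
 * from the debugger process and release the extra process reference taken
 * at enable time.
 */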
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if the runtime is not enabled;
	 * otherwise reset the attached running target's runtime state back to
	 * ENABLED so it can be re-attached.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_unref_process(target);

	return 0;
}

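/* Activate trap handling on every device of the target process: reserve a
 * debug VMID where per-VMID debugging is not supported, enable the trap
 * override and refresh the runlist (or MES debug mode). On failure, unwind
 * the devices enabled so far so activation is all or nothing.
 */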
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage reads/writes of debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * has not already restored them on a ttmp setup request, restore the
		 * trap config registers here.
		 *
		 * If RLC restore of debug registers is not supported, keep GFX OFF
		 * disabled for the duration of the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

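/* Enable a debug session on the target process. Validates device support,
 * takes a reference on the debugger's event file descriptor and on the
 * target process, activates trap handling if the runtime is already enabled,
 * and copies the runtime info back to the debugger.
 */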
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime enable if the runtime is not yet enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}