/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

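/* Deferred writer for kfd_dbg_ev_raise(): writes a single byte to the
 * process's debug event file from workqueue context so a polling debugger
 * can be notified.
 */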
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status; write to the event file
 * descriptor only if the exception is subscribed via the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

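/* Handle an exception reported to the HSA runtime: on memory violations,
 * evict the faulting process and raise a VM fault event; on
 * EC_PROCESS_RUNTIME, release the runtime enable semaphore; forward any
 * remaining reasons to the runtime's exception handler.
 */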
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * The runtime blocks on this semaphore only after the debugger
		 * has received the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

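/* Toggle the per-queue debug trap workaround (GFX 11.x only) by pushing an
 * MQD update; refuse to enable it if the user has already applied a CU mask
 * on the queue.
 */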
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

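/* Apply or remove the debug trap workaround on every queue of the target
 * process; if enabling fails part way, roll back all queues and record the
 * resulting debug runtime state.
 */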
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

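/* Mirror the process's SPI debug override/launch mode, watch points and
 * debug flags to the MES firmware scheduler; a no-op on devices without
 * per-VMID debug support.
 */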
static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count: if unwind == true, how far down the pdd list to unwind;
 *		      ignored otherwise
 */
static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		/* GFX off was already disabled by debug activate if RLC restore is not supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

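/* Tear down the debug session on the target: deactivate trap handling if
 * the runtime is still enabled, drop the event file and debugger process
 * references, and release the extra process reference taken at enable.
 */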
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime enable if the runtime is not enabled;
	 * otherwise reset the attached running target's runtime state back to
	 * enabled so it can re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_unref_process(target);

	return 0;
}

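/* Activate trap handling on every device of the target process: apply the
 * queue workaround, reserve a debug VMID where per-VMID debugging is
 * unsupported, program the trap registers with GFX off held, and refresh
 * the runlist or MES debug mode. On any failure, unwind the devices
 * enabled so far.
 */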
static int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

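/* Enable the debug trap for the target process: validate device support,
 * take references on the event file descriptor and the process, activate
 * trap handling if the runtime is already enabled, and copy the runtime
 * info snapshot back to the debugger.
 */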
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}