/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

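/*
 * Deferred work handler that writes a single byte to the debugger's event
 * file descriptor so a poll()/read() on the other end wakes up.  Scheduled
 * by kfd_dbg_ev_raise() when it is asked to defer the write via use_worker.
 */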
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status and write to the event
 * file descriptor, but only if the debugger has subscribed to the
 * exception via the process exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

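			/* A new-queue exception identifies the queue by its
			 * queue ID; all other queue exceptions report the
			 * queue's doorbell ID as the source.
			 */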
			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

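/*
 * Apply or remove the per-queue debug workaround on GFX 11.x devices
 * (IP versions in [11.0.0, 12.0.0)).  Enabling is refused with -EBUSY if
 * the queue already carries a user-requested CU mask.
 */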
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

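/*
 * Apply the queue debug workaround to every queue of the target process.
 * If enabling fails part-way through, roll the workaround back on all
 * queues and record the reason in the target's runtime state (-EBUSY maps
 * to DEBUG_RUNTIME_STATE_ENABLED_BUSY, anything else to
 * DEBUG_RUNTIME_STATE_ENABLED_ERROR).
 */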
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

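/*
 * Push the current per-process debug settings (SPI debug override, launch
 * mode, watch points and debug flags) to the MES scheduler.  No-op on
 * devices without per-VMID debug support.
 */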
static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind; otherwise ignored.
 */
static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

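	/* Flush any pending event-write work only on a real disable, not
	 * when unwinding a failed enable.
	 */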
	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		/* GFX off was already disabled by debug activate if RLC restore is not supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);
}

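/*
 * Tear down the debug session for the target process: deactivate traps if
 * the runtime is enabled, release the event file reference and drop the
 * extra process reference taken at enable time.
 */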
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * If the runtime is enabled, deactivate now.  Otherwise defer
	 * deactivation to runtime enable and, unless the runtime is disabled,
	 * reset the attached, running target's runtime state back to enabled
	 * so it can be re-attached.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_unref_process(target);

	return 0;
}

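/*
 * Activate debug traps on every device of the target process: apply the
 * queue debug workaround, reserve a debug VMID where per-VMID debugging is
 * not supported, program the SPI debug override and refresh the runlist
 * (or MES debug mode).  On any failure, unwind the devices enabled so far
 * so that activation is all or nothing.
 */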
static int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed; disable on all GPUs so that the enable is
	 * all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

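/*
 * Enable the debug trap session for the target process.  fd is the event
 * file descriptor supplied by the debugger; runtime_info/runtime_size
 * describe a user buffer that receives a copy of the target's runtime
 * info.  Trap activation itself is deferred until runtime enable if the
 * target's runtime is not yet enabled.
 */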
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}