1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2008-2018 Intel Corporation
5  */
6 
7 #ifndef _I915_GPU_ERROR_H_
8 #define _I915_GPU_ERROR_H_
9 
10 #include <linux/atomic.h>
11 #include <linux/kref.h>
12 #include <linux/ktime.h>
13 #include <linux/sched.h>
14 
15 #include <drm/drm_mm.h>
16 
17 #include "gt/intel_engine.h"
18 #include "gt/intel_gt_types.h"
19 #include "gt/uc/intel_uc_fw.h"
20 
21 #include "intel_device_info.h"
22 
23 #include "i915_gem.h"
24 #include "i915_gem_gtt.h"
25 #include "i915_params.h"
26 #include "i915_scheduler.h"
27 
28 struct drm_i915_private;
29 struct i915_vma_compress;
30 struct intel_engine_capture_vma;
31 struct intel_overlay_error_state;
32 
33 struct i915_vma_coredump {
34 	struct i915_vma_coredump *next;
35 
36 	char name[20];
37 
38 	u64 gtt_offset;
39 	u64 gtt_size;
40 	u32 gtt_page_sizes;
41 
42 	int unused;
43 	struct list_head page_list;
44 };
45 
46 struct i915_request_coredump {
47 	unsigned long flags;
48 	pid_t pid;
49 	u32 context;
50 	u32 seqno;
51 	u32 head;
52 	u32 tail;
53 	struct i915_sched_attr sched_attr;
54 };
55 
56 struct intel_engine_coredump {
57 	const struct intel_engine_cs *engine;
58 
59 	bool hung;
60 	bool simulated;
61 	u32 reset_count;
62 
63 	/* position of active request inside the ring */
64 	u32 rq_head, rq_post, rq_tail;
65 
66 	/* Register state */
67 	u32 ccid;
68 	u32 start;
69 	u32 tail;
70 	u32 head;
71 	u32 ctl;
72 	u32 mode;
73 	u32 hws;
74 	u32 ipeir;
75 	u32 ipehr;
76 	u32 esr;
77 	u32 bbstate;
78 	u32 instpm;
79 	u32 instps;
80 	u64 bbaddr;
81 	u64 acthd;
82 	u32 fault_reg;
83 	u64 faddr;
84 	u32 rc_psmi; /* sleep state */
85 	struct intel_instdone instdone;
86 
87 	struct i915_gem_context_coredump {
88 		char comm[TASK_COMM_LEN];
89 
90 		u64 total_runtime;
91 		u32 avg_runtime;
92 
93 		pid_t pid;
94 		int active;
95 		int guilty;
96 		struct i915_sched_attr sched_attr;
97 	} context;
98 
99 	struct i915_vma_coredump *vma;
100 
101 	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
102 	unsigned int num_ports;
103 
104 	struct {
105 		u32 gfx_mode;
106 		union {
107 			u64 pdp[4];
108 			u32 pp_dir_base;
109 		};
110 	} vm_info;
111 
112 	struct intel_engine_coredump *next;
113 };
114 
115 struct intel_gt_coredump {
116 	const struct intel_gt *_gt;
117 	bool awake;
118 	bool simulated;
119 
120 	struct intel_gt_info info;
121 
122 	/* Generic register state */
123 	u32 eir;
124 	u32 pgtbl_er;
125 	u32 ier;
126 	u32 gtier[6], ngtier;
127 	u32 derrmr;
128 	u32 forcewake;
129 	u32 error; /* gen6+ */
130 	u32 err_int; /* gen7 */
131 	u32 fault_data0; /* gen8, gen9 */
132 	u32 fault_data1; /* gen8, gen9 */
133 	u32 done_reg;
134 	u32 gac_eco;
135 	u32 gam_ecochk;
136 	u32 gab_ctl;
137 	u32 gfx_mode;
138 	u32 gtt_cache;
139 	u32 aux_err; /* gen12 */
140 	u32 sfc_done[GEN12_SFC_DONE_MAX]; /* gen12 */
141 	u32 gam_done; /* gen12 */
142 
143 	u32 nfence;
144 	u64 fence[I915_MAX_NUM_FENCES];
145 
146 	struct intel_engine_coredump *engine;
147 
148 	struct intel_uc_coredump {
149 		struct intel_uc_fw guc_fw;
150 		struct intel_uc_fw huc_fw;
151 		struct i915_vma_coredump *guc_log;
152 	} *uc;
153 
154 	struct intel_gt_coredump *next;
155 };
156 
157 struct i915_gpu_coredump {
158 	struct kref ref;
159 	ktime_t time;
160 	ktime_t boottime;
161 	ktime_t uptime;
162 	unsigned long capture;
163 
164 	struct drm_i915_private *i915;
165 
166 	struct intel_gt_coredump *gt;
167 
168 	char error_msg[128];
169 	bool simulated;
170 	bool wakelock;
171 	bool suspended;
172 	int iommu;
173 	u32 reset_count;
174 	u32 suspend_count;
175 
176 	struct intel_device_info device_info;
177 	struct intel_runtime_info runtime_info;
178 	struct intel_driver_caps driver_caps;
179 	struct i915_params params;
180 
181 	struct intel_overlay_error_state *overlay;
182 
183 	struct scatterlist *sgl, *fit;
184 };
185 
186 struct i915_gpu_error {
187 	/* For reset and error_state handling. */
188 	spinlock_t lock;
189 	/* Protected by the above dev->gpu_error.lock. */
190 	struct i915_gpu_coredump *first_error;
191 
192 	atomic_t pending_fb_pin;
193 
194 	/** Number of times the device has been reset (global) */
195 	atomic_t reset_count;
196 
197 	/** Number of times an engine has been reset */
198 	atomic_t reset_engine_count[I915_NUM_ENGINES];
199 };
200 
201 struct drm_i915_error_state_buf {
202 	struct drm_i915_private *i915;
203 	struct scatterlist *sgl, *cur, *end;
204 
205 	char *buf;
206 	size_t bytes;
207 	size_t size;
208 	loff_t iter;
209 
210 	int err;
211 };
212 
213 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
214 
215 __printf(2, 3)
216 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
217 
218 struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
219 					    intel_engine_mask_t engine_mask);
220 void i915_capture_error_state(struct intel_gt *gt,
221 			      intel_engine_mask_t engine_mask);
222 
223 struct i915_gpu_coredump *
224 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
225 
226 struct intel_gt_coredump *
227 intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp);
228 
229 struct intel_engine_coredump *
230 intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp);
231 
232 struct intel_engine_capture_vma *
233 intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
234 				  struct i915_request *rq,
235 				  gfp_t gfp);
236 
237 void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
238 				   struct intel_engine_capture_vma *capture,
239 				   struct i915_vma_compress *compress);
240 
241 struct i915_vma_compress *
242 i915_vma_capture_prepare(struct intel_gt_coredump *gt);
243 
244 void i915_vma_capture_finish(struct intel_gt_coredump *gt,
245 			     struct i915_vma_compress *compress);
246 
247 void i915_error_state_store(struct i915_gpu_coredump *error);
248 
249 static inline struct i915_gpu_coredump *
250 i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
251 {
252 	kref_get(&gpu->ref);
253 	return gpu;
254 }
255 
256 ssize_t
257 i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
258 				 char *buf, loff_t offset, size_t count);
259 
260 void __i915_gpu_coredump_free(struct kref *kref);
261 static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
262 {
263 	if (gpu)
264 		kref_put(&gpu->ref, __i915_gpu_coredump_free);
265 }
266 
/* Query, reset and disable the error state kept for @i915. */
struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915);

void
i915_reset_error_state(struct drm_i915_private *i915);

void
i915_disable_error_state(struct drm_i915_private *i915, int err);
270 
271 #else
272 
273 static inline void
274 i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask)
275 {
276 }
277 
278 static inline struct i915_gpu_coredump *
279 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
280 {
281 	return NULL;
282 }
283 
284 static inline struct intel_gt_coredump *
285 intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
286 {
287 	return NULL;
288 }
289 
290 static inline struct intel_engine_coredump *
291 intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
292 {
293 	return NULL;
294 }
295 
296 static inline struct intel_engine_capture_vma *
297 intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
298 				  struct i915_request *rq,
299 				  gfp_t gfp)
300 {
301 	return NULL;
302 }
303 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void intel_engine_coredump_add_vma(
	struct intel_engine_coredump *ee,
	struct intel_engine_capture_vma *capture,
	struct i915_vma_compress *compress)
{
}
310 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: always NULL. */
static inline struct i915_vma_compress *i915_vma_capture_prepare(
	struct intel_gt_coredump *gt)
{
	return NULL;
}
316 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void i915_vma_capture_finish(struct intel_gt_coredump *gt,
					   struct i915_vma_compress *compress)
{
}
322 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void i915_error_state_store(struct i915_gpu_coredump *error)
{
}
327 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void
i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
}
331 
332 static inline struct i915_gpu_coredump *
333 i915_first_error_state(struct drm_i915_private *i915)
334 {
335 	return ERR_PTR(-ENODEV);
336 }
337 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void
i915_reset_error_state(struct drm_i915_private *i915)
{
}
341 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void
i915_disable_error_state(struct drm_i915_private *i915, int err)
{
}
346 
347 #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
348 
349 #endif /* _I915_GPU_ERROR_H_ */
350