/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/atomic.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "display/intel_display_device.h"
#include "gt/intel_engine.h"
#include "gt/intel_engine_types.h"
#include "gt/intel_gt_types.h"
#include "gt/uc/intel_uc_fw.h"

#include "intel_device_info.h"

#include "i915_gem.h"
#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"

struct drm_i915_private;
struct i915_vma_compress;
struct intel_engine_capture_vma;
struct intel_overlay_error_state;

struct i915_vma_coredump {
	struct i915_vma_coredump *next;

	char name[20];

	u64 gtt_offset;
	u64 gtt_size;
	u32 gtt_page_sizes;

	int unused;
	struct list_head page_list;
};

struct i915_request_coredump {
	unsigned long flags;
	pid_t pid;
	u32 context;
	u32 seqno;
	u32 head;
	u32 tail;
	struct i915_sched_attr sched_attr;
};

struct __guc_capture_parsed_output;

struct intel_engine_coredump {
	const struct intel_engine_cs *engine;

	bool hung;
	bool simulated;
	u32 reset_count;

	/* position of active request inside the ring */
	u32 rq_head, rq_post, rq_tail;

	/* Register state */
	u32 ccid;
	u32 start;
	u32 tail;
	u32 head;
	u32 ctl;
	u32 mode;
	u32 hws;
	u32 ipeir;
	u32 ipehr;
	u32 esr;
	u32 bbstate;
	u32 instpm;
	u32 instps;
	u64 bbaddr;
	u64 acthd;
	u32 fault_reg;
	u64 faddr;
	u32 rc_psmi; /* sleep state */
	u32 nopid;
	u32 excc;
	u32 cmd_cctl;
	u32 cscmdop;
	u32 ctx_sr_ctl;
	u32 dma_faddr_hi;
	u32 dma_faddr_lo;
	struct intel_instdone instdone;

	/* GuC matched capture-lists info */
	struct intel_guc_state_capture *guc_capture;
	struct __guc_capture_parsed_output *guc_capture_node;

	struct i915_gem_context_coredump {
		char comm[TASK_COMM_LEN];

		u64 total_runtime;
		u64 avg_runtime;

		pid_t pid;
		int active;
		int guilty;
		struct i915_sched_attr sched_attr;
		u32 hwsp_seqno;
	} context;

	struct i915_vma_coredump *vma;

	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
	unsigned int num_ports;

	struct {
		u32 gfx_mode;
		union {
			u64 pdp[4];
			u32 pp_dir_base;
		};
	} vm_info;

	struct intel_engine_coredump *next;
};

struct intel_ctb_coredump {
	u32 raw_head, head;
	u32 raw_tail, tail;
	u32 raw_status;
	u32 desc_offset;
	u32 cmds_offset;
	u32 size;
};

struct intel_gt_coredump {
	const struct intel_gt *_gt;
	bool awake;
	bool simulated;

	struct intel_gt_info info;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[6], ngtier;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;
	u32 gtt_cache;
	u32 aux_err; /* gen12 */
	u32 gam_done; /* gen12 */
	u32 clock_frequency;
	u32 clock_period_ns;

	/* Display related */
	u32 derrmr;
	u32 sfc_done[I915_MAX_SFC]; /* gen12 */

	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];

	struct intel_engine_coredump *engine;

	struct intel_uc_coredump {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct guc_info {
			struct intel_ctb_coredump ctb[2];
			struct i915_vma_coredump *vma_ctb;
			struct i915_vma_coredump *vma_log;
			u32 timestamp;
			u16 last_fence;
			bool is_guc_capture;
		} guc;
	} *uc;

	struct intel_gt_coredump *next;
};

struct i915_gpu_coredump {
	struct kref ref;
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;

	struct drm_i915_private *i915;

	struct intel_gt_coredump *gt;

	char error_msg[128];
	bool simulated;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;

	struct intel_device_info device_info;
	struct intel_runtime_info runtime_info;
	struct intel_display_device_info display_device_info;
	struct intel_display_runtime_info display_runtime_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct intel_overlay_error_state *overlay;

	struct scatterlist *sgl, *fit;
};

struct i915_gpu_error {
	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_coredump *first_error;

	atomic_t pending_fb_pin;

	/** Number of times the device has been reset (global) */
	atomic_t reset_count;

	/** Number of times an engine has been reset */
	atomic_t reset_engine_count[MAX_ENGINE_CLASS];
};

struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	struct scatterlist *sgl, *cur, *end;

	char *buf;
	size_t bytes;
	size_t size;
	loff_t iter;

	int err;
};

static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
	return atomic_read(&error->reset_count);
}

static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
					  const struct intel_engine_cs *engine)
{
	return atomic_read(&error->reset_engine_count[engine->class]);
}

static inline void
i915_increase_reset_engine_count(struct i915_gpu_error *error,
				 const struct intel_engine_cs *engine)
{
	atomic_inc(&error->reset_engine_count[engine->class]);
}

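/*
 * Illustrative sketch, not part of the original header: both counters are
 * monotonically increasing, so a caller can detect whether a reset happened
 * across some operation by sampling them before and after. Names other than
 * the helpers themselves (i915, ce) are assumed for the example:
 *
 *	u32 global = i915_reset_count(&i915->gpu_error);
 *	u32 engine = i915_reset_engine_count(&i915->gpu_error, ce->engine);
 *
 *	... run work that may hang and trigger a reset ...
 *
 *	if (i915_reset_count(&i915->gpu_error) != global)
 *		pr_info("full GPU reset occurred\n");
 *	if (i915_reset_engine_count(&i915->gpu_error, ce->engine) != engine)
 *		pr_info("engine (or another engine of its class) was reset\n");
 *
 * Note that reset_engine_count[] is indexed by engine->class, so the value
 * is shared by all engines of the same class.
 */
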
#define CORE_DUMP_FLAG_NONE           0x0
#define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) && IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
void intel_klog_error_capture(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask);
#else
static inline void intel_klog_error_capture(struct intel_gt *gt,
					    intel_engine_mask_t engine_mask)
{
}
#endif

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
			       const struct intel_engine_cs *engine,
			       const struct i915_vma_coredump *vma);
struct i915_vma_coredump *
intel_gpu_error_find_batch(const struct intel_engine_coredump *ee);

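/*
 * Illustrative sketch, not part of the original header: i915_error_printf()
 * appends printf-formatted text to a struct drm_i915_error_state_buf and is
 * how the dump helpers emit textual state, e.g. for a field of
 * struct intel_engine_coredump (the helper name below is made up):
 *
 *	static void print_acthd(struct drm_i915_error_state_buf *m,
 *				const struct intel_engine_coredump *ee)
 *	{
 *		i915_error_printf(m, "  ACTHD: 0x%016llx\n", ee->acthd);
 *	}
 *
 * Failures (e.g. allocation errors while growing the buffer) are latched in
 * m->err rather than being returned by each call.
 */
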
struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
					    intel_engine_mask_t engine_mask, u32 dump_flags);
void i915_capture_error_state(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask, u32 dump_flags);

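/*
 * Illustrative sketch, not part of the original header:
 * i915_capture_error_state() is the one-shot path used when a hang is
 * detected: it builds a coredump for the given engines and stores it as the
 * error state exposed to userspace. The two-step form lets a caller keep a
 * reference to the dump first:
 *
 *	struct i915_gpu_coredump *error;
 *
 *	error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);
 *	if (!IS_ERR_OR_NULL(error)) {
 *		i915_error_state_store(error);
 *		i915_gpu_coredump_put(error);
 *	}
 */
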
struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);

struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags);

struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags);

struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp);

void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
				   struct intel_engine_capture_vma *capture,
				   struct i915_vma_compress *compress);

struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt);

void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress);

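/*
 * Illustrative sketch, not part of the original header: an engine dump is
 * built in two phases. intel_engine_coredump_add_request() records the
 * request state and which buffers to copy; the buffer contents are only
 * captured later by intel_engine_coredump_add_vma() using a compressor
 * prepared against the intel_gt_coredump. Error handling is omitted and
 * names such as "gt_dump" and "rq" are assumed:
 *
 *	struct intel_engine_coredump *ee;
 *	struct intel_engine_capture_vma *capture;
 *	struct i915_vma_compress *compress;
 *
 *	ee = intel_engine_coredump_alloc(engine, GFP_KERNEL, dump_flags);
 *	capture = intel_engine_coredump_add_request(ee, rq, GFP_KERNEL);
 *
 *	compress = i915_vma_capture_prepare(gt_dump);
 *	intel_engine_coredump_add_vma(ee, capture, compress);
 *	i915_vma_capture_finish(gt_dump, compress);
 */
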
void i915_error_state_store(struct i915_gpu_coredump *error);

static inline struct i915_gpu_coredump *
i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

ssize_t
i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
				 char *buf, loff_t offset, size_t count);

void __i915_gpu_coredump_free(struct kref *kref);
static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_coredump_free);
}

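/*
 * Illustrative sketch, not part of the original header: the coredump is
 * reference counted, so readers pin it with i915_gpu_coredump_get() (or via
 * i915_first_error_state(), which returns a referenced dump) and release it
 * with i915_gpu_coredump_put(). The formatted text is then read out in
 * chunks with i915_gpu_coredump_copy_to_buffer(), roughly as the error-state
 * file read path does (buf/off/count are assumed to come from the caller):
 *
 *	struct i915_gpu_coredump *error = i915_first_error_state(i915);
 *	ssize_t ret = 0;
 *
 *	if (!IS_ERR_OR_NULL(error)) {
 *		ret = i915_gpu_coredump_copy_to_buffer(error, buf, off, count);
 *		i915_gpu_coredump_put(error);
 *	}
 */
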
struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
void i915_disable_error_state(struct drm_i915_private *i915, int err);

#else

__printf(2, 3)
static inline void
i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
}

static inline void
i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
{
}

static inline struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{
	return NULL;
}

static inline struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
{
	return NULL;
}

static inline struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
{
	return NULL;
}

static inline struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp)
{
	return NULL;
}

static inline void
intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
{
}

static inline struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{
	return NULL;
}

static inline void
i915_vma_capture_finish(struct intel_gt_coredump *gt,
			struct i915_vma_compress *compress)
{
}

static inline void
i915_error_state_store(struct i915_gpu_coredump *error)
{
}

static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
}

static inline struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915)
{
	return ERR_PTR(-ENODEV);
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

static inline void i915_disable_error_state(struct drm_i915_private *i915,
					    int err)
{
}

#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */