1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014-2018 Intel Corporation
4  */
5 
6 #include "i915_drv.h"
7 #include "intel_context.h"
8 #include "intel_engine_pm.h"
9 #include "intel_engine_regs.h"
10 #include "intel_gpu_commands.h"
11 #include "intel_gt.h"
12 #include "intel_gt_mcr.h"
13 #include "intel_gt_regs.h"
14 #include "intel_ring.h"
15 #include "intel_workarounds.h"
16 
17 /**
18  * DOC: Hardware workarounds
19  *
20  * This file is intended as a central place to implement most [1]_ of the
21  * required workarounds for hardware to work as originally intended. They fall
 * into five basic categories depending on how/when they are applied:
23  *
24  * - Workarounds that touch registers that are saved/restored to/from the HW
25  *   context image. The list is emitted (via Load Register Immediate commands)
 *   every time a new context is created.
27  * - GT workarounds. The list of these WAs is applied whenever these registers
28  *   revert to default values (on GPU reset, suspend/resume [2]_, etc..).
29  * - Display workarounds. The list is applied during display clock-gating
30  *   initialization.
31  * - Workarounds that whitelist a privileged register, so that UMDs can manage
 *   them directly. This is just a special case of an MMIO workaround (as we
 *   write the list of these to-be-whitelisted registers to some special HW
34  *   registers).
35  * - Workaround batchbuffers, that get executed automatically by the hardware
36  *   on every HW context restore.
37  *
38  * .. [1] Please notice that there are other WAs that, due to their nature,
39  *    cannot be applied from a central place. Those are peppered around the rest
40  *    of the code, as needed.
41  *
 * .. [2] Technically, some registers are power context saved & restored, so they
43  *    survive a suspend/resume. In practice, writing them again is not too
44  *    costly and simplifies things. We can revisit this in the future.
45  *
46  * Layout
47  * ~~~~~~
48  *
49  * Keep things in this file ordered by WA type, as per the above (context, GT,
50  * display, register whitelist, batchbuffer). Then, inside each type, keep the
51  * following order:
52  *
53  * - Infrastructure functions and macros
54  * - WAs per platform in standard gen/chrono order
55  * - Public functions to init or apply the given workaround type.
56  */
57 
58 static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name)
59 {
60 	wal->name = name;
61 	wal->engine_name = engine_name;
62 }
63 
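/* The wa list grows and is trimmed in chunks of this many entries. */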
64 #define WA_LIST_CHUNK (1 << 4)
65 
66 static void wa_init_finish(struct i915_wa_list *wal)
67 {
68 	/* Trim unused entries. */
69 	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
70 		struct i915_wa *list = kmemdup(wal->list,
71 					       wal->count * sizeof(*list),
72 					       GFP_KERNEL);
73 
74 		if (list) {
75 			kfree(wal->list);
76 			wal->list = list;
77 		}
78 	}
79 
80 	if (!wal->count)
81 		return;
82 
83 	DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n",
84 			 wal->wa_count, wal->name, wal->engine_name);
85 }
86 
87 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
88 {
89 	unsigned int addr = i915_mmio_reg_offset(wa->reg);
90 	unsigned int start = 0, end = wal->count;
91 	const unsigned int grow = WA_LIST_CHUNK;
92 	struct i915_wa *wa_;
93 
94 	GEM_BUG_ON(!is_power_of_2(grow));
95 
96 	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
97 		struct i915_wa *list;
98 
99 		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
100 				     GFP_KERNEL);
101 		if (!list) {
102 			DRM_ERROR("No space for workaround init!\n");
103 			return;
104 		}
105 
106 		if (wal->list) {
107 			memcpy(list, wal->list, sizeof(*wa) * wal->count);
108 			kfree(wal->list);
109 		}
110 
111 		wal->list = list;
112 	}
113 
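	/*
	 * Entries are kept sorted by mmio offset; if the register already has
	 * an entry, the new clr/set/read masks are merged into it below.
	 */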
114 	while (start < end) {
115 		unsigned int mid = start + (end - start) / 2;
116 
117 		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
118 			start = mid + 1;
119 		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
120 			end = mid;
121 		} else {
122 			wa_ = &wal->list[mid];
123 
124 			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
125 				DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
126 					  i915_mmio_reg_offset(wa_->reg),
127 					  wa_->clr, wa_->set);
128 
129 				wa_->set &= ~wa->clr;
130 			}
131 
132 			wal->wa_count++;
133 			wa_->set |= wa->set;
134 			wa_->clr |= wa->clr;
135 			wa_->read |= wa->read;
136 			return;
137 		}
138 	}
139 
140 	wal->wa_count++;
141 	wa_ = &wal->list[wal->count++];
142 	*wa_ = *wa;
143 
144 	while (wa_-- > wal->list) {
145 		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
146 			   i915_mmio_reg_offset(wa_[1].reg));
147 		if (i915_mmio_reg_offset(wa_[1].reg) >
148 		    i915_mmio_reg_offset(wa_[0].reg))
149 			break;
150 
151 		swap(wa_[1], wa_[0]);
152 	}
153 }
154 
155 static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
156 		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
157 {
158 	struct i915_wa wa = {
159 		.reg  = reg,
160 		.clr  = clear,
161 		.set  = set,
162 		.read = read_mask,
163 		.masked_reg = masked_reg,
164 	};
165 
166 	_wa_add(wal, &wa);
167 }
168 
169 static void
170 wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
171 {
172 	wa_add(wal, reg, clear, set, clear, false);
173 }
174 
175 static void
176 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
177 {
178 	wa_write_clr_set(wal, reg, ~0, set);
179 }
180 
181 static void
182 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
183 {
184 	wa_write_clr_set(wal, reg, set, set);
185 }
186 
187 static void
188 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
189 {
190 	wa_write_clr_set(wal, reg, clr, 0);
191 }
192 
193 /*
194  * WA operations on "masked register". A masked register has the upper 16 bits
195  * documented as "masked" in b-spec. Its purpose is to allow writing to just a
 * portion of the register without an rmw: you simply write in the upper 16 bits
197  * the mask of bits you are going to modify.
198  *
199  * The wa_masked_* family of functions already does the necessary operations to
 * calculate the mask based on the parameters passed, so the user only has to
201  * provide the lower 16 bits of that register.
202  */
203 
204 static void
205 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
206 {
207 	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
208 }
209 
210 static void
211 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
212 {
213 	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
214 }
215 
216 static void
217 wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
218 		    u32 mask, u32 val)
219 {
220 	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
221 }
222 
223 static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
224 				      struct i915_wa_list *wal)
225 {
226 	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
227 }
228 
229 static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
230 				      struct i915_wa_list *wal)
231 {
232 	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
233 }
234 
235 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
236 				      struct i915_wa_list *wal)
237 {
238 	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
239 
240 	/* WaDisableAsyncFlipPerfMode:bdw,chv */
241 	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
242 
243 	/* WaDisablePartialInstShootdown:bdw,chv */
244 	wa_masked_en(wal, GEN8_ROW_CHICKEN,
245 		     PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
246 
247 	/* Use Force Non-Coherent whenever executing a 3D context. This is a
248 	 * workaround for a possible hang in the unlikely event a TLB
249 	 * invalidation occurs during a PSD flush.
250 	 */
251 	/* WaForceEnableNonCoherent:bdw,chv */
252 	/* WaHdcDisableFetchWhenMasked:bdw,chv */
253 	wa_masked_en(wal, HDC_CHICKEN0,
254 		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
255 		     HDC_FORCE_NON_COHERENT);
256 
257 	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
258 	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
259 	 *  polygons in the same 8x4 pixel/sample area to be processed without
260 	 *  stalling waiting for the earlier ones to write to Hierarchical Z
261 	 *  buffer."
262 	 *
263 	 * This optimization is off by default for BDW and CHV; turn it on.
264 	 */
265 	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
266 
267 	/* Wa4x4STCOptimizationDisable:bdw,chv */
268 	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
269 
270 	/*
271 	 * BSpec recommends 8x4 when MSAA is used,
272 	 * however in practice 16x4 seems fastest.
273 	 *
274 	 * Note that PS/WM thread counts depend on the WIZ hashing
275 	 * disable bit, which we don't touch here, but it's good
276 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
277 	 */
278 	wa_masked_field_set(wal, GEN7_GT_MODE,
279 			    GEN6_WIZ_HASHING_MASK,
280 			    GEN6_WIZ_HASHING_16x4);
281 }
282 
283 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
284 				     struct i915_wa_list *wal)
285 {
286 	struct drm_i915_private *i915 = engine->i915;
287 
288 	gen8_ctx_workarounds_init(engine, wal);
289 
290 	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
291 	wa_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
292 
293 	/* WaDisableDopClockGating:bdw
294 	 *
295 	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
296 	 * to disable EUTC clock gating.
297 	 */
298 	wa_masked_en(wal, GEN7_ROW_CHICKEN2,
299 		     DOP_CLOCK_GATING_DISABLE);
300 
301 	wa_masked_en(wal, HALF_SLICE_CHICKEN3,
302 		     GEN8_SAMPLER_POWER_BYPASS_DIS);
303 
304 	wa_masked_en(wal, HDC_CHICKEN0,
305 		     /* WaForceContextSaveRestoreNonCoherent:bdw */
306 		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
307 		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
308 		     (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
309 }
310 
311 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
312 				     struct i915_wa_list *wal)
313 {
314 	gen8_ctx_workarounds_init(engine, wal);
315 
316 	/* WaDisableThreadStallDopClockGating:chv */
317 	wa_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
318 
319 	/* Improve HiZ throughput on CHV. */
320 	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
321 }
322 
323 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
324 				      struct i915_wa_list *wal)
325 {
326 	struct drm_i915_private *i915 = engine->i915;
327 
328 	if (HAS_LLC(i915)) {
329 		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
330 		 *
331 		 * Must match Display Engine. See
332 		 * WaCompressedResourceDisplayNewHashMode.
333 		 */
334 		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
335 			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
336 		wa_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
337 			     GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
338 	}
339 
340 	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
341 	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
342 	wa_masked_en(wal, GEN8_ROW_CHICKEN,
343 		     FLOW_CONTROL_ENABLE |
344 		     PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
345 
346 	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
347 	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
348 	wa_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
349 		     GEN9_ENABLE_YV12_BUGFIX |
350 		     GEN9_ENABLE_GPGPU_PREEMPTION);
351 
352 	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
353 	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
354 	wa_masked_en(wal, CACHE_MODE_1,
355 		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
356 		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
357 
358 	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
359 	wa_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
360 		      GEN9_CCS_TLB_PREFETCH_ENABLE);
361 
362 	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
363 	wa_masked_en(wal, HDC_CHICKEN0,
364 		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
365 		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
366 
367 	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
368 	 * both tied to WaForceContextSaveRestoreNonCoherent
369 	 * in some hsds for skl. We keep the tie for all gen9. The
370 	 * documentation is a bit hazy and so we want to get common behaviour,
371 	 * even though there is no clear evidence we would need both on kbl/bxt.
	 * This area has been a source of system hangs so we play it safe
373 	 * and mimic the skl regardless of what bspec says.
374 	 *
375 	 * Use Force Non-Coherent whenever executing a 3D context. This
376 	 * is a workaround for a possible hang in the unlikely event
377 	 * a TLB invalidation occurs during a PSD flush.
378 	 */
379 
380 	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
381 	wa_masked_en(wal, HDC_CHICKEN0,
382 		     HDC_FORCE_NON_COHERENT);
383 
384 	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
385 	if (IS_SKYLAKE(i915) ||
386 	    IS_KABYLAKE(i915) ||
387 	    IS_COFFEELAKE(i915) ||
388 	    IS_COMETLAKE(i915))
389 		wa_masked_en(wal, HALF_SLICE_CHICKEN3,
390 			     GEN8_SAMPLER_POWER_BYPASS_DIS);
391 
392 	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
393 	wa_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
394 
395 	/*
396 	 * Supporting preemption with fine-granularity requires changes in the
397 	 * batch buffer programming. Since we can't break old userspace, we
	 * need to set our default preemption level to a safe value. Userspace is
399 	 * still able to use more fine-grained preemption levels, since in
400 	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
401 	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
402 	 * not real HW workarounds, but merely a way to start using preemption
403 	 * while maintaining old contract with userspace.
404 	 */
405 
406 	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
407 	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
408 
	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
410 	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
411 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
412 			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
413 
414 	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
415 	if (IS_GEN9_LP(i915))
416 		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
417 }
418 
419 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
420 				struct i915_wa_list *wal)
421 {
422 	struct intel_gt *gt = engine->gt;
423 	u8 vals[3] = { 0, 0, 0 };
424 	unsigned int i;
425 
426 	for (i = 0; i < 3; i++) {
427 		u8 ss;
428 
429 		/*
430 		 * Only consider slices where one, and only one, subslice has 7
431 		 * EUs
432 		 */
433 		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
434 			continue;
435 
436 		/*
437 		 * subslice_7eu[i] != 0 (because of the check above) and
438 		 * ss_max == 4 (maximum number of subslices possible per slice)
439 		 *
440 		 * ->    0 <= ss <= 3;
441 		 */
442 		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
443 		vals[i] = 3 - ss;
444 	}
445 
446 	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
447 		return;
448 
449 	/* Tune IZ hashing. See intel_device_info_runtime_init() */
450 	wa_masked_field_set(wal, GEN7_GT_MODE,
451 			    GEN9_IZ_HASHING_MASK(2) |
452 			    GEN9_IZ_HASHING_MASK(1) |
453 			    GEN9_IZ_HASHING_MASK(0),
454 			    GEN9_IZ_HASHING(2, vals[2]) |
455 			    GEN9_IZ_HASHING(1, vals[1]) |
456 			    GEN9_IZ_HASHING(0, vals[0]));
457 }
458 
459 static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
460 				     struct i915_wa_list *wal)
461 {
462 	gen9_ctx_workarounds_init(engine, wal);
463 	skl_tune_iz_hashing(engine, wal);
464 }
465 
466 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
467 				     struct i915_wa_list *wal)
468 {
469 	gen9_ctx_workarounds_init(engine, wal);
470 
471 	/* WaDisableThreadStallDopClockGating:bxt */
472 	wa_masked_en(wal, GEN8_ROW_CHICKEN,
473 		     STALL_DOP_GATING_DISABLE);
474 
475 	/* WaToEnableHwFixForPushConstHWBug:bxt */
476 	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
477 		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
478 }
479 
480 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
481 				     struct i915_wa_list *wal)
482 {
483 	struct drm_i915_private *i915 = engine->i915;
484 
485 	gen9_ctx_workarounds_init(engine, wal);
486 
487 	/* WaToEnableHwFixForPushConstHWBug:kbl */
488 	if (IS_KBL_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
489 		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
490 			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
491 
492 	/* WaDisableSbeCacheDispatchPortSharing:kbl */
493 	wa_masked_en(wal, GEN7_HALF_SLICE_CHICKEN1,
494 		     GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
495 }
496 
497 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
498 				     struct i915_wa_list *wal)
499 {
500 	gen9_ctx_workarounds_init(engine, wal);
501 
502 	/* WaToEnableHwFixForPushConstHWBug:glk */
503 	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
504 		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
505 }
506 
507 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
508 				     struct i915_wa_list *wal)
509 {
510 	gen9_ctx_workarounds_init(engine, wal);
511 
512 	/* WaToEnableHwFixForPushConstHWBug:cfl */
513 	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
514 		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
515 
516 	/* WaDisableSbeCacheDispatchPortSharing:cfl */
517 	wa_masked_en(wal, GEN7_HALF_SLICE_CHICKEN1,
518 		     GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
519 }
520 
521 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
522 				     struct i915_wa_list *wal)
523 {
524 	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
525 	wa_write(wal,
526 		 GEN8_L3CNTLREG,
527 		 intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
528 		 GEN8_ERRDETBCTRL);
529 
530 	/* WaForceEnableNonCoherent:icl
531 	 * This is not the same workaround as in early Gen9 platforms, where
532 	 * lacking this could cause system hangs, but coherency performance
533 	 * overhead is high and only a few compute workloads really need it
534 	 * (the register is whitelisted in hardware now, so UMDs can opt in
535 	 * for coherency if they have a good reason).
536 	 */
537 	wa_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
538 
539 	/* WaEnableFloatBlendOptimization:icl */
540 	wa_add(wal, GEN10_CACHE_MODE_SS, 0,
541 	       _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
542 	       0 /* write-only, so skip validation */,
543 	       true);
544 
545 	/* WaDisableGPGPUMidThreadPreemption:icl */
546 	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
547 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
548 			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
549 
550 	/* allow headerless messages for preemptible GPGPU context */
551 	wa_masked_en(wal, GEN10_SAMPLER_MODE,
552 		     GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
553 
554 	/* Wa_1604278689:icl,ehl */
555 	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
556 	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
557 			 0, /* write-only register; skip validation */
558 			 0xFFFFFFFF);
559 
560 	/* Wa_1406306137:icl,ehl */
561 	wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
562 }
563 
564 /*
565  * These settings aren't actually workarounds, but general tuning settings that
 * need to be programmed on the dg2 platform.
567  */
568 static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
569 				   struct i915_wa_list *wal)
570 {
571 	wa_write_clr_set(wal, GEN11_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
572 			 REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
573 	wa_add(wal,
574 	       FF_MODE2,
575 	       FF_MODE2_TDS_TIMER_MASK,
576 	       FF_MODE2_TDS_TIMER_128,
577 	       0, false);
578 }
579 
580 /*
581  * These settings aren't actually workarounds, but general tuning settings that
582  * need to be programmed on several platforms.
583  */
584 static void gen12_ctx_gt_tuning_init(struct intel_engine_cs *engine,
585 				     struct i915_wa_list *wal)
586 {
587 	/*
588 	 * Although some platforms refer to it as Wa_1604555607, we need to
589 	 * program it even on those that don't explicitly list that
590 	 * workaround.
591 	 *
592 	 * Note that the programming of this register is further modified
593 	 * according to the FF_MODE2 guidance given by Wa_1608008084:gen12.
594 	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
595 	 * value when read. The default value for this register is zero for all
	 * fields and there are no bit masks. So instead of doing an RMW we
	 * should just write the TDS timer value. For the same reason read
598 	 * verification is ignored.
599 	 */
600 	wa_add(wal,
601 	       FF_MODE2,
602 	       FF_MODE2_TDS_TIMER_MASK,
603 	       FF_MODE2_TDS_TIMER_128,
604 	       0, false);
605 }
606 
607 static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
608 				       struct i915_wa_list *wal)
609 {
610 	gen12_ctx_gt_tuning_init(engine, wal);
611 
612 	/*
613 	 * Wa_1409142259:tgl,dg1,adl-p
614 	 * Wa_1409347922:tgl,dg1,adl-p
615 	 * Wa_1409252684:tgl,dg1,adl-p
616 	 * Wa_1409217633:tgl,dg1,adl-p
617 	 * Wa_1409207793:tgl,dg1,adl-p
618 	 * Wa_1409178076:tgl,dg1,adl-p
619 	 * Wa_1408979724:tgl,dg1,adl-p
620 	 * Wa_14010443199:tgl,rkl,dg1,adl-p
621 	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
622 	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
623 	 */
624 	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
625 		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
626 
627 	/* WaDisableGPGPUMidThreadPreemption:gen12 */
628 	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
629 			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
630 			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
631 
632 	/*
633 	 * Wa_16011163337
634 	 *
635 	 * Like in gen12_ctx_gt_tuning_init(), read verification is ignored due
636 	 * to Wa_1608008084.
637 	 */
638 	wa_add(wal,
639 	       FF_MODE2,
640 	       FF_MODE2_GS_TIMER_MASK,
641 	       FF_MODE2_GS_TIMER_224,
642 	       0, false);
643 }
644 
645 static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
646 				     struct i915_wa_list *wal)
647 {
648 	gen12_ctx_workarounds_init(engine, wal);
649 
650 	/* Wa_1409044764 */
651 	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
652 		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
653 
654 	/* Wa_22010493298 */
655 	wa_masked_en(wal, HIZ_CHICKEN,
656 		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
657 }
658 
659 static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
660 				     struct i915_wa_list *wal)
661 {
662 	dg2_ctx_gt_tuning_init(engine, wal);
663 
664 	/* Wa_16011186671:dg2_g11 */
665 	if (IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) {
666 		wa_masked_dis(wal, VFLSKPD, DIS_MULT_MISS_RD_SQUASH);
667 		wa_masked_en(wal, VFLSKPD, DIS_OVER_FETCH_CACHE);
668 	}
669 
670 	if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) {
671 		/* Wa_14010469329:dg2_g10 */
672 		wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
673 			     XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE);
674 
675 		/*
676 		 * Wa_22010465075:dg2_g10
677 		 * Wa_22010613112:dg2_g10
678 		 * Wa_14010698770:dg2_g10
679 		 */
680 		wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
681 			     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
682 	}
683 
684 	/* Wa_16013271637:dg2 */
685 	wa_masked_en(wal, SLICE_COMMON_ECO_CHICKEN1,
686 		     MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
687 
688 	/* Wa_14014947963:dg2 */
689 	if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_B0, STEP_FOREVER) ||
690 		IS_DG2_G11(engine->i915) || IS_DG2_G12(engine->i915))
691 		wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
692 }
693 
694 static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
695 					 struct i915_wa_list *wal)
696 {
697 	/*
698 	 * This is a "fake" workaround defined by software to ensure we
699 	 * maintain reliable, backward-compatible behavior for userspace with
700 	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
701 	 *
702 	 * The per-context setting of MI_MODE[12] determines whether the bits
703 	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
704 	 * in the traditional manner or whether they should instead use a new
705 	 * tgl+ meaning that breaks backward compatibility, but allows nesting
706 	 * into 3rd-level batchbuffers.  When this new capability was first
707 	 * added in TGL, it remained off by default unless a context
708 	 * intentionally opted in to the new behavior.  However Xe_HPG now
709 	 * flips this on by default and requires that we explicitly opt out if
710 	 * we don't want the new behavior.
711 	 *
712 	 * From a SW perspective, we want to maintain the backward-compatible
713 	 * behavior for userspace, so we'll apply a fake workaround to set it
714 	 * back to the legacy behavior on platforms where the hardware default
715 	 * is to break compatibility.  At the moment there is no Linux
	 * userspace that utilizes third-level batchbuffers, so this avoids
	 * userspace needing to make any changes; using the legacy
	 * meaning is the correct thing to do.  If/when we have userspace
719 	 * consumers that want to utilize third-level batch nesting, we can
720 	 * provide a context parameter to allow them to opt-in.
721 	 */
722 	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
723 }
724 
725 static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
726 				   struct i915_wa_list *wal)
727 {
728 	u8 mocs;
729 
730 	/*
	 * Some blitter commands do not have a field for MOCS, so those
	 * commands will use the MOCS index pointed to by BLIT_CCTL.
	 * BLIT_CCTL needs to be programmed to un-cached.
734 	 */
735 	if (engine->class == COPY_ENGINE_CLASS) {
736 		mocs = engine->gt->mocs.uc_index;
737 		wa_write_clr_set(wal,
738 				 BLIT_CCTL(engine->mmio_base),
739 				 BLIT_CCTL_MASK,
740 				 BLIT_CCTL_MOCS(mocs, mocs));
741 	}
742 }
743 
744 /*
 * gen12_ctx_gt_fake_wa_init() doesn't program an official workaround
 * defined by the hardware team, but rather general context registers.
 * Adding this context register programming to the context workaround
 * list allows us to use the wa framework for proper application and
 * validation.
749  */
750 static void
751 gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
752 			  struct i915_wa_list *wal)
753 {
754 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
755 		fakewa_disable_nestedbb_mode(engine, wal);
756 
757 	gen12_ctx_gt_mocs_init(engine, wal);
758 }
759 
760 static void
761 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
762 			   struct i915_wa_list *wal,
763 			   const char *name)
764 {
765 	struct drm_i915_private *i915 = engine->i915;
766 
767 	wa_init_start(wal, name, engine->name);
768 
769 	/* Applies to all engines */
770 	/*
	 * Fake workarounds are not actual workarounds, but the
	 * programming of context registers using the workaround framework.
773 	 */
774 	if (GRAPHICS_VER(i915) >= 12)
775 		gen12_ctx_gt_fake_wa_init(engine, wal);
776 
777 	if (engine->class != RENDER_CLASS)
778 		goto done;
779 
780 	if (IS_PONTEVECCHIO(i915))
781 		; /* noop; none at this time */
782 	else if (IS_DG2(i915))
783 		dg2_ctx_workarounds_init(engine, wal);
784 	else if (IS_XEHPSDV(i915))
785 		; /* noop; none at this time */
786 	else if (IS_DG1(i915))
787 		dg1_ctx_workarounds_init(engine, wal);
788 	else if (GRAPHICS_VER(i915) == 12)
789 		gen12_ctx_workarounds_init(engine, wal);
790 	else if (GRAPHICS_VER(i915) == 11)
791 		icl_ctx_workarounds_init(engine, wal);
792 	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
793 		cfl_ctx_workarounds_init(engine, wal);
794 	else if (IS_GEMINILAKE(i915))
795 		glk_ctx_workarounds_init(engine, wal);
796 	else if (IS_KABYLAKE(i915))
797 		kbl_ctx_workarounds_init(engine, wal);
798 	else if (IS_BROXTON(i915))
799 		bxt_ctx_workarounds_init(engine, wal);
800 	else if (IS_SKYLAKE(i915))
801 		skl_ctx_workarounds_init(engine, wal);
802 	else if (IS_CHERRYVIEW(i915))
803 		chv_ctx_workarounds_init(engine, wal);
804 	else if (IS_BROADWELL(i915))
805 		bdw_ctx_workarounds_init(engine, wal);
806 	else if (GRAPHICS_VER(i915) == 7)
807 		gen7_ctx_workarounds_init(engine, wal);
808 	else if (GRAPHICS_VER(i915) == 6)
809 		gen6_ctx_workarounds_init(engine, wal);
810 	else if (GRAPHICS_VER(i915) < 8)
811 		;
812 	else
813 		MISSING_CASE(GRAPHICS_VER(i915));
814 
815 done:
816 	wa_init_finish(wal);
817 }
818 
819 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
820 {
821 	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
822 }
823 
824 int intel_engine_emit_ctx_wa(struct i915_request *rq)
825 {
826 	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
827 	struct i915_wa *wa;
828 	unsigned int i;
829 	u32 *cs;
830 	int ret;
831 
832 	if (wal->count == 0)
833 		return 0;
834 
835 	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
836 	if (ret)
837 		return ret;
838 
839 	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
840 	if (IS_ERR(cs))
841 		return PTR_ERR(cs);
842 
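	/*
	 * One MI_LOAD_REGISTER_IMM header, a (reg, value) pair per workaround
	 * and a trailing MI_NOOP: exactly the 2 * count + 2 dwords reserved
	 * above.
	 */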
843 	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
844 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
845 		*cs++ = i915_mmio_reg_offset(wa->reg);
846 		*cs++ = wa->set;
847 	}
848 	*cs++ = MI_NOOP;
849 
850 	intel_ring_advance(rq, cs);
851 
852 	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
853 	if (ret)
854 		return ret;
855 
856 	return 0;
857 }
858 
859 static void
860 gen4_gt_workarounds_init(struct intel_gt *gt,
861 			 struct i915_wa_list *wal)
862 {
863 	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
864 	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
865 }
866 
867 static void
868 g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
869 {
870 	gen4_gt_workarounds_init(gt, wal);
871 
872 	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
873 	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
874 }
875 
876 static void
877 ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
878 {
879 	g4x_gt_workarounds_init(gt, wal);
880 
881 	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
882 }
883 
884 static void
885 snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
886 {
887 }
888 
889 static void
890 ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
891 {
892 	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
893 	wa_masked_dis(wal,
894 		      GEN7_COMMON_SLICE_CHICKEN1,
895 		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
896 
897 	/* WaApplyL3ControlAndL3ChickenMode:ivb */
898 	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
899 	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
900 
901 	/* WaForceL3Serialization:ivb */
902 	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
903 }
904 
905 static void
906 vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
907 {
908 	/* WaForceL3Serialization:vlv */
909 	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
910 
911 	/*
912 	 * WaIncreaseL3CreditsForVLVB0:vlv
913 	 * This is the hardware default actually.
914 	 */
915 	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
916 }
917 
918 static void
919 hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
920 {
921 	/* L3 caching of data atomics doesn't work -- disable it. */
922 	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
923 
924 	wa_add(wal,
925 	       HSW_ROW_CHICKEN3, 0,
926 	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
927 	       0 /* XXX does this reg exist? */, true);
928 
929 	/* WaVSRefCountFullforceMissDisable:hsw */
930 	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
931 }
932 
933 static void
934 gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
935 {
936 	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
937 	unsigned int slice, subslice;
938 	u32 mcr, mcr_mask;
939 
940 	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
941 
942 	/*
943 	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
944 	 * Before any MMIO read into slice/subslice specific registers, MCR
945 	 * packet control register needs to be programmed to point to any
946 	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
	 * This means each subsequent MMIO read will be forwarded to a
	 * specific s/ss combination, but this is OK since these registers
	 * are consistent across s/ss in almost all cases. On the rare
	 * occasions, such as INSTDONE, where this value is dependent
951 	 * on s/ss combo, the read should be done with read_subslice_reg.
952 	 */
953 	slice = ffs(sseu->slice_mask) - 1;
954 	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
955 	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
956 	GEM_BUG_ON(!subslice);
957 	subslice--;
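	/*
	 * As a hypothetical example: slice_mask 0x1 with a subslice mask of
	 * 0x6 for slice 0 steers to slice 0 / subslice 1, the lowest enabled
	 * s/ss pair (the real masks depend on fusing).
	 */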
958 
959 	/*
960 	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
961 	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
962 	 */
963 	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
964 	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
965 
966 	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
967 
968 	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
969 }
970 
971 static void
972 gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
973 {
974 	struct drm_i915_private *i915 = gt->i915;
975 
976 	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
977 	gen9_wa_init_mcr(i915, wal);
978 
979 	/* WaDisableKillLogic:bxt,skl,kbl */
980 	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
981 		wa_write_or(wal,
982 			    GAM_ECOCHK,
983 			    ECOCHK_DIS_TLB);
984 
985 	if (HAS_LLC(i915)) {
986 		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
987 		 *
988 		 * Must match Display Engine. See
989 		 * WaCompressedResourceDisplayNewHashMode.
990 		 */
991 		wa_write_or(wal,
992 			    MMCD_MISC_CTRL,
993 			    MMCD_PCLA | MMCD_HOTSPOT_EN);
994 	}
995 
996 	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
997 	wa_write_or(wal,
998 		    GAM_ECOCHK,
999 		    BDW_DISABLE_HDC_INVALIDATION);
1000 }
1001 
1002 static void
1003 skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1004 {
1005 	gen9_gt_workarounds_init(gt, wal);
1006 
1007 	/* WaDisableGafsUnitClkGating:skl */
1008 	wa_write_or(wal,
1009 		    GEN7_UCGCTL4,
1010 		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1011 
1012 	/* WaInPlaceDecompressionHang:skl */
1013 	if (IS_SKL_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1014 		wa_write_or(wal,
1015 			    GEN9_GAMT_ECO_REG_RW_IA,
1016 			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1017 }
1018 
1019 static void
1020 kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1021 {
1022 	gen9_gt_workarounds_init(gt, wal);
1023 
1024 	/* WaDisableDynamicCreditSharing:kbl */
1025 	if (IS_KBL_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1026 		wa_write_or(wal,
1027 			    GAMT_CHKN_BIT_REG,
1028 			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1029 
1030 	/* WaDisableGafsUnitClkGating:kbl */
1031 	wa_write_or(wal,
1032 		    GEN7_UCGCTL4,
1033 		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1034 
1035 	/* WaInPlaceDecompressionHang:kbl */
1036 	wa_write_or(wal,
1037 		    GEN9_GAMT_ECO_REG_RW_IA,
1038 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1039 }
1040 
1041 static void
1042 glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1043 {
1044 	gen9_gt_workarounds_init(gt, wal);
1045 }
1046 
1047 static void
1048 cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1049 {
1050 	gen9_gt_workarounds_init(gt, wal);
1051 
1052 	/* WaDisableGafsUnitClkGating:cfl */
1053 	wa_write_or(wal,
1054 		    GEN7_UCGCTL4,
1055 		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1056 
1057 	/* WaInPlaceDecompressionHang:cfl */
1058 	wa_write_or(wal,
1059 		    GEN9_GAMT_ECO_REG_RW_IA,
1060 		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1061 }
1062 
1063 static void __set_mcr_steering(struct i915_wa_list *wal,
1064 			       i915_reg_t steering_reg,
1065 			       unsigned int slice, unsigned int subslice)
1066 {
1067 	u32 mcr, mcr_mask;
1068 
1069 	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1070 	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1071 
1072 	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1073 }
1074 
1075 static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1076 			 unsigned int slice, unsigned int subslice)
1077 {
1078 	struct drm_printer p = drm_debug_printer("MCR Steering:");
1079 
1080 	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1081 
1082 	gt->default_steering.groupid = slice;
1083 	gt->default_steering.instanceid = subslice;
1084 
1085 	if (drm_debug_enabled(DRM_UT_DRIVER))
1086 		intel_gt_mcr_report_steering(&p, gt, false);
1087 }
1088 
1089 static void
1090 icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1091 {
1092 	const struct sseu_dev_info *sseu = &gt->info.sseu;
1093 	unsigned int subslice;
1094 
1095 	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1096 	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1097 
1098 	/*
1099 	 * Although a platform may have subslices, we need to always steer
1100 	 * reads to the lowest instance that isn't fused off.  When Render
1101 	 * Power Gating is enabled, grabbing forcewake will only power up a
1102 	 * single subslice (the "minconfig") if there isn't a real workload
1103 	 * that needs to be run; this means that if we steer register reads to
1104 	 * one of the higher subslices, we run the risk of reading back 0's or
1105 	 * random garbage.
1106 	 */
1107 	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1108 
1109 	/*
1110 	 * If the subslice we picked above also steers us to a valid L3 bank,
1111 	 * then we can just rely on the default steering and won't need to
1112 	 * worry about explicitly re-steering L3BANK reads later.
1113 	 */
1114 	if (gt->info.l3bank_mask & BIT(subslice))
1115 		gt->steering_table[L3BANK] = NULL;
1116 
1117 	__add_mcr_wa(gt, wal, 0, subslice);
1118 }
1119 
1120 static void
1121 xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1122 {
1123 	const struct sseu_dev_info *sseu = &gt->info.sseu;
1124 	unsigned long slice, subslice = 0, slice_mask = 0;
1125 	u32 lncf_mask = 0;
1126 	int i;
1127 
1128 	/*
1129 	 * On Xe_HP the steering increases in complexity. There are now several
1130 	 * more units that require steering and we're not guaranteed to be able
1131 	 * to find a common setting for all of them. These are:
1132 	 * - GSLICE (fusable)
1133 	 * - DSS (sub-unit within gslice; fusable)
1134 	 * - L3 Bank (fusable)
1135 	 * - MSLICE (fusable)
1136 	 * - LNCF (sub-unit within mslice; always present if mslice is present)
1137 	 *
1138 	 * We'll do our default/implicit steering based on GSLICE (in the
1139 	 * sliceid field) and DSS (in the subsliceid field).  If we can
1140 	 * find overlap between the valid MSLICE and/or LNCF values with
1141 	 * a suitable GSLICE, then we can just re-use the default value and
	 * skip any explicit steering at runtime.
1143 	 *
1144 	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1145 	 * a valid sliceid value.  DSS steering is the only type of steering
1146 	 * that utilizes the 'subsliceid' bits.
1147 	 *
1148 	 * Also note that, even though the steering domain is called "GSlice"
1149 	 * and it is encoded in the register using the gslice format, the spec
1150 	 * says that the combined (geometry | compute) fuse should be used to
1151 	 * select the steering.
1152 	 */
1153 
1154 	/* Find the potential gslice candidates */
1155 	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1156 						       GEN_DSS_PER_GSLICE);
1157 
1158 	/*
1159 	 * Find the potential LNCF candidates.  Either LNCF within a valid
1160 	 * mslice is fine.
1161 	 */
1162 	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1163 		lncf_mask |= (0x3 << (i * 2));
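	/*
	 * E.g. a hypothetical mslice_mask of 0b101 yields lncf_mask 0b110011:
	 * both LNCF instances under each present mslice are valid candidates.
	 */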
1164 
1165 	/*
1166 	 * Are there any sliceid values that work for both GSLICE and LNCF
1167 	 * steering?
1168 	 */
1169 	if (slice_mask & lncf_mask) {
1170 		slice_mask &= lncf_mask;
1171 		gt->steering_table[LNCF] = NULL;
1172 	}
1173 
1174 	/* How about sliceid values that also work for MSLICE steering? */
1175 	if (slice_mask & gt->info.mslice_mask) {
1176 		slice_mask &= gt->info.mslice_mask;
1177 		gt->steering_table[MSLICE] = NULL;
1178 	}
1179 
1180 	slice = __ffs(slice_mask);
1181 	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1182 		GEN_DSS_PER_GSLICE;
1183 
1184 	__add_mcr_wa(gt, wal, slice, subslice);
1185 
1186 	/*
1187 	 * SQIDI ranges are special because they use different steering
1188 	 * registers than everything else we work with.  On XeHP SDV and
1189 	 * DG2-G10, any value in the steering registers will work fine since
1190 	 * all instances are present, but DG2-G11 only has SQIDI instances at
	 * IDs 2 and 3, so we need to steer to one of those.  For simplicity
1192 	 * we'll just steer to a hardcoded "2" since that value will work
1193 	 * everywhere.
1194 	 */
1195 	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1196 	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1197 }
1198 
1199 static void
1200 pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1201 {
1202 	unsigned int dss;
1203 
1204 	/*
1205 	 * Setup implicit steering for COMPUTE and DSS ranges to the first
1206 	 * non-fused-off DSS.  All other types of MCR registers will be
1207 	 * explicitly steered.
1208 	 */
1209 	dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
1210 	__add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1211 }
1212 
1213 static void
1214 icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1215 {
1216 	struct drm_i915_private *i915 = gt->i915;
1217 
1218 	icl_wa_init_mcr(gt, wal);
1219 
1220 	/* WaModifyGamTlbPartitioning:icl */
1221 	wa_write_clr_set(wal,
1222 			 GEN11_GACB_PERF_CTRL,
1223 			 GEN11_HASH_CTRL_MASK,
1224 			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1225 
1226 	/* Wa_1405766107:icl
1227 	 * Formerly known as WaCL2SFHalfMaxAlloc
1228 	 */
1229 	wa_write_or(wal,
1230 		    GEN11_LSN_UNSLCVC,
1231 		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1232 		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1233 
1234 	/* Wa_220166154:icl
1235 	 * Formerly known as WaDisCtxReload
1236 	 */
1237 	wa_write_or(wal,
1238 		    GEN8_GAMW_ECO_DEV_RW_IA,
1239 		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1240 
1241 	/* Wa_1406463099:icl
1242 	 * Formerly known as WaGamTlbPendError
1243 	 */
1244 	wa_write_or(wal,
1245 		    GAMT_CHKN_BIT_REG,
1246 		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1247 
1248 	/* Wa_1407352427:icl,ehl */
1249 	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1250 		    PSDUNIT_CLKGATE_DIS);
1251 
1252 	/* Wa_1406680159:icl,ehl */
1253 	wa_write_or(wal,
1254 		    SUBSLICE_UNIT_LEVEL_CLKGATE,
1255 		    GWUNIT_CLKGATE_DIS);
1256 
1257 	/* Wa_1607087056:icl,ehl,jsl */
1258 	if (IS_ICELAKE(i915) ||
1259 	    IS_JSL_EHL_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1260 		wa_write_or(wal,
1261 			    SLICE_UNIT_LEVEL_CLKGATE,
1262 			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1263 
1264 	/*
1265 	 * This is not a documented workaround, but rather an optimization
1266 	 * to reduce sampler power.
1267 	 */
1268 	wa_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1269 }
1270 
1271 /*
1272  * Though there are per-engine instances of these registers,
1273  * they retain their value through engine resets and should
1274  * only be provided on the GT workaround list rather than
1275  * the engine-specific workaround list.
1276  */
1277 static void
1278 wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1279 {
1280 	struct intel_engine_cs *engine;
1281 	int id;
1282 
1283 	for_each_engine(engine, gt, id) {
1284 		if (engine->class != VIDEO_DECODE_CLASS ||
1285 		    (engine->instance % 2))
1286 			continue;
1287 
1288 		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1289 			    IECPUNIT_CLKGATE_DIS);
1290 	}
1291 }
1292 
1293 static void
1294 gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1295 {
1296 	icl_wa_init_mcr(gt, wal);
1297 
1298 	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1299 	wa_14011060649(gt, wal);
1300 
1301 	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1302 	wa_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1303 }
1304 
1305 static void
1306 tgl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1307 {
1308 	struct drm_i915_private *i915 = gt->i915;
1309 
1310 	gen12_gt_workarounds_init(gt, wal);
1311 
1312 	/* Wa_1409420604:tgl */
1313 	if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1314 		wa_write_or(wal,
1315 			    SUBSLICE_UNIT_LEVEL_CLKGATE2,
1316 			    CPSSUNIT_CLKGATE_DIS);
1317 
	/* Wa_1607087056:tgl also known as BUG:1409180338 */
1319 	if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1320 		wa_write_or(wal,
1321 			    SLICE_UNIT_LEVEL_CLKGATE,
1322 			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1323 
1324 	/* Wa_1408615072:tgl[a0] */
1325 	if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1326 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1327 			    VSUNIT_CLKGATE_DIS_TGL);
1328 }
1329 
1330 static void
1331 dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1332 {
1333 	struct drm_i915_private *i915 = gt->i915;
1334 
1335 	gen12_gt_workarounds_init(gt, wal);
1336 
1337 	/* Wa_1607087056:dg1 */
1338 	if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1339 		wa_write_or(wal,
1340 			    SLICE_UNIT_LEVEL_CLKGATE,
1341 			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1342 
1343 	/* Wa_1409420604:dg1 */
1344 	if (IS_DG1(i915))
1345 		wa_write_or(wal,
1346 			    SUBSLICE_UNIT_LEVEL_CLKGATE2,
1347 			    CPSSUNIT_CLKGATE_DIS);
1348 
1349 	/* Wa_1408615072:dg1 */
1350 	/* Empirical testing shows this register is unaffected by engine reset. */
1351 	if (IS_DG1(i915))
1352 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1353 			    VSUNIT_CLKGATE_DIS_TGL);
1354 }
1355 
1356 static void
1357 xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1358 {
1359 	struct drm_i915_private *i915 = gt->i915;
1360 
1361 	xehp_init_mcr(gt, wal);
1362 
1363 	/* Wa_1409757795:xehpsdv */
1364 	wa_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1365 
1366 	/* Wa_16011155590:xehpsdv */
1367 	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1368 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1369 			    TSGUNIT_CLKGATE_DIS);
1370 
1371 	/* Wa_14011780169:xehpsdv */
1372 	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1373 		wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1374 			    GAMTLBVDBOX7_CLKGATE_DIS |
1375 			    GAMTLBVDBOX6_CLKGATE_DIS |
1376 			    GAMTLBVDBOX5_CLKGATE_DIS |
1377 			    GAMTLBVDBOX4_CLKGATE_DIS |
1378 			    GAMTLBVDBOX3_CLKGATE_DIS |
1379 			    GAMTLBVDBOX2_CLKGATE_DIS |
1380 			    GAMTLBVDBOX1_CLKGATE_DIS |
1381 			    GAMTLBVDBOX0_CLKGATE_DIS |
1382 			    GAMTLBKCR_CLKGATE_DIS |
1383 			    GAMTLBGUC_CLKGATE_DIS |
1384 			    GAMTLBBLT_CLKGATE_DIS);
1385 		wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1386 			    GAMTLBGFXA1_CLKGATE_DIS |
1387 			    GAMTLBCOMPA0_CLKGATE_DIS |
1388 			    GAMTLBCOMPA1_CLKGATE_DIS |
1389 			    GAMTLBCOMPB0_CLKGATE_DIS |
1390 			    GAMTLBCOMPB1_CLKGATE_DIS |
1391 			    GAMTLBCOMPC0_CLKGATE_DIS |
1392 			    GAMTLBCOMPC1_CLKGATE_DIS |
1393 			    GAMTLBCOMPD0_CLKGATE_DIS |
1394 			    GAMTLBCOMPD1_CLKGATE_DIS |
1395 			    GAMTLBMERT_CLKGATE_DIS   |
1396 			    GAMTLBVEBOX3_CLKGATE_DIS |
1397 			    GAMTLBVEBOX2_CLKGATE_DIS |
1398 			    GAMTLBVEBOX1_CLKGATE_DIS |
1399 			    GAMTLBVEBOX0_CLKGATE_DIS);
1400 	}
1401 
1402 	/* Wa_16012725990:xehpsdv */
1403 	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1404 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1405 
1406 	/* Wa_14011060649:xehpsdv */
1407 	wa_14011060649(gt, wal);
1408 }
1409 
1410 static void
1411 dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1412 {
1413 	struct intel_engine_cs *engine;
1414 	int id;
1415 
1416 	xehp_init_mcr(gt, wal);
1417 
1418 	/* Wa_14011060649:dg2 */
1419 	wa_14011060649(gt, wal);
1420 
1421 	/*
1422 	 * Although there are per-engine instances of these registers,
1423 	 * they technically exist outside the engine itself and are not
1424 	 * impacted by engine resets.  Furthermore, they're part of the
1425 	 * GuC blacklist so trying to treat them as engine workarounds
1426 	 * will result in GuC initialization failure and a wedged GPU.
1427 	 */
1428 	for_each_engine(engine, gt, id) {
1429 		if (engine->class != VIDEO_DECODE_CLASS)
1430 			continue;
1431 
1432 		/* Wa_16010515920:dg2_g10 */
1433 		if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0))
1434 			wa_write_or(wal, VDBOX_CGCTL3F18(engine->mmio_base),
1435 				    ALNUNIT_CLKGATE_DIS);
1436 	}
1437 
1438 	if (IS_DG2_G10(gt->i915)) {
1439 		/* Wa_22010523718:dg2 */
1440 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1441 			    CG3DDISCFEG_CLKGATE_DIS);
1442 
1443 		/* Wa_14011006942:dg2 */
1444 		wa_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE,
1445 			    DSS_ROUTER_CLKGATE_DIS);
1446 	}
1447 
1448 	if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0)) {
1449 		/* Wa_14010948348:dg2_g10 */
1450 		wa_write_or(wal, UNSLCGCTL9430, MSQDUNIT_CLKGATE_DIS);
1451 
1452 		/* Wa_14011037102:dg2_g10 */
1453 		wa_write_or(wal, UNSLCGCTL9444, LTCDD_CLKGATE_DIS);
1454 
1455 		/* Wa_14011371254:dg2_g10 */
1456 		wa_write_or(wal, SLICE_UNIT_LEVEL_CLKGATE, NODEDSS_CLKGATE_DIS);
1457 
1458 		/* Wa_14011431319:dg2_g10 */
1459 		wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1460 			    GAMTLBVDBOX7_CLKGATE_DIS |
1461 			    GAMTLBVDBOX6_CLKGATE_DIS |
1462 			    GAMTLBVDBOX5_CLKGATE_DIS |
1463 			    GAMTLBVDBOX4_CLKGATE_DIS |
1464 			    GAMTLBVDBOX3_CLKGATE_DIS |
1465 			    GAMTLBVDBOX2_CLKGATE_DIS |
1466 			    GAMTLBVDBOX1_CLKGATE_DIS |
1467 			    GAMTLBVDBOX0_CLKGATE_DIS |
1468 			    GAMTLBKCR_CLKGATE_DIS |
1469 			    GAMTLBGUC_CLKGATE_DIS |
1470 			    GAMTLBBLT_CLKGATE_DIS);
1471 		wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1472 			    GAMTLBGFXA1_CLKGATE_DIS |
1473 			    GAMTLBCOMPA0_CLKGATE_DIS |
1474 			    GAMTLBCOMPA1_CLKGATE_DIS |
1475 			    GAMTLBCOMPB0_CLKGATE_DIS |
1476 			    GAMTLBCOMPB1_CLKGATE_DIS |
1477 			    GAMTLBCOMPC0_CLKGATE_DIS |
1478 			    GAMTLBCOMPC1_CLKGATE_DIS |
1479 			    GAMTLBCOMPD0_CLKGATE_DIS |
1480 			    GAMTLBCOMPD1_CLKGATE_DIS |
1481 			    GAMTLBMERT_CLKGATE_DIS   |
1482 			    GAMTLBVEBOX3_CLKGATE_DIS |
1483 			    GAMTLBVEBOX2_CLKGATE_DIS |
1484 			    GAMTLBVEBOX1_CLKGATE_DIS |
1485 			    GAMTLBVEBOX0_CLKGATE_DIS);
1486 
1487 		/* Wa_14010569222:dg2_g10 */
1488 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1489 			    GAMEDIA_CLKGATE_DIS);
1490 
1491 		/* Wa_14011028019:dg2_g10 */
1492 		wa_write_or(wal, SSMCGCTL9530, RTFUNIT_CLKGATE_DIS);
1493 	}
1494 
1495 	/* Wa_14014830051:dg2 */
1496 	wa_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1497 
1498 	/*
1499 	 * The following are not actually "workarounds" but rather
1500 	 * recommended tuning settings documented in the bspec's
1501 	 * performance guide section.
1502 	 */
1503 	wa_write_or(wal, GEN12_SQCM, EN_32B_ACCESS);
1504 
1505 	/* Wa_14015795083 */
1506 	wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1507 }
1508 
1509 static void
1510 pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1511 {
1512 	pvc_init_mcr(gt, wal);
1513 
1514 	/* Wa_14015795083 */
1515 	wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1516 }
1517 
1518 static void
1519 gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1520 {
1521 	struct drm_i915_private *i915 = gt->i915;
1522 
1523 	if (IS_PONTEVECCHIO(i915))
1524 		pvc_gt_workarounds_init(gt, wal);
1525 	else if (IS_DG2(i915))
1526 		dg2_gt_workarounds_init(gt, wal);
1527 	else if (IS_XEHPSDV(i915))
1528 		xehpsdv_gt_workarounds_init(gt, wal);
1529 	else if (IS_DG1(i915))
1530 		dg1_gt_workarounds_init(gt, wal);
1531 	else if (IS_TIGERLAKE(i915))
1532 		tgl_gt_workarounds_init(gt, wal);
1533 	else if (GRAPHICS_VER(i915) == 12)
1534 		gen12_gt_workarounds_init(gt, wal);
1535 	else if (GRAPHICS_VER(i915) == 11)
1536 		icl_gt_workarounds_init(gt, wal);
1537 	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1538 		cfl_gt_workarounds_init(gt, wal);
1539 	else if (IS_GEMINILAKE(i915))
1540 		glk_gt_workarounds_init(gt, wal);
1541 	else if (IS_KABYLAKE(i915))
1542 		kbl_gt_workarounds_init(gt, wal);
1543 	else if (IS_BROXTON(i915))
1544 		gen9_gt_workarounds_init(gt, wal);
1545 	else if (IS_SKYLAKE(i915))
1546 		skl_gt_workarounds_init(gt, wal);
1547 	else if (IS_HASWELL(i915))
1548 		hsw_gt_workarounds_init(gt, wal);
1549 	else if (IS_VALLEYVIEW(i915))
1550 		vlv_gt_workarounds_init(gt, wal);
1551 	else if (IS_IVYBRIDGE(i915))
1552 		ivb_gt_workarounds_init(gt, wal);
1553 	else if (GRAPHICS_VER(i915) == 6)
1554 		snb_gt_workarounds_init(gt, wal);
1555 	else if (GRAPHICS_VER(i915) == 5)
1556 		ilk_gt_workarounds_init(gt, wal);
1557 	else if (IS_G4X(i915))
1558 		g4x_gt_workarounds_init(gt, wal);
1559 	else if (GRAPHICS_VER(i915) == 4)
1560 		gen4_gt_workarounds_init(gt, wal);
1561 	else if (GRAPHICS_VER(i915) <= 8)
1562 		;
1563 	else
1564 		MISSING_CASE(GRAPHICS_VER(i915));
1565 }
1566 
1567 void intel_gt_init_workarounds(struct intel_gt *gt)
1568 {
1569 	struct i915_wa_list *wal = &gt->wa_list;
1570 
1571 	wa_init_start(wal, "GT", "global");
1572 	gt_init_workarounds(gt, wal);
1573 	wa_init_finish(wal);
1574 }
1575 
1576 static enum forcewake_domains
1577 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1578 {
1579 	enum forcewake_domains fw = 0;
1580 	struct i915_wa *wa;
1581 	unsigned int i;
1582 
1583 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1584 		fw |= intel_uncore_forcewake_for_reg(uncore,
1585 						     wa->reg,
1586 						     FW_REG_READ |
1587 						     FW_REG_WRITE);
1588 
1589 	return fw;
1590 }
1591 
1592 static bool
1593 wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
1594 {
1595 	if ((cur ^ wa->set) & wa->read) {
1596 		DRM_ERROR("%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1597 			  name, from, i915_mmio_reg_offset(wa->reg),
1598 			  cur, cur & wa->read, wa->set & wa->read);
1599 
1600 		return false;
1601 	}
1602 
1603 	return true;
1604 }
1605 
1606 static void
1607 wa_list_apply(struct intel_gt *gt, const struct i915_wa_list *wal)
1608 {
1609 	struct intel_uncore *uncore = gt->uncore;
1610 	enum forcewake_domains fw;
1611 	unsigned long flags;
1612 	struct i915_wa *wa;
1613 	unsigned int i;
1614 
1615 	if (!wal->count)
1616 		return;
1617 
1618 	fw = wal_get_fw_for_rmw(uncore, wal);
1619 
1620 	spin_lock_irqsave(&uncore->lock, flags);
1621 	intel_uncore_forcewake_get__locked(uncore, fw);
1622 
1623 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1624 		u32 val, old = 0;
1625 
1626 		/* open-coded rmw due to steering */
1627 		old = wa->clr ? intel_gt_mcr_read_any_fw(gt, wa->reg) : 0;
1628 		val = (old & ~wa->clr) | wa->set;
1629 		if (val != old || !wa->clr)
1630 			intel_uncore_write_fw(uncore, wa->reg, val);
1631 
1632 		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1633 			wa_verify(wa, intel_gt_mcr_read_any_fw(gt, wa->reg),
1634 				  wal->name, "application");
1635 	}
1636 
1637 	intel_uncore_forcewake_put__locked(uncore, fw);
1638 	spin_unlock_irqrestore(&uncore->lock, flags);
1639 }
1640 
1641 void intel_gt_apply_workarounds(struct intel_gt *gt)
1642 {
1643 	wa_list_apply(gt, &gt->wa_list);
1644 }
1645 
1646 static bool wa_list_verify(struct intel_gt *gt,
1647 			   const struct i915_wa_list *wal,
1648 			   const char *from)
1649 {
1650 	struct intel_uncore *uncore = gt->uncore;
1651 	struct i915_wa *wa;
1652 	enum forcewake_domains fw;
1653 	unsigned long flags;
1654 	unsigned int i;
1655 	bool ok = true;
1656 
1657 	fw = wal_get_fw_for_rmw(uncore, wal);
1658 
1659 	spin_lock_irqsave(&uncore->lock, flags);
1660 	intel_uncore_forcewake_get__locked(uncore, fw);
1661 
1662 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1663 		ok &= wa_verify(wa,
1664 				intel_gt_mcr_read_any_fw(gt, wa->reg),
1665 				wal->name, from);
1666 
1667 	intel_uncore_forcewake_put__locked(uncore, fw);
1668 	spin_unlock_irqrestore(&uncore->lock, flags);
1669 
1670 	return ok;
1671 }
1672 
1673 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1674 {
1675 	return wa_list_verify(gt, &gt->wa_list, from);
1676 }
1677 
1678 __maybe_unused
1679 static bool is_nonpriv_flags_valid(u32 flags)
1680 {
1681 	/* Check only valid flag bits are set */
1682 	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1683 		return false;
1684 
1685 	/* NB: Only 3 out of 4 enum values are valid for access field */
1686 	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1687 	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1688 		return false;
1689 
1690 	return true;
1691 }
1692 
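/*
 * Add a register to the engine's RING_FORCE_TO_NONPRIV whitelist. The
 * access/range flags are folded into the low bits of the register
 * offset, and the list length is capped at RING_MAX_NONPRIV_SLOTS.
 */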
1693 static void
1694 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1695 {
1696 	struct i915_wa wa = {
1697 		.reg = reg
1698 	};
1699 
1700 	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1701 		return;
1702 
1703 	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1704 		return;
1705 
1706 	wa.reg.reg |= flags;
1707 	_wa_add(wal, &wa);
1708 }
1709 
1710 static void
1711 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1712 {
1713 	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1714 }
1715 
1716 static void gen9_whitelist_build(struct i915_wa_list *w)
1717 {
1718 	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1719 	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1720 
1721 	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1722 	whitelist_reg(w, GEN8_CS_CHICKEN1);
1723 
1724 	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1725 	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1726 
1727 	/* WaSendPushConstantsFromMMIO:skl,bxt */
1728 	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1729 }
1730 
1731 static void skl_whitelist_build(struct intel_engine_cs *engine)
1732 {
1733 	struct i915_wa_list *w = &engine->whitelist;
1734 
1735 	if (engine->class != RENDER_CLASS)
1736 		return;
1737 
1738 	gen9_whitelist_build(w);
1739 
1740 	/* WaDisableLSQCROPERFforOCL:skl */
1741 	whitelist_reg(w, GEN8_L3SQCREG4);
1742 }
1743 
1744 static void bxt_whitelist_build(struct intel_engine_cs *engine)
1745 {
1746 	if (engine->class != RENDER_CLASS)
1747 		return;
1748 
1749 	gen9_whitelist_build(&engine->whitelist);
1750 }
1751 
1752 static void kbl_whitelist_build(struct intel_engine_cs *engine)
1753 {
1754 	struct i915_wa_list *w = &engine->whitelist;
1755 
1756 	if (engine->class != RENDER_CLASS)
1757 		return;
1758 
1759 	gen9_whitelist_build(w);
1760 
1761 	/* WaDisableLSQCROPERFforOCL:kbl */
1762 	whitelist_reg(w, GEN8_L3SQCREG4);
1763 }
1764 
1765 static void glk_whitelist_build(struct intel_engine_cs *engine)
1766 {
1767 	struct i915_wa_list *w = &engine->whitelist;
1768 
1769 	if (engine->class != RENDER_CLASS)
1770 		return;
1771 
1772 	gen9_whitelist_build(w);
1773 
1774 	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1775 	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1776 }
1777 
1778 static void cfl_whitelist_build(struct intel_engine_cs *engine)
1779 {
1780 	struct i915_wa_list *w = &engine->whitelist;
1781 
1782 	if (engine->class != RENDER_CLASS)
1783 		return;
1784 
1785 	gen9_whitelist_build(w);
1786 
1787 	/*
1788 	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1789 	 *
1790 	 * This covers 4 registers which are next to one another:
1791 	 *   - PS_INVOCATION_COUNT
1792 	 *   - PS_INVOCATION_COUNT_UDW
1793 	 *   - PS_DEPTH_COUNT
1794 	 *   - PS_DEPTH_COUNT_UDW
1795 	 */
1796 	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1797 			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1798 			  RING_FORCE_TO_NONPRIV_RANGE_4);
1799 }
1800 
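/*
 * Grant userspace read access to the per-engine context timestamp
 * register on the non-render engines.
 */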
1801 static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1802 {
1803 	struct i915_wa_list *w = &engine->whitelist;
1804 
1805 	if (engine->class != RENDER_CLASS)
1806 		whitelist_reg_ext(w,
1807 				  RING_CTX_TIMESTAMP(engine->mmio_base),
1808 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1809 }
1810 
1811 static void cml_whitelist_build(struct intel_engine_cs *engine)
1812 {
1813 	allow_read_ctx_timestamp(engine);
1814 
1815 	cfl_whitelist_build(engine);
1816 }
1817 
1818 static void icl_whitelist_build(struct intel_engine_cs *engine)
1819 {
1820 	struct i915_wa_list *w = &engine->whitelist;
1821 
1822 	allow_read_ctx_timestamp(engine);
1823 
1824 	switch (engine->class) {
1825 	case RENDER_CLASS:
1826 		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1827 		whitelist_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1828 
1829 		/* WaAllowUMDToModifySamplerMode:icl */
1830 		whitelist_reg(w, GEN10_SAMPLER_MODE);
1831 
1832 		/* WaEnableStateCacheRedirectToCS:icl */
1833 		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1834 
1835 		/*
1836 		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1837 		 *
1838 		 * This covers 4 registers which are next to one another:
1839 		 *   - PS_INVOCATION_COUNT
1840 		 *   - PS_INVOCATION_COUNT_UDW
1841 		 *   - PS_DEPTH_COUNT
1842 		 *   - PS_DEPTH_COUNT_UDW
1843 		 */
1844 		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1845 				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1846 				  RING_FORCE_TO_NONPRIV_RANGE_4);
1847 		break;
1848 
1849 	case VIDEO_DECODE_CLASS:
1850 		/* hucStatusRegOffset */
1851 		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
1852 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1853 		/* hucUKernelHdrInfoRegOffset */
1854 		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
1855 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1856 		/* hucStatus2RegOffset */
1857 		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
1858 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1859 		break;
1860 
1861 	default:
1862 		break;
1863 	}
1864 }
1865 
1866 static void tgl_whitelist_build(struct intel_engine_cs *engine)
1867 {
1868 	struct i915_wa_list *w = &engine->whitelist;
1869 
1870 	allow_read_ctx_timestamp(engine);
1871 
1872 	switch (engine->class) {
1873 	case RENDER_CLASS:
1874 		/*
1875 		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
1876 		 * Wa_1408556865:tgl
1877 		 *
1878 		 * This covers 4 registers which are next to one another:
1879 		 *   - PS_INVOCATION_COUNT
1880 		 *   - PS_INVOCATION_COUNT_UDW
1881 		 *   - PS_DEPTH_COUNT
1882 		 *   - PS_DEPTH_COUNT_UDW
1883 		 */
1884 		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1885 				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1886 				  RING_FORCE_TO_NONPRIV_RANGE_4);
1887 
1888 		/*
1889 		 * Wa_1808121037:tgl
1890 		 * Wa_14012131227:dg1
1891 		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
1892 		 */
1893 		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
1894 
1895 		/* Wa_1806527549:tgl */
1896 		whitelist_reg(w, HIZ_CHICKEN);
1897 		break;
1898 	default:
1899 		break;
1900 	}
1901 }
1902 
1903 static void dg1_whitelist_build(struct intel_engine_cs *engine)
1904 {
1905 	struct i915_wa_list *w = &engine->whitelist;
1906 
1907 	tgl_whitelist_build(engine);
1908 
1909 	/* GEN:BUG:1409280441:dg1 */
1910 	if (IS_DG1_GRAPHICS_STEP(engine->i915, STEP_A0, STEP_B0) &&
1911 	    (engine->class == RENDER_CLASS ||
1912 	     engine->class == COPY_ENGINE_CLASS))
1913 		whitelist_reg_ext(w, RING_ID(engine->mmio_base),
1914 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1915 }
1916 
1917 static void xehpsdv_whitelist_build(struct intel_engine_cs *engine)
1918 {
1919 	allow_read_ctx_timestamp(engine);
1920 }
1921 
1922 static void dg2_whitelist_build(struct intel_engine_cs *engine)
1923 {
1924 	struct i915_wa_list *w = &engine->whitelist;
1925 
1926 	allow_read_ctx_timestamp(engine);
1927 
1928 	switch (engine->class) {
1929 	case RENDER_CLASS:
1930 		/*
1931 		 * Wa_1507100340:dg2_g10
1932 		 *
1933 		 * This covers 4 registers which are next to one another:
1934 		 *   - PS_INVOCATION_COUNT
1935 		 *   - PS_INVOCATION_COUNT_UDW
1936 		 *   - PS_DEPTH_COUNT
1937 		 *   - PS_DEPTH_COUNT_UDW
1938 		 */
1939 		if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0))
1940 			whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1941 					  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1942 					  RING_FORCE_TO_NONPRIV_RANGE_4);
1943 
1944 		break;
1945 	case COMPUTE_CLASS:
1946 		/* Wa_16011157294:dg2_g10 */
1947 		if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0))
1948 			whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1949 		break;
1950 	default:
1951 		break;
1952 	}
1953 }
1954 
1955 static void blacklist_trtt(struct intel_engine_cs *engine)
1956 {
1957 	struct i915_wa_list *w = &engine->whitelist;
1958 
1959 	/*
1960 	 * Prevent read/write access to [0x4400, 0x4600) which covers
1961 	 * the TRTT range across all engines. Note that normally userspace
1962 	 * cannot access the other engines' trtt control, but for simplicity
1963 	 * we cover the entire range on each engine.
1964 	 */
1965 	whitelist_reg_ext(w, _MMIO(0x4400),
1966 			  RING_FORCE_TO_NONPRIV_DENY |
1967 			  RING_FORCE_TO_NONPRIV_RANGE_64);
1968 	whitelist_reg_ext(w, _MMIO(0x4500),
1969 			  RING_FORCE_TO_NONPRIV_DENY |
1970 			  RING_FORCE_TO_NONPRIV_RANGE_64);
1971 }
1972 
1973 static void pvc_whitelist_build(struct intel_engine_cs *engine)
1974 {
1975 	allow_read_ctx_timestamp(engine);
1976 
1977 	/* Wa_16014440446:pvc */
1978 	blacklist_trtt(engine);
1979 }
1980 
1981 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
1982 {
1983 	struct drm_i915_private *i915 = engine->i915;
1984 	struct i915_wa_list *w = &engine->whitelist;
1985 
1986 	wa_init_start(w, "whitelist", engine->name);
1987 
1988 	if (IS_PONTEVECCHIO(i915))
1989 		pvc_whitelist_build(engine);
1990 	else if (IS_DG2(i915))
1991 		dg2_whitelist_build(engine);
1992 	else if (IS_XEHPSDV(i915))
1993 		xehpsdv_whitelist_build(engine);
1994 	else if (IS_DG1(i915))
1995 		dg1_whitelist_build(engine);
1996 	else if (GRAPHICS_VER(i915) == 12)
1997 		tgl_whitelist_build(engine);
1998 	else if (GRAPHICS_VER(i915) == 11)
1999 		icl_whitelist_build(engine);
2000 	else if (IS_COMETLAKE(i915))
2001 		cml_whitelist_build(engine);
2002 	else if (IS_COFFEELAKE(i915))
2003 		cfl_whitelist_build(engine);
2004 	else if (IS_GEMINILAKE(i915))
2005 		glk_whitelist_build(engine);
2006 	else if (IS_KABYLAKE(i915))
2007 		kbl_whitelist_build(engine);
2008 	else if (IS_BROXTON(i915))
2009 		bxt_whitelist_build(engine);
2010 	else if (IS_SKYLAKE(i915))
2011 		skl_whitelist_build(engine);
2012 	else if (GRAPHICS_VER(i915) <= 8)
2013 		;
2014 	else
2015 		MISSING_CASE(GRAPHICS_VER(i915));
2016 
2017 	wa_init_finish(w);
2018 }
2019 
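/*
 * Program the whitelisted register offsets into the engine's
 * RING_FORCE_TO_NONPRIV slots, then point any remaining slots at the
 * harmless RING_NOPID register.
 */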
2020 void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2021 {
2022 	const struct i915_wa_list *wal = &engine->whitelist;
2023 	struct intel_uncore *uncore = engine->uncore;
2024 	const u32 base = engine->mmio_base;
2025 	struct i915_wa *wa;
2026 	unsigned int i;
2027 
2028 	if (!wal->count)
2029 		return;
2030 
2031 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2032 		intel_uncore_write(uncore,
2033 				   RING_FORCE_TO_NONPRIV(base, i),
2034 				   i915_mmio_reg_offset(wa->reg));
2035 
2036 	/* And clear the rest just in case of garbage */
2037 	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2038 		intel_uncore_write(uncore,
2039 				   RING_FORCE_TO_NONPRIV(base, i),
2040 				   i915_mmio_reg_offset(RING_NOPID(base)));
2041 }
2042 
2043 /*
2044  * engine_fake_wa_init(), a placeholder to program registers that are
2045  * not part of an official workaround defined by the
2046  * hardware team.
2047  * Adding the programming of those registers here allows us to use the
2048  * workaround framework for proper application and verification.
2049  */
2050 static void
2051 engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2052 {
2053 	u8 mocs_w, mocs_r;
2054 
2055 	/*
2056 	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2057 	 * by the command streamer when executing commands that don't have
2058 	 * a way to explicitly specify a MOCS setting.  The default should
2059 	 * usually reference whichever MOCS entry corresponds to uncached
2060 	 * behavior, although use of a WB cached entry is recommended by the
2061 	 * spec in certain circumstances on specific platforms.
2062 	 */
2063 	if (GRAPHICS_VER(engine->i915) >= 12) {
2064 		mocs_r = engine->gt->mocs.uc_index;
2065 		mocs_w = engine->gt->mocs.uc_index;
2066 
2067 		if (HAS_L3_CCS_READ(engine->i915) &&
2068 		    engine->class == COMPUTE_CLASS) {
2069 			mocs_r = engine->gt->mocs.wb_index;
2070 
2071 			/*
2072 			 * Even on the few platforms where MOCS 0 is a
2073 			 * legitimate table entry, it's never the correct
2074 			 * setting to use here; we can assume the MOCS init
2075 			 * just forgot to initialize wb_index.
2076 			 */
2077 			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2078 		}
2079 
2080 		wa_masked_field_set(wal,
2081 				    RING_CMD_CCTL(engine->mmio_base),
2082 				    CMD_CCTL_MOCS_MASK,
2083 				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2084 	}
2085 }
2086 
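/*
 * Wa_1308578152 is only needed when the first gslice is fused off,
 * i.e. when the first enabled DSS lies beyond the first gslice.
 */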
2087 static bool needs_wa_1308578152(struct intel_engine_cs *engine)
2088 {
2089 	return intel_sseu_find_first_xehp_dss(&engine->gt->info.sseu, 0, 0) >=
2090 		GEN_DSS_PER_GSLICE;
2091 }
2092 
2093 static void
2094 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2095 {
2096 	struct drm_i915_private *i915 = engine->i915;
2097 
2098 	if (IS_DG2(i915)) {
2099 		/* Wa_1509235366:dg2 */
2100 		wa_write_or(wal, GEN12_GAMCNTRL_CTRL, INVALIDATION_BROADCAST_MODE_DIS |
2101 			    GLOBAL_INVALIDATION_MODE);
2102 
2103 		/*
2104 		 * The following are not actually "workarounds" but rather
2105 		 * recommended tuning settings documented in the bspec's
2106 		 * performance guide section.
2107 		 */
2108 		wa_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
2109 	}
2110 
2111 	if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) {
2112 		/* Wa_14013392000:dg2_g11 */
2113 		wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_ENABLE_LARGE_GRF_MODE);
2114 
2115 		/* Wa_16011620976:dg2_g11 */
2116 		wa_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2117 	}
2118 
2119 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0) ||
2120 	    IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) {
2121 		/* Wa_14012419201:dg2 */
2122 		wa_masked_en(wal, GEN9_ROW_CHICKEN4,
2123 			     GEN12_DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX);
2124 	}
2125 
2126 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) ||
2127 	    IS_DG2_G11(i915)) {
2128 		/*
2129 		 * Wa_22012826095:dg2
2130 		 * Wa_22013059131:dg2
2131 		 */
2132 		wa_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2133 				 MAXREQS_PER_BANK,
2134 				 REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2135 
2136 		/* Wa_22013059131:dg2 */
2137 		wa_write_or(wal, LSC_CHICKEN_BIT_0,
2138 			    FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2139 	}
2140 
2141 	/* Wa_1308578152:dg2_g10 when first gslice is fused off */
2142 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) &&
2143 	    needs_wa_1308578152(engine)) {
2144 		wa_masked_dis(wal, GEN12_CS_DEBUG_MODE1_CCCSUNIT_BE_COMMON,
2145 			      GEN12_REPLAY_MODE_GRANULARITY);
2146 	}
2147 
2148 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) ||
2149 	    IS_DG2_G11(i915) || IS_DG2_G12(i915)) {
2150 		/* Wa_22013037850:dg2 */
2151 		wa_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2152 			    DISABLE_128B_EVICTION_COMMAND_UDW);
2153 
2154 		/* Wa_22012856258:dg2 */
2155 		wa_masked_en(wal, GEN7_ROW_CHICKEN2,
2156 			     GEN12_DISABLE_READ_SUPPRESSION);
2157 
2158 		/*
2159 		 * Wa_22010960976:dg2
2160 		 * Wa_14013347512:dg2
2161 		 */
2162 		wa_masked_dis(wal, GEN12_HDC_CHICKEN0,
2163 			      LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2164 	}
2165 
2166 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) {
2167 		/*
2168 		 * Wa_1608949956:dg2_g10
2169 		 * Wa_14010198302:dg2_g10
2170 		 */
2171 		wa_masked_en(wal, GEN8_ROW_CHICKEN,
2172 			     MDQ_ARBITRATION_MODE | UGM_BACKUP_MODE);
2173 
2174 		/*
2175 		 * Wa_14010918519:dg2_g10
2176 		 *
2177 		 * LSC_CHICKEN_BIT_0 always reads back as 0 on this stepping,
2178 		 * so verification is skipped.
2179 		 */
2180 		wa_add(wal, LSC_CHICKEN_BIT_0_UDW, 0,
2181 		       FORCE_SLM_FENCE_SCOPE_TO_TILE | FORCE_UGM_FENCE_SCOPE_TO_TILE,
2182 		       0, false);
2183 	}
2184 
2185 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) {
2186 		/* Wa_22010430635:dg2 */
2187 		wa_masked_en(wal,
2188 			     GEN9_ROW_CHICKEN4,
2189 			     GEN12_DISABLE_GRF_CLEAR);
2190 
2191 		/* Wa_14010648519:dg2 */
2192 		wa_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
2193 	}
2194 
2195 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_C0) ||
2196 	    IS_DG2_G11(i915)) {
2197 		/* Wa_22012654132:dg2 */
2198 		wa_add(wal, GEN10_CACHE_MODE_SS, 0,
2199 		       _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2200 		       0 /* write-only, so skip validation */,
2201 		       true);
2202 	}
2203 
2204 	/* Wa_14013202645:dg2 */
2205 	if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) ||
2206 	    IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0))
2207 		wa_write_or(wal, RT_CTRL, DIS_NULL_QUERY);
2208 
2209 	/* Wa_22012532006:dg2 */
2210 	if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_C0) ||
2211 	    IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0))
2212 		wa_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
2213 			     DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA);
2214 
2215 	if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) {
2216 		/* Wa_14010680813:dg2_g10 */
2217 		wa_write_or(wal, GEN12_GAMSTLB_CTRL, CONTROL_BLOCK_CLKGATE_DIS |
2218 			    EGRESS_BLOCK_CLKGATE_DIS | TAG_BLOCK_CLKGATE_DIS);
2219 	}
2220 
2221 	if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0) ||
2222 	    IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) {
2223 		/* Wa_14012362059:dg2 */
2224 		wa_write_or(wal, GEN12_MERT_MOD_CTRL, FORCE_MISS_FTLB);
2225 	}
2226 
2227 	if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_B0, STEP_FOREVER) ||
2228 	    IS_DG2_G10(i915)) {
2229 		/* Wa_22014600077:dg2 */
2230 		wa_add(wal, GEN10_CACHE_MODE_SS, 0,
2231 		       _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH),
2232 		       0 /* Wa_14012342262: write-only reg, so skip
2233 			    verification */,
2234 		       true);
2235 	}
2236 
2237 	if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2238 	    IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)) {
2239 		/*
2240 		 * Wa_1607138336:tgl[a0],dg1[a0]
2241 		 * Wa_1607063988:tgl[a0],dg1[a0]
2242 		 */
2243 		wa_write_or(wal,
2244 			    GEN9_CTX_PREEMPT_REG,
2245 			    GEN12_DISABLE_POSH_BUSY_FF_DOP_CG);
2246 	}
2247 
2248 	if (IS_TGL_UY_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)) {
2249 		/*
2250 		 * Wa_1606679103:tgl
2251 		 * (see also Wa_1606682166:icl)
2252 		 */
2253 		wa_write_or(wal,
2254 			    GEN7_SARCHKMD,
2255 			    GEN7_DISABLE_SAMPLER_PREFETCH);
2256 	}
2257 
2258 	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2259 	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2260 		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2261 		wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2262 
2263 		/*
2264 		 * Wa_1407928979:tgl A*
2265 		 * Wa_18011464164:tgl[B0+],dg1[B0+]
2266 		 * Wa_22010931296:tgl[B0+],dg1[B0+]
2267 		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2268 		 */
2269 		wa_write_or(wal, GEN7_FF_THREAD_MODE,
2270 			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2271 	}
2272 
2273 	if (IS_ALDERLAKE_P(i915) || IS_DG2(i915) || IS_ALDERLAKE_S(i915) ||
2274 	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2275 		/*
2276 		 * Wa_1606700617:tgl,dg1,adl-p
2277 		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2278 		 * Wa_14010826681:tgl,dg1,rkl,adl-p
2279 		 * Wa_18019627453:dg2
2280 		 */
2281 		wa_masked_en(wal,
2282 			     GEN9_CS_DEBUG_MODE1,
2283 			     FF_DOP_CLOCK_GATE_DISABLE);
2284 	}
2285 
2286 	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2287 	    IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2288 	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2289 		/* Wa_1409804808:tgl,rkl,dg1[a0],adl-s,adl-p */
2290 		wa_masked_en(wal, GEN7_ROW_CHICKEN2,
2291 			     GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2292 
2293 		/*
2294 		 * Wa_1409085225:tgl
2295 		 * Wa_14010229206:tgl,rkl,dg1[a0],adl-s,adl-p
2296 		 */
2297 		wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2298 	}
2299 
2300 	if (IS_DG1_GRAPHICS_STEP(i915, STEP_A0, STEP_B0) ||
2301 	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2302 		/*
2303 		 * Wa_1607030317:tgl
2304 		 * Wa_1607186500:tgl
2305 		 * Wa_1607297627:tgl,rkl,dg1[a0]
2306 		 *
2307 		 * On TGL and RKL there are multiple entries for this WA in the
2308 		 * BSpec; some indicate this is an A0-only WA, others indicate
2309 		 * it applies to all steppings so we trust the "all steppings."
2310 		 * For DG1 this only applies to A0.
2311 		 */
2312 		wa_masked_en(wal,
2313 			     RING_PSMI_CTL(RENDER_RING_BASE),
2314 			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2315 			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2316 	}
2317 
2318 	if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) ||
2319 	    IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915)) {
2320 		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2321 		wa_masked_en(wal,
2322 			     GEN10_SAMPLER_MODE,
2323 			     ENABLE_SMALLPL);
2324 	}
2325 
2326 	if (GRAPHICS_VER(i915) == 11) {
2327 		/* This is not a Wa. Enable for better image quality */
2328 		wa_masked_en(wal,
2329 			     _3D_CHICKEN3,
2330 			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2331 
2332 		/*
2333 		 * Wa_1405543622:icl
2334 		 * Formerly known as WaGAPZPriorityScheme
2335 		 */
2336 		wa_write_or(wal,
2337 			    GEN8_GARBCNTL,
2338 			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
2339 
2340 		/*
2341 		 * Wa_1604223664:icl
2342 		 * Formerly known as WaL3BankAddressHashing
2343 		 */
2344 		wa_write_clr_set(wal,
2345 				 GEN8_GARBCNTL,
2346 				 GEN11_HASH_CTRL_EXCL_MASK,
2347 				 GEN11_HASH_CTRL_EXCL_BIT0);
2348 		wa_write_clr_set(wal,
2349 				 GEN11_GLBLINVL,
2350 				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2351 				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2352 
2353 		/*
2354 		 * Wa_1405733216:icl
2355 		 * Formerly known as WaDisableCleanEvicts
2356 		 */
2357 		wa_write_or(wal,
2358 			    GEN8_L3SQCREG4,
2359 			    GEN11_LQSC_CLEAN_EVICT_DISABLE);
2360 
2361 		/* Wa_1606682166:icl */
2362 		wa_write_or(wal,
2363 			    GEN7_SARCHKMD,
2364 			    GEN7_DISABLE_SAMPLER_PREFETCH);
2365 
2366 		/* Wa_1409178092:icl */
2367 		wa_write_clr_set(wal,
2368 				 GEN11_SCRATCH2,
2369 				 GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2370 				 0);
2371 
2372 		/* WaEnable32PlaneMode:icl */
2373 		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2374 			     GEN11_ENABLE_32_PLANE_MODE);
2375 
2376 		/*
2377 		 * Wa_1408615072:icl,ehl  (vsunit)
2378 		 * Wa_1407596294:icl,ehl  (hsunit)
2379 		 */
2380 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
2381 			    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
2382 
2383 		/*
2384 		 * Wa_1408767742:icl[a2..forever],ehl[all]
2385 		 * Wa_1605460711:icl[a0..c0]
2386 		 */
2387 		wa_write_or(wal,
2388 			    GEN7_FF_THREAD_MODE,
2389 			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2390 
2391 		/* Wa_22010271021 */
2392 		wa_masked_en(wal,
2393 			     GEN9_CS_DEBUG_MODE1,
2394 			     FF_DOP_CLOCK_GATE_DISABLE);
2395 	}
2396 
2397 	if (HAS_PERCTX_PREEMPT_CTRL(i915)) {
2398 		/* FtrPerCtxtPreemptionGranularityControl:skl,bxt,kbl,cfl,cnl,icl,tgl */
2399 		wa_masked_en(wal,
2400 			     GEN7_FF_SLICE_CS_CHICKEN1,
2401 			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
2402 	}
2403 
2404 	if (IS_SKYLAKE(i915) ||
2405 	    IS_KABYLAKE(i915) ||
2406 	    IS_COFFEELAKE(i915) ||
2407 	    IS_COMETLAKE(i915)) {
2408 		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2409 		wa_write_or(wal,
2410 			    GEN8_GARBCNTL,
2411 			    GEN9_GAPS_TSV_CREDIT_DISABLE);
2412 	}
2413 
2414 	if (IS_BROXTON(i915)) {
2415 		/* WaDisablePooledEuLoadBalancingFix:bxt */
2416 		wa_masked_en(wal,
2417 			     FF_SLICE_CS_CHICKEN2,
2418 			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2419 	}
2420 
2421 	if (GRAPHICS_VER(i915) == 9) {
2422 		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2423 		wa_masked_en(wal,
2424 			     GEN9_CSFE_CHICKEN1_RCS,
2425 			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2426 
2427 		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2428 		wa_write_or(wal,
2429 			    BDW_SCRATCH1,
2430 			    GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2431 
2432 		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2433 		if (IS_GEN9_LP(i915))
2434 			wa_write_clr_set(wal,
2435 					 GEN8_L3SQCREG1,
2436 					 L3_PRIO_CREDITS_MASK,
2437 					 L3_GENERAL_PRIO_CREDITS(62) |
2438 					 L3_HIGH_PRIO_CREDITS(2));
2439 
2440 		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2441 		wa_write_or(wal,
2442 			    GEN8_L3SQCREG4,
2443 			    GEN8_LQSC_FLUSH_COHERENT_LINES);
2444 
2445 		/* Disable atomics in L3 to prevent unrecoverable hangs */
2446 		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2447 				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2448 		wa_write_clr_set(wal, GEN8_L3SQCREG4,
2449 				 GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2450 		wa_write_clr_set(wal, GEN9_SCRATCH1,
2451 				 EVICTION_PERF_FIX_ENABLE, 0);
2452 	}
2453 
2454 	if (IS_HASWELL(i915)) {
2455 		/* WaSampleCChickenBitEnable:hsw */
2456 		wa_masked_en(wal,
2457 			     HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2458 
2459 		wa_masked_dis(wal,
2460 			      CACHE_MODE_0_GEN7,
2461 			      /* enable HiZ Raw Stall Optimization */
2462 			      HIZ_RAW_STALL_OPT_DISABLE);
2463 	}
2464 
2465 	if (IS_VALLEYVIEW(i915)) {
2466 		/* WaDisableEarlyCull:vlv */
2467 		wa_masked_en(wal,
2468 			     _3D_CHICKEN3,
2469 			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2470 
2471 		/*
2472 		 * WaVSThreadDispatchOverride:ivb,vlv
2473 		 *
2474 		 * This actually overrides the dispatch
2475 		 * mode for all thread types.
2476 		 */
2477 		wa_write_clr_set(wal,
2478 				 GEN7_FF_THREAD_MODE,
2479 				 GEN7_FF_SCHED_MASK,
2480 				 GEN7_FF_TS_SCHED_HW |
2481 				 GEN7_FF_VS_SCHED_HW |
2482 				 GEN7_FF_DS_SCHED_HW);
2483 
2484 		/* WaPsdDispatchEnable:vlv */
2485 		/* WaDisablePSDDualDispatchEnable:vlv */
2486 		wa_masked_en(wal,
2487 			     GEN7_HALF_SLICE_CHICKEN1,
2488 			     GEN7_MAX_PS_THREAD_DEP |
2489 			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2490 	}
2491 
2492 	if (IS_IVYBRIDGE(i915)) {
2493 		/* WaDisableEarlyCull:ivb */
2494 		wa_masked_en(wal,
2495 			     _3D_CHICKEN3,
2496 			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2497 
2498 		if (0) { /* causes HiZ corruption on ivb:gt1 */
2499 			/* enable HiZ Raw Stall Optimization */
2500 			wa_masked_dis(wal,
2501 				      CACHE_MODE_0_GEN7,
2502 				      HIZ_RAW_STALL_OPT_DISABLE);
2503 		}
2504 
2505 		/*
2506 		 * WaVSThreadDispatchOverride:ivb,vlv
2507 		 *
2508 		 * This actually overrides the dispatch
2509 		 * mode for all thread types.
2510 		 */
2511 		wa_write_clr_set(wal,
2512 				 GEN7_FF_THREAD_MODE,
2513 				 GEN7_FF_SCHED_MASK,
2514 				 GEN7_FF_TS_SCHED_HW |
2515 				 GEN7_FF_VS_SCHED_HW |
2516 				 GEN7_FF_DS_SCHED_HW);
2517 
2518 		/* WaDisablePSDDualDispatchEnable:ivb */
2519 		if (IS_IVB_GT1(i915))
2520 			wa_masked_en(wal,
2521 				     GEN7_HALF_SLICE_CHICKEN1,
2522 				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2523 	}
2524 
2525 	if (GRAPHICS_VER(i915) == 7) {
2526 		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2527 		wa_masked_en(wal,
2528 			     RING_MODE_GEN7(RENDER_RING_BASE),
2529 			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2530 
2531 		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2532 		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2533 
2534 		/*
2535 		 * BSpec says this must be set, even though
2536 		 * WaDisable4x2SubspanOptimization:ivb,hsw
2537 		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2538 		 */
2539 		wa_masked_en(wal,
2540 			     CACHE_MODE_1,
2541 			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2542 
2543 		/*
2544 		 * BSpec recommends 8x4 when MSAA is used,
2545 		 * however in practice 16x4 seems fastest.
2546 		 *
2547 		 * Note that PS/WM thread counts depend on the WIZ hashing
2548 		 * disable bit, which we don't touch here, but it's good
2549 		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2550 		 */
2551 		wa_masked_field_set(wal,
2552 				    GEN7_GT_MODE,
2553 				    GEN6_WIZ_HASHING_MASK,
2554 				    GEN6_WIZ_HASHING_16x4);
2555 	}
2556 
2557 	if (IS_GRAPHICS_VER(i915, 6, 7))
2558 		/*
2559 		 * We need to disable the AsyncFlip performance optimisations in
2560 		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2561 		 * already be programmed to '1' on all products.
2562 		 *
2563 		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2564 		 */
2565 		wa_masked_en(wal,
2566 			     RING_MI_MODE(RENDER_RING_BASE),
2567 			     ASYNC_FLIP_PERF_DISABLE);
2568 
2569 	if (GRAPHICS_VER(i915) == 6) {
2570 		/*
2571 		 * Required for the hardware to program scanline values for
2572 		 * waiting
2573 		 * WaEnableFlushTlbInvalidationMode:snb
2574 		 */
2575 		wa_masked_en(wal,
2576 			     GFX_MODE,
2577 			     GFX_TLB_INVALIDATE_EXPLICIT);
2578 
2579 		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2580 		wa_masked_en(wal,
2581 			     _3D_CHICKEN,
2582 			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2583 
2584 		wa_masked_en(wal,
2585 			     _3D_CHICKEN3,
2586 			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
2587 			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2588 			     /*
2589 			      * Bspec says:
2590 			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
2591 			      * to normal and 3DSTATE_SF number of SF output attributes
2592 			      * is more than 16."
2593 			      */
2594 			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2595 
2596 		/*
2597 		 * BSpec recommends 8x4 when MSAA is used,
2598 		 * however in practice 16x4 seems fastest.
2599 		 *
2600 		 * Note that PS/WM thread counts depend on the WIZ hashing
2601 		 * disable bit, which we don't touch here, but it's good
2602 		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2603 		 */
2604 		wa_masked_field_set(wal,
2605 				    GEN6_GT_MODE,
2606 				    GEN6_WIZ_HASHING_MASK,
2607 				    GEN6_WIZ_HASHING_16x4);
2608 
2609 		/* WaDisable_RenderCache_OperationalFlush:snb */
2610 		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2611 
2612 		/*
2613 		 * From the Sandybridge PRM, volume 1 part 3, page 24:
2614 		 * "If this bit is set, STCunit will have LRA as replacement
2615 		 *  policy. [...] This bit must be reset. LRA replacement
2616 		 *  policy is not supported."
2617 		 */
2618 		wa_masked_dis(wal,
2619 			      CACHE_MODE_0,
2620 			      CM0_STC_EVICT_DISABLE_LRA_SNB);
2621 	}
2622 
2623 	if (IS_GRAPHICS_VER(i915, 4, 6))
2624 		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2625 		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2626 		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2627 		       /* XXX bit doesn't stick on Broadwater */
2628 		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2629 
2630 	if (GRAPHICS_VER(i915) == 4)
2631 		/*
2632 		 * Disable CONSTANT_BUFFER before it is loaded from the context
2633 		 * image. As soon as it is loaded, it is executed and the stored
2634 		 * address may no longer be valid, leading to a GPU hang.
2635 		 *
2636 		 * This imposes the requirement that userspace reload their
2637 		 * CONSTANT_BUFFER on every batch, fortunately a requirement
2638 		 * they are already accustomed to from before contexts were
2639 		 * enabled.
2640 		 */
2641 		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2642 		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2643 		       0 /* XXX bit doesn't stick on Broadwater */,
2644 		       true);
2645 }
2646 
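/* Workarounds for engines other than the render and compute engines. */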
2647 static void
2648 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2649 {
2650 	struct drm_i915_private *i915 = engine->i915;
2651 
2652 	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2653 	if (IS_KBL_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2654 		wa_write(wal,
2655 			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2656 			 1);
2657 	}
2658 }
2659 
2660 static void
2661 ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2662 {
2663 	if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2664 		/* Wa_14014999345:pvc */
2665 		wa_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2666 	}
2667 }
2668 
2669 /*
2670  * The workarounds in this function apply to shared registers in
2671  * the general render reset domain that aren't tied to a
2672  * specific engine.  Since all render+compute engines get reset
2673  * together, and the contents of these registers are lost during
2674  * the shared render domain reset, we'll define such workarounds
2675  * here and then add them to just a single RCS or CCS engine's
2676  * workaround list (whichever engine has the I915_ENGINE_FIRST_RENDER_COMPUTE flag).
2677  */
2678 static void
2679 general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2680 {
2681 	struct drm_i915_private *i915 = engine->i915;
2682 
2683 	if (IS_PONTEVECCHIO(i915)) {
2684 		/*
2685 		 * The following is not actually a "workaround" but rather
2686 		 * a recommended tuning setting documented in the bspec's
2687 		 * performance guide section.
2688 		 */
2689 		wa_write(wal, XEHPC_L3SCRUB, SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
2690 	}
2691 
2692 	if (IS_XEHPSDV(i915)) {
2693 		/* Wa_1409954639 */
2694 		wa_masked_en(wal,
2695 			     GEN8_ROW_CHICKEN,
2696 			     SYSTOLIC_DOP_CLOCK_GATING_DIS);
2697 
2698 		/* Wa_1607196519 */
2699 		wa_masked_en(wal,
2700 			     GEN9_ROW_CHICKEN4,
2701 			     GEN12_DISABLE_GRF_CLEAR);
2702 
2703 		/* Wa_14010670810:xehpsdv */
2704 		wa_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
2705 
2706 		/* Wa_14010449647:xehpsdv */
2707 		wa_masked_en(wal, GEN7_HALF_SLICE_CHICKEN1,
2708 			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2709 
2710 		/* Wa_18011725039:xehpsdv */
2711 		if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
2712 			wa_masked_dis(wal, MLTICTXCTL, TDONRENDER);
2713 			wa_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
2714 		}
2715 
2716 		/* Wa_14012362059:xehpsdv */
2717 		wa_write_or(wal, GEN12_MERT_MOD_CTRL, FORCE_MISS_FTLB);
2718 
2719 		/* Wa_14014368820:xehpsdv */
2720 		wa_write_or(wal, GEN12_GAMCNTRL_CTRL, INVALIDATION_BROADCAST_MODE_DIS |
2721 				GLOBAL_INVALIDATION_MODE);
2722 	}
2723 
2724 	if (IS_DG2(i915) || IS_PONTEVECCHIO(i915)) {
2725 		/* Wa_14015227452:dg2,pvc */
2726 		wa_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2727 
2728 		/* Wa_22014226127:dg2,pvc */
2729 		wa_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2730 
2731 		/* Wa_16015675438:dg2,pvc */
2732 		wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2733 
2734 		/* Wa_18018781329:dg2,pvc */
2735 		wa_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
2736 		wa_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
2737 		wa_write_or(wal, VDBX_MOD_CTRL, FORCE_MISS_FTLB);
2738 		wa_write_or(wal, VEBX_MOD_CTRL, FORCE_MISS_FTLB);
2739 	}
2740 }
2741 
2742 static void
2743 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2744 {
2745 	if (I915_SELFTEST_ONLY(GRAPHICS_VER(engine->i915) < 4))
2746 		return;
2747 
2748 	engine_fake_wa_init(engine, wal);
2749 
2750 	/*
2751 	 * These are common workarounds that just need to be applied
2752 	 * to a single RCS/CCS engine's workaround list since
2753 	 * they're reset as part of the general render domain reset.
2754 	 */
2755 	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
2756 		general_render_compute_wa_init(engine, wal);
2757 
2758 	if (engine->class == COMPUTE_CLASS)
2759 		ccs_engine_wa_init(engine, wal);
2760 	else if (engine->class == RENDER_CLASS)
2761 		rcs_engine_wa_init(engine, wal);
2762 	else
2763 		xcs_engine_wa_init(engine, wal);
2764 }
2765 
2766 void intel_engine_init_workarounds(struct intel_engine_cs *engine)
2767 {
2768 	struct i915_wa_list *wal = &engine->wa_list;
2769 
2770 	if (GRAPHICS_VER(engine->i915) < 4)
2771 		return;
2772 
2773 	wa_init_start(wal, "engine", engine->name);
2774 	engine_init_workarounds(engine, wal);
2775 	wa_init_finish(wal);
2776 }
2777 
2778 void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
2779 {
2780 	wa_list_apply(engine->gt, &engine->wa_list);
2781 }
2782 
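/*
 * Per-platform tables of multicast/replicated (MCR) register ranges,
 * used by mcr_range() below to decide which workaround registers cannot
 * be verified via command-streamer reads.
 */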
2783 static const struct i915_range mcr_ranges_gen8[] = {
2784 	{ .start = 0x5500, .end = 0x55ff },
2785 	{ .start = 0x7000, .end = 0x7fff },
2786 	{ .start = 0x9400, .end = 0x97ff },
2787 	{ .start = 0xb000, .end = 0xb3ff },
2788 	{ .start = 0xe000, .end = 0xe7ff },
2789 	{},
2790 };
2791 
2792 static const struct i915_range mcr_ranges_gen12[] = {
2793 	{ .start =  0x8150, .end =  0x815f },
2794 	{ .start =  0x9520, .end =  0x955f },
2795 	{ .start =  0xb100, .end =  0xb3ff },
2796 	{ .start =  0xde80, .end =  0xe8ff },
2797 	{ .start = 0x24a00, .end = 0x24a7f },
2798 	{},
2799 };
2800 
2801 static const struct i915_range mcr_ranges_xehp[] = {
2802 	{ .start =  0x4000, .end =  0x4aff },
2803 	{ .start =  0x5200, .end =  0x52ff },
2804 	{ .start =  0x5400, .end =  0x7fff },
2805 	{ .start =  0x8140, .end =  0x815f },
2806 	{ .start =  0x8c80, .end =  0x8dff },
2807 	{ .start =  0x94d0, .end =  0x955f },
2808 	{ .start =  0x9680, .end =  0x96ff },
2809 	{ .start =  0xb000, .end =  0xb3ff },
2810 	{ .start =  0xc800, .end =  0xcfff },
2811 	{ .start =  0xd800, .end =  0xd8ff },
2812 	{ .start =  0xdc00, .end =  0xffff },
2813 	{ .start = 0x17000, .end = 0x17fff },
2814 	{ .start = 0x24a00, .end = 0x24a7f },
2815 	{},
2816 };
2817 
2818 static bool mcr_range(struct drm_i915_private *i915, u32 offset)
2819 {
2820 	const struct i915_range *mcr_ranges;
2821 	int i;
2822 
2823 	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
2824 		mcr_ranges = mcr_ranges_xehp;
2825 	else if (GRAPHICS_VER(i915) >= 12)
2826 		mcr_ranges = mcr_ranges_gen12;
2827 	else if (GRAPHICS_VER(i915) >= 8)
2828 		mcr_ranges = mcr_ranges_gen8;
2829 	else
2830 		return false;
2831 
2832 	/*
2833 	 * Registers in these ranges are affected by the MCR selector
2834 	 * which only controls CPU initiated MMIO. Routing does not
2835 	 * work for CS access so we cannot verify them on this path.
2836 	 */
2837 	for (i = 0; mcr_ranges[i].start; i++)
2838 		if (offset >= mcr_ranges[i].start &&
2839 		    offset <= mcr_ranges[i].end)
2840 			return true;
2841 
2842 	return false;
2843 }
2844 
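/*
 * Emit MI_STORE_REGISTER_MEM commands so the command streamer dumps
 * every non-MCR register in the list into the scratch buffer, one dword
 * slot per workaround entry. On gen8+ the SRM command carries a 64-bit
 * address, hence the extra length increment.
 */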
2845 static int
2846 wa_list_srm(struct i915_request *rq,
2847 	    const struct i915_wa_list *wal,
2848 	    struct i915_vma *vma)
2849 {
2850 	struct drm_i915_private *i915 = rq->engine->i915;
2851 	unsigned int i, count = 0;
2852 	const struct i915_wa *wa;
2853 	u32 srm, *cs;
2854 
2855 	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2856 	if (GRAPHICS_VER(i915) >= 8)
2857 		srm++;
2858 
2859 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2860 		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
2861 			count++;
2862 	}
2863 
2864 	cs = intel_ring_begin(rq, 4 * count);
2865 	if (IS_ERR(cs))
2866 		return PTR_ERR(cs);
2867 
2868 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2869 		u32 offset = i915_mmio_reg_offset(wa->reg);
2870 
2871 		if (mcr_range(i915, offset))
2872 			continue;
2873 
2874 		*cs++ = srm;
2875 		*cs++ = offset;
2876 		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
2877 		*cs++ = 0;
2878 	}
2879 	intel_ring_advance(rq, cs);
2880 
2881 	return 0;
2882 }
2883 
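/*
 * Verify an engine's workaround list from the GPU's point of view:
 * allocate a scratch buffer, submit a request that stores each register
 * via the command streamer, then compare the results against the
 * expected values. MCR registers are skipped since they cannot be read
 * reliably from the CS; returns -ENXIO if any workaround has been lost.
 */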
2884 static int engine_wa_list_verify(struct intel_context *ce,
2885 				 const struct i915_wa_list * const wal,
2886 				 const char *from)
2887 {
2888 	const struct i915_wa *wa;
2889 	struct i915_request *rq;
2890 	struct i915_vma *vma;
2891 	struct i915_gem_ww_ctx ww;
2892 	unsigned int i;
2893 	u32 *results;
2894 	int err;
2895 
2896 	if (!wal->count)
2897 		return 0;
2898 
2899 	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
2900 					   wal->count * sizeof(u32));
2901 	if (IS_ERR(vma))
2902 		return PTR_ERR(vma);
2903 
2904 	intel_engine_pm_get(ce->engine);
2905 	i915_gem_ww_ctx_init(&ww, false);
2906 retry:
2907 	err = i915_gem_object_lock(vma->obj, &ww);
2908 	if (err == 0)
2909 		err = intel_context_pin_ww(ce, &ww);
2910 	if (err)
2911 		goto err_pm;
2912 
2913 	err = i915_vma_pin_ww(vma, &ww, 0, 0,
2914 			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
2915 	if (err)
2916 		goto err_unpin;
2917 
2918 	rq = i915_request_create(ce);
2919 	if (IS_ERR(rq)) {
2920 		err = PTR_ERR(rq);
2921 		goto err_vma;
2922 	}
2923 
2924 	err = i915_request_await_object(rq, vma->obj, true);
2925 	if (err == 0)
2926 		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
2927 	if (err == 0)
2928 		err = wa_list_srm(rq, wal, vma);
2929 
2930 	i915_request_get(rq);
2931 	if (err)
2932 		i915_request_set_error_once(rq, err);
2933 	i915_request_add(rq);
2934 
2935 	if (err)
2936 		goto err_rq;
2937 
2938 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
2939 		err = -ETIME;
2940 		goto err_rq;
2941 	}
2942 
2943 	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
2944 	if (IS_ERR(results)) {
2945 		err = PTR_ERR(results);
2946 		goto err_rq;
2947 	}
2948 
2949 	err = 0;
2950 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2951 		if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg)))
2952 			continue;
2953 
2954 		if (!wa_verify(wa, results[i], wal->name, from))
2955 			err = -ENXIO;
2956 	}
2957 
2958 	i915_gem_object_unpin_map(vma->obj);
2959 
2960 err_rq:
2961 	i915_request_put(rq);
2962 err_vma:
2963 	i915_vma_unpin(vma);
2964 err_unpin:
2965 	intel_context_unpin(ce);
2966 err_pm:
2967 	if (err == -EDEADLK) {
2968 		err = i915_gem_ww_ctx_backoff(&ww);
2969 		if (!err)
2970 			goto retry;
2971 	}
2972 	i915_gem_ww_ctx_fini(&ww);
2973 	intel_engine_pm_put(ce->engine);
2974 	i915_vma_put(vma);
2975 	return err;
2976 }
2977 
2978 int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
2979 				    const char *from)
2980 {
2981 	return engine_wa_list_verify(engine->kernel_context,
2982 				     &engine->wa_list,
2983 				     from);
2984 }
2985 
2986 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2987 #include "selftest_workarounds.c"
2988 #endif
2989