1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2014-2018 Intel Corporation
4 */
5
6 #include "i915_drv.h"
7 #include "i915_reg.h"
8 #include "intel_context.h"
9 #include "intel_engine_pm.h"
10 #include "intel_engine_regs.h"
11 #include "intel_gpu_commands.h"
12 #include "intel_gt.h"
13 #include "intel_gt_ccs_mode.h"
14 #include "intel_gt_mcr.h"
15 #include "intel_gt_regs.h"
16 #include "intel_ring.h"
17 #include "intel_workarounds.h"
18
19 /**
20 * DOC: Hardware workarounds
21 *
22 * Hardware workarounds are register programming documented to be executed in
23 * the driver that fall outside of the normal programming sequences for a
24 * platform. There are some basic categories of workarounds, depending on
25 * how/when they are applied:
26 *
27 * - Context workarounds: workarounds that touch registers that are
28 * saved/restored to/from the HW context image. The list is emitted (via Load
29 * Register Immediate commands) once when initializing the device and saved in
30 * the default context. That default context is then used on every context
31 * creation to have a "primed golden context", i.e. a context image that
32 * already contains the changes needed to all the registers.
33 *
34 * Context workarounds should be implemented in the \*_ctx_workarounds_init()
35 * variants respective to the targeted platforms.
36 *
37 * - Engine workarounds: the list of these WAs is applied whenever the specific
38 * engine is reset. It's also possible that a set of engine classes share a
39 * common power domain and they are reset together. This happens on some
40 * platforms with render and compute engines. In this case (at least) one of
 *   them needs to keep the workaround programming: the approach taken in the
42 * driver is to tie those workarounds to the first compute/render engine that
43 * is registered. When executing with GuC submission, engine resets are
 *   outside of kernel driver control, hence the list of registers involved is
45 * written once, on engine initialization, and then passed to GuC, that
46 * saves/restores their values before/after the reset takes place. See
47 * ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
48 *
49 * Workarounds for registers specific to RCS and CCS should be implemented in
50 * rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
51 * registers belonging to BCS, VCS or VECS should be implemented in
52 * xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
 *   engine's MMIO range but that are part of the common RCS/CCS reset domain
54 * should be implemented in general_render_compute_wa_init(). The settings
55 * about the CCS load balancing should be added in ccs_engine_wa_mode().
56 *
57 * - GT workarounds: the list of these WAs is applied whenever these registers
58 * revert to their default values: on GPU reset, suspend/resume [1]_, etc.
59 *
60 * GT workarounds should be implemented in the \*_gt_workarounds_init()
61 * variants respective to the targeted platforms.
62 *
63 * - Register whitelist: some workarounds need to be implemented in userspace,
64 * but need to touch privileged registers. The whitelist in the kernel
65 * instructs the hardware to allow the access to happen. From the kernel side,
66 * this is just a special case of a MMIO workaround (as we write the list of
 *   these to-be-whitelisted registers to some special HW registers).
68 *
69 * Register whitelisting should be done in the \*_whitelist_build() variants
70 * respective to the targeted platforms.
71 *
72 * - Workaround batchbuffers: buffers that get executed automatically by the
73 * hardware on every HW context restore. These buffers are created and
 *   programmed in the default context so the hardware always goes through those
 *   programming sequences when switching contexts. The support for workaround
 *   batchbuffers is enabled by these hardware mechanisms:
77 *
78 * #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
79 * context, pointing the hardware to jump to that location when that offset
80 * is reached in the context restore. Workaround batchbuffer in the driver
81 * currently uses this mechanism for all platforms.
82 *
83 * #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
84 * pointing the hardware to a buffer to continue executing after the
85 * engine registers are restored in a context restore sequence. This is
86 * currently not used in the driver.
87 *
88 * - Other: There are WAs that, due to their nature, cannot be applied from a
89 * central place. Those are peppered around the rest of the code, as needed.
90 * Workarounds related to the display IP are the main example.
91 *
92 * .. [1] Technically, some registers are powercontext saved & restored, so they
93 * survive a suspend/resume. In practice, writing them again is not too
94 * costly and simplifies things, so it's the approach taken in the driver.
95 */
96
wa_init_start(struct i915_wa_list * wal,struct intel_gt * gt,const char * name,const char * engine_name)97 static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
98 const char *name, const char *engine_name)
99 {
100 wal->gt = gt;
101 wal->name = name;
102 wal->engine_name = engine_name;
103 }
104
/*
 * Growth step, in entries, of a workaround list's backing array. Must be a
 * power of two (checked in _wa_add()).
 */
#define WA_LIST_CHUNK (1 << 4)
106
wa_init_finish(struct i915_wa_list * wal)107 static void wa_init_finish(struct i915_wa_list *wal)
108 {
109 /* Trim unused entries. */
110 if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
111 struct i915_wa *list = kmemdup(wal->list,
112 wal->count * sizeof(*list),
113 GFP_KERNEL);
114
115 if (list) {
116 kfree(wal->list);
117 wal->list = list;
118 }
119 }
120
121 if (!wal->count)
122 return;
123
124 drm_dbg(&wal->gt->i915->drm, "Initialized %u %s workarounds on %s\n",
125 wal->wa_count, wal->name, wal->engine_name);
126 }
127
128 static enum forcewake_domains
wal_get_fw_for_rmw(struct intel_uncore * uncore,const struct i915_wa_list * wal)129 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
130 {
131 enum forcewake_domains fw = 0;
132 struct i915_wa *wa;
133 unsigned int i;
134
135 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
136 fw |= intel_uncore_forcewake_for_reg(uncore,
137 wa->reg,
138 FW_REG_READ |
139 FW_REG_WRITE);
140
141 return fw;
142 }
143
_wa_add(struct i915_wa_list * wal,const struct i915_wa * wa)144 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
145 {
146 unsigned int addr = i915_mmio_reg_offset(wa->reg);
147 struct drm_i915_private *i915 = wal->gt->i915;
148 unsigned int start = 0, end = wal->count;
149 const unsigned int grow = WA_LIST_CHUNK;
150 struct i915_wa *wa_;
151
152 GEM_BUG_ON(!is_power_of_2(grow));
153
154 if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
155 struct i915_wa *list;
156
157 list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
158 GFP_KERNEL);
159 if (!list) {
160 drm_err(&i915->drm, "No space for workaround init!\n");
161 return;
162 }
163
164 if (wal->list) {
165 memcpy(list, wal->list, sizeof(*wa) * wal->count);
166 kfree(wal->list);
167 }
168
169 wal->list = list;
170 }
171
172 while (start < end) {
173 unsigned int mid = start + (end - start) / 2;
174
175 if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
176 start = mid + 1;
177 } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
178 end = mid;
179 } else {
180 wa_ = &wal->list[mid];
181
182 if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
183 drm_err(&i915->drm,
184 "Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
185 i915_mmio_reg_offset(wa_->reg),
186 wa_->clr, wa_->set);
187
188 wa_->set &= ~wa->clr;
189 }
190
191 wal->wa_count++;
192 wa_->set |= wa->set;
193 wa_->clr |= wa->clr;
194 wa_->read |= wa->read;
195 return;
196 }
197 }
198
199 wal->wa_count++;
200 wa_ = &wal->list[wal->count++];
201 *wa_ = *wa;
202
203 while (wa_-- > wal->list) {
204 GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
205 i915_mmio_reg_offset(wa_[1].reg));
206 if (i915_mmio_reg_offset(wa_[1].reg) >
207 i915_mmio_reg_offset(wa_[0].reg))
208 break;
209
210 swap(wa_[1], wa_[0]);
211 }
212 }
213
/*
 * Build a workaround entry for a regular (non-MCR) register from the given
 * clear, set and read masks and add it to @wal.
 */
static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
{
	const struct i915_wa entry = {
		.reg = reg,
		.clr = clear,
		.set = set,
		.read = read_mask,
		.masked_reg = masked_reg,
	};

	_wa_add(wal, &entry);
}
227
/*
 * Build a workaround entry for an MCR register (flagged via is_mcr) from the
 * given clear, set and read masks and add it to @wal.
 */
static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
{
	const struct i915_wa entry = {
		.mcr_reg = reg,
		.clr = clear,
		.set = set,
		.read = read_mask,
		.masked_reg = masked_reg,
		.is_mcr = 1,
	};

	_wa_add(wal, &entry);
}
242
243 static void
wa_write_clr_set(struct i915_wa_list * wal,i915_reg_t reg,u32 clear,u32 set)244 wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
245 {
246 wa_add(wal, reg, clear, set, clear | set, false);
247 }
248
249 static void
wa_mcr_write_clr_set(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 clear,u32 set)250 wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
251 {
252 wa_mcr_add(wal, reg, clear, set, clear | set, false);
253 }
254
255 static void
wa_write(struct i915_wa_list * wal,i915_reg_t reg,u32 set)256 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
257 {
258 wa_write_clr_set(wal, reg, ~0, set);
259 }
260
261 static void
wa_mcr_write(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 set)262 wa_mcr_write(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
263 {
264 wa_mcr_write_clr_set(wal, reg, ~0, set);
265 }
266
267 static void
wa_write_or(struct i915_wa_list * wal,i915_reg_t reg,u32 set)268 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
269 {
270 wa_write_clr_set(wal, reg, set, set);
271 }
272
273 static void
wa_mcr_write_or(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 set)274 wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
275 {
276 wa_mcr_write_clr_set(wal, reg, set, set);
277 }
278
279 static void
wa_write_clr(struct i915_wa_list * wal,i915_reg_t reg,u32 clr)280 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
281 {
282 wa_write_clr_set(wal, reg, clr, 0);
283 }
284
285 static void
wa_mcr_write_clr(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 clr)286 wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
287 {
288 wa_mcr_write_clr_set(wal, reg, clr, 0);
289 }
290
291 /*
292 * WA operations on "masked register". A masked register has the upper 16 bits
293 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
294 * portion of the register without a rmw: you simply write in the upper 16 bits
295 * the mask of bits you are going to modify.
296 *
297 * The wa_masked_* family of functions already does the necessary operations to
298 * calculate the mask based on the parameters passed, so user only has to
299 * provide the lower 16 bits of that register.
300 */
301
302 static void
wa_masked_en(struct i915_wa_list * wal,i915_reg_t reg,u32 val)303 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
304 {
305 wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
306 }
307
308 static void
wa_mcr_masked_en(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 val)309 wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
310 {
311 wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
312 }
313
314 static void
wa_masked_dis(struct i915_wa_list * wal,i915_reg_t reg,u32 val)315 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
316 {
317 wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
318 }
319
320 static void
wa_mcr_masked_dis(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 val)321 wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
322 {
323 wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
324 }
325
326 static void
wa_masked_field_set(struct i915_wa_list * wal,i915_reg_t reg,u32 mask,u32 val)327 wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
328 u32 mask, u32 val)
329 {
330 wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
331 }
332
333 static void
wa_mcr_masked_field_set(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 mask,u32 val)334 wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
335 u32 mask, u32 val)
336 {
337 wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
338 }
339
gen6_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)340 static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
341 struct i915_wa_list *wal)
342 {
343 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
344 }
345
gen7_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)346 static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
347 struct i915_wa_list *wal)
348 {
349 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
350 }
351
gen8_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)352 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
353 struct i915_wa_list *wal)
354 {
355 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
356
357 /* WaDisableAsyncFlipPerfMode:bdw,chv */
358 wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
359
360 /* WaDisablePartialInstShootdown:bdw,chv */
361 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
362 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
363
364 /* Use Force Non-Coherent whenever executing a 3D context. This is a
365 * workaround for a possible hang in the unlikely event a TLB
366 * invalidation occurs during a PSD flush.
367 */
368 /* WaForceEnableNonCoherent:bdw,chv */
369 /* WaHdcDisableFetchWhenMasked:bdw,chv */
370 wa_masked_en(wal, HDC_CHICKEN0,
371 HDC_DONOT_FETCH_MEM_WHEN_MASKED |
372 HDC_FORCE_NON_COHERENT);
373
374 /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
375 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
376 * polygons in the same 8x4 pixel/sample area to be processed without
377 * stalling waiting for the earlier ones to write to Hierarchical Z
378 * buffer."
379 *
380 * This optimization is off by default for BDW and CHV; turn it on.
381 */
382 wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
383
384 /* Wa4x4STCOptimizationDisable:bdw,chv */
385 wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
386
387 /*
388 * BSpec recommends 8x4 when MSAA is used,
389 * however in practice 16x4 seems fastest.
390 *
391 * Note that PS/WM thread counts depend on the WIZ hashing
392 * disable bit, which we don't touch here, but it's good
393 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
394 */
395 wa_masked_field_set(wal, GEN7_GT_MODE,
396 GEN6_WIZ_HASHING_MASK,
397 GEN6_WIZ_HASHING_16x4);
398 }
399
bdw_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)400 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
401 struct i915_wa_list *wal)
402 {
403 struct drm_i915_private *i915 = engine->i915;
404
405 gen8_ctx_workarounds_init(engine, wal);
406
407 /* WaDisableThreadStallDopClockGating:bdw (pre-production) */
408 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
409
410 /* WaDisableDopClockGating:bdw
411 *
412 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
413 * to disable EUTC clock gating.
414 */
415 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
416 DOP_CLOCK_GATING_DISABLE);
417
418 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
419 GEN8_SAMPLER_POWER_BYPASS_DIS);
420
421 wa_masked_en(wal, HDC_CHICKEN0,
422 /* WaForceContextSaveRestoreNonCoherent:bdw */
423 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
424 /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
425 (IS_BROADWELL_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
426 }
427
chv_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)428 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
429 struct i915_wa_list *wal)
430 {
431 gen8_ctx_workarounds_init(engine, wal);
432
433 /* WaDisableThreadStallDopClockGating:chv */
434 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
435
436 /* Improve HiZ throughput on CHV. */
437 wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
438 }
439
gen9_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)440 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
441 struct i915_wa_list *wal)
442 {
443 struct drm_i915_private *i915 = engine->i915;
444
445 if (HAS_LLC(i915)) {
446 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
447 *
448 * Must match Display Engine. See
449 * WaCompressedResourceDisplayNewHashMode.
450 */
451 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
452 GEN9_PBE_COMPRESSED_HASH_SELECTION);
453 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
454 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
455 }
456
457 /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
458 /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
459 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
460 FLOW_CONTROL_ENABLE |
461 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
462
463 /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
464 /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
465 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
466 GEN9_ENABLE_YV12_BUGFIX |
467 GEN9_ENABLE_GPGPU_PREEMPTION);
468
469 /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
470 /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
471 wa_masked_en(wal, CACHE_MODE_1,
472 GEN8_4x4_STC_OPTIMIZATION_DISABLE |
473 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
474
475 /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
476 wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
477 GEN9_CCS_TLB_PREFETCH_ENABLE);
478
479 /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
480 wa_masked_en(wal, HDC_CHICKEN0,
481 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
482 HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
483
484 /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
485 * both tied to WaForceContextSaveRestoreNonCoherent
486 * in some hsds for skl. We keep the tie for all gen9. The
487 * documentation is a bit hazy and so we want to get common behaviour,
488 * even though there is no clear evidence we would need both on kbl/bxt.
489 * This area has been source of system hangs so we play it safe
490 * and mimic the skl regardless of what bspec says.
491 *
492 * Use Force Non-Coherent whenever executing a 3D context. This
493 * is a workaround for a possible hang in the unlikely event
494 * a TLB invalidation occurs during a PSD flush.
495 */
496
497 /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
498 wa_masked_en(wal, HDC_CHICKEN0,
499 HDC_FORCE_NON_COHERENT);
500
501 /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
502 if (IS_SKYLAKE(i915) ||
503 IS_KABYLAKE(i915) ||
504 IS_COFFEELAKE(i915) ||
505 IS_COMETLAKE(i915))
506 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
507 GEN8_SAMPLER_POWER_BYPASS_DIS);
508
509 /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
510 wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
511
512 /*
513 * Supporting preemption with fine-granularity requires changes in the
514 * batch buffer programming. Since we can't break old userspace, we
515 * need to set our default preemption level to safe value. Userspace is
516 * still able to use more fine-grained preemption levels, since in
517 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
518 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
519 * not real HW workarounds, but merely a way to start using preemption
520 * while maintaining old contract with userspace.
521 */
522
523 /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
524 wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
525
526 /* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
527 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
528 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
529 GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
530
531 /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
532 if (IS_GEN9_LP(i915))
533 wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
534 }
535
skl_tune_iz_hashing(struct intel_engine_cs * engine,struct i915_wa_list * wal)536 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
537 struct i915_wa_list *wal)
538 {
539 struct intel_gt *gt = engine->gt;
540 u8 vals[3] = { 0, 0, 0 };
541 unsigned int i;
542
543 for (i = 0; i < 3; i++) {
544 u8 ss;
545
546 /*
547 * Only consider slices where one, and only one, subslice has 7
548 * EUs
549 */
550 if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
551 continue;
552
553 /*
554 * subslice_7eu[i] != 0 (because of the check above) and
555 * ss_max == 4 (maximum number of subslices possible per slice)
556 *
557 * -> 0 <= ss <= 3;
558 */
559 ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
560 vals[i] = 3 - ss;
561 }
562
563 if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
564 return;
565
566 /* Tune IZ hashing. See intel_device_info_runtime_init() */
567 wa_masked_field_set(wal, GEN7_GT_MODE,
568 GEN9_IZ_HASHING_MASK(2) |
569 GEN9_IZ_HASHING_MASK(1) |
570 GEN9_IZ_HASHING_MASK(0),
571 GEN9_IZ_HASHING(2, vals[2]) |
572 GEN9_IZ_HASHING(1, vals[1]) |
573 GEN9_IZ_HASHING(0, vals[0]));
574 }
575
/* Skylake context workarounds: the common gen9 set followed by IZ hashing tuning. */
static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	/* Common gen9 entries go in first. */
	gen9_ctx_workarounds_init(engine, wal);

	/* Then the skl-specific GT_MODE tuning. */
	skl_tune_iz_hashing(engine, wal);
}
582
bxt_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)583 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
584 struct i915_wa_list *wal)
585 {
586 gen9_ctx_workarounds_init(engine, wal);
587
588 /* WaDisableThreadStallDopClockGating:bxt */
589 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
590 STALL_DOP_GATING_DISABLE);
591
592 /* WaToEnableHwFixForPushConstHWBug:bxt */
593 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
594 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
595 }
596
kbl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)597 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
598 struct i915_wa_list *wal)
599 {
600 struct drm_i915_private *i915 = engine->i915;
601
602 gen9_ctx_workarounds_init(engine, wal);
603
604 /* WaToEnableHwFixForPushConstHWBug:kbl */
605 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
606 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
607 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
608
609 /* WaDisableSbeCacheDispatchPortSharing:kbl */
610 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
611 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
612 }
613
glk_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)614 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
615 struct i915_wa_list *wal)
616 {
617 gen9_ctx_workarounds_init(engine, wal);
618
619 /* WaToEnableHwFixForPushConstHWBug:glk */
620 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
621 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
622 }
623
cfl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)624 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
625 struct i915_wa_list *wal)
626 {
627 gen9_ctx_workarounds_init(engine, wal);
628
629 /* WaToEnableHwFixForPushConstHWBug:cfl */
630 wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
631 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
632
633 /* WaDisableSbeCacheDispatchPortSharing:cfl */
634 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
635 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
636 }
637
icl_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)638 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
639 struct i915_wa_list *wal)
640 {
641 /* Wa_1406697149 (WaDisableBankHangMode:icl) */
642 wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
643
644 /* WaForceEnableNonCoherent:icl
645 * This is not the same workaround as in early Gen9 platforms, where
646 * lacking this could cause system hangs, but coherency performance
647 * overhead is high and only a few compute workloads really need it
648 * (the register is whitelisted in hardware now, so UMDs can opt in
649 * for coherency if they have a good reason).
650 */
651 wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
652
653 /* WaEnableFloatBlendOptimization:icl */
654 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
655 _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
656 0 /* write-only, so skip validation */,
657 true);
658
659 /* WaDisableGPGPUMidThreadPreemption:icl */
660 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
661 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
662 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
663
664 /* allow headerless messages for preemptible GPGPU context */
665 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
666 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
667
668 /* Wa_1604278689:icl,ehl */
669 wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
670 wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
671 0,
672 0xFFFFFFFF);
673
674 /* Wa_1406306137:icl,ehl */
675 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
676 }
677
678 /*
679 * These settings aren't actually workarounds, but general tuning settings that
680 * need to be programmed on dg2 platform.
681 */
dg2_ctx_gt_tuning_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)682 static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
683 struct i915_wa_list *wal)
684 {
685 wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
686 wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
687 REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
688 wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
689 FF_MODE2_TDS_TIMER_128);
690 }
691
gen12_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)692 static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
693 struct i915_wa_list *wal)
694 {
695 struct drm_i915_private *i915 = engine->i915;
696
697 /*
698 * Wa_1409142259:tgl,dg1,adl-p
699 * Wa_1409347922:tgl,dg1,adl-p
700 * Wa_1409252684:tgl,dg1,adl-p
701 * Wa_1409217633:tgl,dg1,adl-p
702 * Wa_1409207793:tgl,dg1,adl-p
703 * Wa_1409178076:tgl,dg1,adl-p
704 * Wa_1408979724:tgl,dg1,adl-p
705 * Wa_14010443199:tgl,rkl,dg1,adl-p
706 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
707 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
708 */
709 wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
710 GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
711
712 /* WaDisableGPGPUMidThreadPreemption:gen12 */
713 wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
714 GEN9_PREEMPT_GPGPU_LEVEL_MASK,
715 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
716
717 /*
718 * Wa_16011163337 - GS_TIMER
719 *
720 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
721 * need to program it even on those that don't explicitly list that
722 * workaround.
723 *
724 * Note that the programming of GEN12_FF_MODE2 is further modified
725 * according to the FF_MODE2 guidance given by Wa_1608008084.
726 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
727 * value when read from the CPU.
728 *
729 * The default value for this register is zero for all fields.
730 * So instead of doing a RMW we should just write the desired values
731 * for TDS and GS timers. Note that since the readback can't be trusted,
732 * the clear mask is just set to ~0 to make sure other bits are not
733 * inadvertently set. For the same reason read verification is ignored.
734 */
735 wa_add(wal,
736 GEN12_FF_MODE2,
737 ~0,
738 FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
739 0, false);
740
741 if (!IS_DG1(i915)) {
742 /* Wa_1806527549 */
743 wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
744
745 /* Wa_1606376872 */
746 wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
747 }
748 }
749
dg1_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)750 static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
751 struct i915_wa_list *wal)
752 {
753 gen12_ctx_workarounds_init(engine, wal);
754
755 /* Wa_1409044764 */
756 wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
757 DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
758
759 /* Wa_22010493298 */
760 wa_masked_en(wal, HIZ_CHICKEN,
761 DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
762 }
763
dg2_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)764 static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
765 struct i915_wa_list *wal)
766 {
767 dg2_ctx_gt_tuning_init(engine, wal);
768
769 /* Wa_16013271637:dg2 */
770 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
771 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
772
773 /* Wa_14014947963:dg2 */
774 wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
775
776 /* Wa_18018764978:dg2 */
777 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
778
779 /* Wa_15010599737:dg2 */
780 wa_mcr_masked_en(wal, CHICKEN_RASTER_1, DIS_SF_ROUND_NEAREST_EVEN);
781
782 /* Wa_18019271663:dg2 */
783 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
784 }
785
xelpg_ctx_gt_tuning_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)786 static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
787 struct i915_wa_list *wal)
788 {
789 struct intel_gt *gt = engine->gt;
790
791 dg2_ctx_gt_tuning_init(engine, wal);
792
793 /*
794 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
795 * gen12_emit_indirect_ctx_rcs() rather than here on some early
796 * steppings.
797 */
798 if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
799 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
800 wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
801 }
802
xelpg_ctx_workarounds_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)803 static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
804 struct i915_wa_list *wal)
805 {
806 struct intel_gt *gt = engine->gt;
807
808 xelpg_ctx_gt_tuning_init(engine, wal);
809
810 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
811 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
812 /* Wa_14014947963 */
813 wa_masked_field_set(wal, VF_PREEMPTION,
814 PREEMPTION_VERTEX_COUNT, 0x4000);
815
816 /* Wa_16013271637 */
817 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
818 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
819
820 /* Wa_18019627453 */
821 wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
822
823 /* Wa_18018764978 */
824 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
825 }
826
827 /* Wa_18019271663 */
828 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
829 }
830
fakewa_disable_nestedbb_mode(struct intel_engine_cs * engine,struct i915_wa_list * wal)831 static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
832 struct i915_wa_list *wal)
833 {
834 /*
835 * This is a "fake" workaround defined by software to ensure we
836 * maintain reliable, backward-compatible behavior for userspace with
837 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
838 *
839 * The per-context setting of MI_MODE[12] determines whether the bits
840 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
841 * in the traditional manner or whether they should instead use a new
842 * tgl+ meaning that breaks backward compatibility, but allows nesting
843 * into 3rd-level batchbuffers. When this new capability was first
844 * added in TGL, it remained off by default unless a context
845 * intentionally opted in to the new behavior. However Xe_HPG now
846 * flips this on by default and requires that we explicitly opt out if
847 * we don't want the new behavior.
848 *
849 * From a SW perspective, we want to maintain the backward-compatible
850 * behavior for userspace, so we'll apply a fake workaround to set it
851 * back to the legacy behavior on platforms where the hardware default
852 * is to break compatibility. At the moment there is no Linux
853 * userspace that utilizes third-level batchbuffers, so this will avoid
854 * userspace from needing to make any changes. using the legacy
855 * meaning is the correct thing to do. If/when we have userspace
856 * consumers that want to utilize third-level batch nesting, we can
857 * provide a context parameter to allow them to opt-in.
858 */
859 wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
860 }
861
gen12_ctx_gt_mocs_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)862 static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
863 struct i915_wa_list *wal)
864 {
865 u8 mocs;
866
867 /*
868 * Some blitter commands do not have a field for MOCS, those
869 * commands will use MOCS index pointed by BLIT_CCTL.
870 * BLIT_CCTL registers are needed to be programmed to un-cached.
871 */
872 if (engine->class == COPY_ENGINE_CLASS) {
873 mocs = engine->gt->mocs.uc_index;
874 wa_write_clr_set(wal,
875 BLIT_CCTL(engine->mmio_base),
876 BLIT_CCTL_MASK,
877 BLIT_CCTL_MOCS(mocs, mocs));
878 }
879 }
880
881 /*
882 * gen12_ctx_gt_fake_wa_init() aren't programmingan official workaround
883 * defined by the hardware team, but it programming general context registers.
884 * Adding those context register programming in context workaround
885 * allow us to use the wa framework for proper application and validation.
886 */
887 static void
gen12_ctx_gt_fake_wa_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)888 gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
889 struct i915_wa_list *wal)
890 {
891 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
892 fakewa_disable_nestedbb_mode(engine, wal);
893
894 gen12_ctx_gt_mocs_init(engine, wal);
895 }
896
897 static void
__intel_engine_init_ctx_wa(struct intel_engine_cs * engine,struct i915_wa_list * wal,const char * name)898 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
899 struct i915_wa_list *wal,
900 const char *name)
901 {
902 struct drm_i915_private *i915 = engine->i915;
903
904 wa_init_start(wal, engine->gt, name, engine->name);
905
906 /* Applies to all engines */
907 /*
908 * Fake workarounds are not the actual workaround but
909 * programming of context registers using workaround framework.
910 */
911 if (GRAPHICS_VER(i915) >= 12)
912 gen12_ctx_gt_fake_wa_init(engine, wal);
913
914 if (engine->class != RENDER_CLASS)
915 goto done;
916
917 if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
918 xelpg_ctx_workarounds_init(engine, wal);
919 else if (IS_PONTEVECCHIO(i915))
920 ; /* noop; none at this time */
921 else if (IS_DG2(i915))
922 dg2_ctx_workarounds_init(engine, wal);
923 else if (IS_XEHPSDV(i915))
924 ; /* noop; none at this time */
925 else if (IS_DG1(i915))
926 dg1_ctx_workarounds_init(engine, wal);
927 else if (GRAPHICS_VER(i915) == 12)
928 gen12_ctx_workarounds_init(engine, wal);
929 else if (GRAPHICS_VER(i915) == 11)
930 icl_ctx_workarounds_init(engine, wal);
931 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
932 cfl_ctx_workarounds_init(engine, wal);
933 else if (IS_GEMINILAKE(i915))
934 glk_ctx_workarounds_init(engine, wal);
935 else if (IS_KABYLAKE(i915))
936 kbl_ctx_workarounds_init(engine, wal);
937 else if (IS_BROXTON(i915))
938 bxt_ctx_workarounds_init(engine, wal);
939 else if (IS_SKYLAKE(i915))
940 skl_ctx_workarounds_init(engine, wal);
941 else if (IS_CHERRYVIEW(i915))
942 chv_ctx_workarounds_init(engine, wal);
943 else if (IS_BROADWELL(i915))
944 bdw_ctx_workarounds_init(engine, wal);
945 else if (GRAPHICS_VER(i915) == 7)
946 gen7_ctx_workarounds_init(engine, wal);
947 else if (GRAPHICS_VER(i915) == 6)
948 gen6_ctx_workarounds_init(engine, wal);
949 else if (GRAPHICS_VER(i915) < 8)
950 ;
951 else
952 MISSING_CASE(GRAPHICS_VER(i915));
953
954 done:
955 wa_init_finish(wal);
956 }
957
intel_engine_init_ctx_wa(struct intel_engine_cs * engine)958 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
959 {
960 __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
961 }
962
intel_engine_emit_ctx_wa(struct i915_request * rq)963 int intel_engine_emit_ctx_wa(struct i915_request *rq)
964 {
965 struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
966 struct intel_uncore *uncore = rq->engine->uncore;
967 enum forcewake_domains fw;
968 unsigned long flags;
969 struct i915_wa *wa;
970 unsigned int i;
971 u32 *cs;
972 int ret;
973
974 if (wal->count == 0)
975 return 0;
976
977 ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
978 if (ret)
979 return ret;
980
981 cs = intel_ring_begin(rq, (wal->count * 2 + 2));
982 if (IS_ERR(cs))
983 return PTR_ERR(cs);
984
985 fw = wal_get_fw_for_rmw(uncore, wal);
986
987 intel_gt_mcr_lock(wal->gt, &flags);
988 spin_lock(&uncore->lock);
989 intel_uncore_forcewake_get__locked(uncore, fw);
990
991 *cs++ = MI_LOAD_REGISTER_IMM(wal->count);
992 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
993 u32 val;
994
995 /* Skip reading the register if it's not really needed */
996 if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
997 val = wa->set;
998 } else {
999 val = wa->is_mcr ?
1000 intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1001 intel_uncore_read_fw(uncore, wa->reg);
1002 val &= ~wa->clr;
1003 val |= wa->set;
1004 }
1005
1006 *cs++ = i915_mmio_reg_offset(wa->reg);
1007 *cs++ = val;
1008 }
1009 *cs++ = MI_NOOP;
1010
1011 intel_uncore_forcewake_put__locked(uncore, fw);
1012 spin_unlock(&uncore->lock);
1013 intel_gt_mcr_unlock(wal->gt, flags);
1014
1015 intel_ring_advance(rq, cs);
1016
1017 ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1018 if (ret)
1019 return ret;
1020
1021 return 0;
1022 }
1023
1024 static void
gen4_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1025 gen4_gt_workarounds_init(struct intel_gt *gt,
1026 struct i915_wa_list *wal)
1027 {
1028 /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1029 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1030 }
1031
1032 static void
g4x_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1033 g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1034 {
1035 gen4_gt_workarounds_init(gt, wal);
1036
1037 /* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1038 wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1039 }
1040
1041 static void
ilk_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1042 ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1043 {
1044 g4x_gt_workarounds_init(gt, wal);
1045
1046 wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1047 }
1048
/* GT workarounds for snb: intentionally empty, nothing is added to @wal. */
static void snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
}
1053
1054 static void
ivb_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1055 ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1056 {
1057 /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1058 wa_masked_dis(wal,
1059 GEN7_COMMON_SLICE_CHICKEN1,
1060 GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1061
1062 /* WaApplyL3ControlAndL3ChickenMode:ivb */
1063 wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1064 wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1065
1066 /* WaForceL3Serialization:ivb */
1067 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1068 }
1069
1070 static void
vlv_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1071 vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1072 {
1073 /* WaForceL3Serialization:vlv */
1074 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1075
1076 /*
1077 * WaIncreaseL3CreditsForVLVB0:vlv
1078 * This is the hardware default actually.
1079 */
1080 wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1081 }
1082
1083 static void
hsw_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1084 hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1085 {
1086 /* L3 caching of data atomics doesn't work -- disable it. */
1087 wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1088
1089 wa_add(wal,
1090 HSW_ROW_CHICKEN3, 0,
1091 _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1092 0 /* XXX does this reg exist? */, true);
1093
1094 /* WaVSRefCountFullforceMissDisable:hsw */
1095 wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1096 }
1097
1098 static void
gen9_wa_init_mcr(struct drm_i915_private * i915,struct i915_wa_list * wal)1099 gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1100 {
1101 const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1102 unsigned int slice, subslice;
1103 u32 mcr, mcr_mask;
1104
1105 GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1106
1107 /*
1108 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1109 * Before any MMIO read into slice/subslice specific registers, MCR
1110 * packet control register needs to be programmed to point to any
1111 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1112 * This means each subsequent MMIO read will be forwarded to an
1113 * specific s/ss combination, but this is OK since these registers
1114 * are consistent across s/ss in almost all cases. In the rare
1115 * occasions, such as INSTDONE, where this value is dependent
1116 * on s/ss combo, the read should be done with read_subslice_reg.
1117 */
1118 slice = ffs(sseu->slice_mask) - 1;
1119 GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1120 subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1121 GEM_BUG_ON(!subslice);
1122 subslice--;
1123
1124 /*
1125 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1126 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1127 */
1128 mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1129 mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1130
1131 drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1132
1133 wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1134 }
1135
1136 static void
gen9_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1137 gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1138 {
1139 struct drm_i915_private *i915 = gt->i915;
1140
1141 /* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1142 gen9_wa_init_mcr(i915, wal);
1143
1144 /* WaDisableKillLogic:bxt,skl,kbl */
1145 if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1146 wa_write_or(wal,
1147 GAM_ECOCHK,
1148 ECOCHK_DIS_TLB);
1149
1150 if (HAS_LLC(i915)) {
1151 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1152 *
1153 * Must match Display Engine. See
1154 * WaCompressedResourceDisplayNewHashMode.
1155 */
1156 wa_write_or(wal,
1157 MMCD_MISC_CTRL,
1158 MMCD_PCLA | MMCD_HOTSPOT_EN);
1159 }
1160
1161 /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1162 wa_write_or(wal,
1163 GAM_ECOCHK,
1164 BDW_DISABLE_HDC_INVALIDATION);
1165 }
1166
1167 static void
skl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1168 skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1169 {
1170 gen9_gt_workarounds_init(gt, wal);
1171
1172 /* WaDisableGafsUnitClkGating:skl */
1173 wa_write_or(wal,
1174 GEN7_UCGCTL4,
1175 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1176
1177 /* WaInPlaceDecompressionHang:skl */
1178 if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1179 wa_write_or(wal,
1180 GEN9_GAMT_ECO_REG_RW_IA,
1181 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1182 }
1183
1184 static void
kbl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1185 kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1186 {
1187 gen9_gt_workarounds_init(gt, wal);
1188
1189 /* WaDisableDynamicCreditSharing:kbl */
1190 if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1191 wa_write_or(wal,
1192 GAMT_CHKN_BIT_REG,
1193 GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1194
1195 /* WaDisableGafsUnitClkGating:kbl */
1196 wa_write_or(wal,
1197 GEN7_UCGCTL4,
1198 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1199
1200 /* WaInPlaceDecompressionHang:kbl */
1201 wa_write_or(wal,
1202 GEN9_GAMT_ECO_REG_RW_IA,
1203 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1204 }
1205
/* GT workarounds for glk: nothing beyond the common gen9 list. */
static void glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(gt, wal);
}
1211
1212 static void
cfl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1213 cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1214 {
1215 gen9_gt_workarounds_init(gt, wal);
1216
1217 /* WaDisableGafsUnitClkGating:cfl */
1218 wa_write_or(wal,
1219 GEN7_UCGCTL4,
1220 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1221
1222 /* WaInPlaceDecompressionHang:cfl */
1223 wa_write_or(wal,
1224 GEN9_GAMT_ECO_REG_RW_IA,
1225 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1226 }
1227
/*
 * Add an entry to @wal that replaces the GEN11 slice/subslice selector
 * fields of @steering_reg with @slice and @subslice.
 */
static void __set_mcr_steering(struct i915_wa_list *wal,
			       i915_reg_t steering_reg,
			       unsigned int slice, unsigned int subslice)
{
	const u32 fields = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
	const u32 select = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);

	wa_write_clr_set(wal, steering_reg, fields, select);
}
1239
debug_dump_steering(struct intel_gt * gt)1240 static void debug_dump_steering(struct intel_gt *gt)
1241 {
1242 struct drm_printer p = drm_debug_printer("MCR Steering:");
1243
1244 if (drm_debug_enabled(DRM_UT_DRIVER))
1245 intel_gt_mcr_report_steering(&p, gt, false);
1246 }
1247
__add_mcr_wa(struct intel_gt * gt,struct i915_wa_list * wal,unsigned int slice,unsigned int subslice)1248 static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1249 unsigned int slice, unsigned int subslice)
1250 {
1251 __set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1252
1253 gt->default_steering.groupid = slice;
1254 gt->default_steering.instanceid = subslice;
1255
1256 debug_dump_steering(gt);
1257 }
1258
1259 static void
icl_wa_init_mcr(struct intel_gt * gt,struct i915_wa_list * wal)1260 icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1261 {
1262 const struct sseu_dev_info *sseu = >->info.sseu;
1263 unsigned int subslice;
1264
1265 GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1266 GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1267
1268 /*
1269 * Although a platform may have subslices, we need to always steer
1270 * reads to the lowest instance that isn't fused off. When Render
1271 * Power Gating is enabled, grabbing forcewake will only power up a
1272 * single subslice (the "minconfig") if there isn't a real workload
1273 * that needs to be run; this means that if we steer register reads to
1274 * one of the higher subslices, we run the risk of reading back 0's or
1275 * random garbage.
1276 */
1277 subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1278
1279 /*
1280 * If the subslice we picked above also steers us to a valid L3 bank,
1281 * then we can just rely on the default steering and won't need to
1282 * worry about explicitly re-steering L3BANK reads later.
1283 */
1284 if (gt->info.l3bank_mask & BIT(subslice))
1285 gt->steering_table[L3BANK] = NULL;
1286
1287 __add_mcr_wa(gt, wal, 0, subslice);
1288 }
1289
1290 static void
xehp_init_mcr(struct intel_gt * gt,struct i915_wa_list * wal)1291 xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1292 {
1293 const struct sseu_dev_info *sseu = >->info.sseu;
1294 unsigned long slice, subslice = 0, slice_mask = 0;
1295 u32 lncf_mask = 0;
1296 int i;
1297
1298 /*
1299 * On Xe_HP the steering increases in complexity. There are now several
1300 * more units that require steering and we're not guaranteed to be able
1301 * to find a common setting for all of them. These are:
1302 * - GSLICE (fusable)
1303 * - DSS (sub-unit within gslice; fusable)
1304 * - L3 Bank (fusable)
1305 * - MSLICE (fusable)
1306 * - LNCF (sub-unit within mslice; always present if mslice is present)
1307 *
1308 * We'll do our default/implicit steering based on GSLICE (in the
1309 * sliceid field) and DSS (in the subsliceid field). If we can
1310 * find overlap between the valid MSLICE and/or LNCF values with
1311 * a suitable GSLICE, then we can just re-use the default value and
1312 * skip and explicit steering at runtime.
1313 *
1314 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1315 * a valid sliceid value. DSS steering is the only type of steering
1316 * that utilizes the 'subsliceid' bits.
1317 *
1318 * Also note that, even though the steering domain is called "GSlice"
1319 * and it is encoded in the register using the gslice format, the spec
1320 * says that the combined (geometry | compute) fuse should be used to
1321 * select the steering.
1322 */
1323
1324 /* Find the potential gslice candidates */
1325 slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1326 GEN_DSS_PER_GSLICE);
1327
1328 /*
1329 * Find the potential LNCF candidates. Either LNCF within a valid
1330 * mslice is fine.
1331 */
1332 for_each_set_bit(i, >->info.mslice_mask, GEN12_MAX_MSLICES)
1333 lncf_mask |= (0x3 << (i * 2));
1334
1335 /*
1336 * Are there any sliceid values that work for both GSLICE and LNCF
1337 * steering?
1338 */
1339 if (slice_mask & lncf_mask) {
1340 slice_mask &= lncf_mask;
1341 gt->steering_table[LNCF] = NULL;
1342 }
1343
1344 /* How about sliceid values that also work for MSLICE steering? */
1345 if (slice_mask & gt->info.mslice_mask) {
1346 slice_mask &= gt->info.mslice_mask;
1347 gt->steering_table[MSLICE] = NULL;
1348 }
1349
1350 if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
1351 gt->steering_table[GAM] = NULL;
1352
1353 slice = __ffs(slice_mask);
1354 subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1355 GEN_DSS_PER_GSLICE;
1356
1357 __add_mcr_wa(gt, wal, slice, subslice);
1358
1359 /*
1360 * SQIDI ranges are special because they use different steering
1361 * registers than everything else we work with. On XeHP SDV and
1362 * DG2-G10, any value in the steering registers will work fine since
1363 * all instances are present, but DG2-G11 only has SQIDI instances at
1364 * ID's 2 and 3, so we need to steer to one of those. For simplicity
1365 * we'll just steer to a hardcoded "2" since that value will work
1366 * everywhere.
1367 */
1368 __set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1369 __set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1370
1371 /*
1372 * On DG2, GAM registers have a dedicated steering control register
1373 * and must always be programmed to a hardcoded groupid of "1."
1374 */
1375 if (IS_DG2(gt->i915))
1376 __set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1377 }
1378
1379 static void
pvc_init_mcr(struct intel_gt * gt,struct i915_wa_list * wal)1380 pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1381 {
1382 unsigned int dss;
1383
1384 /*
1385 * Setup implicit steering for COMPUTE and DSS ranges to the first
1386 * non-fused-off DSS. All other types of MCR registers will be
1387 * explicitly steered.
1388 */
1389 dss = intel_sseu_find_first_xehp_dss(>->info.sseu, 0, 0);
1390 __add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1391 }
1392
1393 static void
icl_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1394 icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1395 {
1396 struct drm_i915_private *i915 = gt->i915;
1397
1398 icl_wa_init_mcr(gt, wal);
1399
1400 /* WaModifyGamTlbPartitioning:icl */
1401 wa_write_clr_set(wal,
1402 GEN11_GACB_PERF_CTRL,
1403 GEN11_HASH_CTRL_MASK,
1404 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1405
1406 /* Wa_1405766107:icl
1407 * Formerly known as WaCL2SFHalfMaxAlloc
1408 */
1409 wa_write_or(wal,
1410 GEN11_LSN_UNSLCVC,
1411 GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1412 GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1413
1414 /* Wa_220166154:icl
1415 * Formerly known as WaDisCtxReload
1416 */
1417 wa_write_or(wal,
1418 GEN8_GAMW_ECO_DEV_RW_IA,
1419 GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1420
1421 /* Wa_1406463099:icl
1422 * Formerly known as WaGamTlbPendError
1423 */
1424 wa_write_or(wal,
1425 GAMT_CHKN_BIT_REG,
1426 GAMT_CHKN_DISABLE_L3_COH_PIPE);
1427
1428 /*
1429 * Wa_1408615072:icl,ehl (vsunit)
1430 * Wa_1407596294:icl,ehl (hsunit)
1431 */
1432 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1433 VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1434
1435 /* Wa_1407352427:icl,ehl */
1436 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1437 PSDUNIT_CLKGATE_DIS);
1438
1439 /* Wa_1406680159:icl,ehl */
1440 wa_mcr_write_or(wal,
1441 GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1442 GWUNIT_CLKGATE_DIS);
1443
1444 /* Wa_1607087056:icl,ehl,jsl */
1445 if (IS_ICELAKE(i915) ||
1446 ((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1447 IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1448 wa_write_or(wal,
1449 GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1450 L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1451
1452 /*
1453 * This is not a documented workaround, but rather an optimization
1454 * to reduce sampler power.
1455 */
1456 wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1457 }
1458
1459 /*
1460 * Though there are per-engine instances of these registers,
1461 * they retain their value through engine resets and should
1462 * only be provided on the GT workaround list rather than
1463 * the engine-specific workaround list.
1464 */
1465 static void
wa_14011060649(struct intel_gt * gt,struct i915_wa_list * wal)1466 wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1467 {
1468 struct intel_engine_cs *engine;
1469 int id;
1470
1471 for_each_engine(engine, gt, id) {
1472 if (engine->class != VIDEO_DECODE_CLASS ||
1473 (engine->instance % 2))
1474 continue;
1475
1476 wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1477 IECPUNIT_CLKGATE_DIS);
1478 }
1479 }
1480
1481 static void
gen12_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1482 gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1483 {
1484 icl_wa_init_mcr(gt, wal);
1485
1486 /* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1487 wa_14011060649(gt, wal);
1488
1489 /* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1490 wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1491
1492 /*
1493 * Wa_14015795083
1494 *
1495 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1496 * preventing i915 from modifying it for this workaround. Skip the
1497 * readback verification for this workaround on debug builds; if the
1498 * workaround doesn't stick due to firmware behavior, it's not an error
1499 * that we want CI to flag.
1500 */
1501 wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1502 0, 0, false);
1503 }
1504
1505 static void
dg1_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1506 dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1507 {
1508 gen12_gt_workarounds_init(gt, wal);
1509
1510 /* Wa_1409420604:dg1 */
1511 wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1512 CPSSUNIT_CLKGATE_DIS);
1513
1514 /* Wa_1408615072:dg1 */
1515 /* Empirical testing shows this register is unaffected by engine reset. */
1516 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1517 }
1518
1519 static void
xehpsdv_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1520 xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1521 {
1522 struct drm_i915_private *i915 = gt->i915;
1523
1524 xehp_init_mcr(gt, wal);
1525
1526 /* Wa_1409757795:xehpsdv */
1527 wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1528
1529 /* Wa_18011725039:xehpsdv */
1530 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
1531 wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
1532 wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
1533 }
1534
1535 /* Wa_16011155590:xehpsdv */
1536 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1537 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1538 TSGUNIT_CLKGATE_DIS);
1539
1540 /* Wa_14011780169:xehpsdv */
1541 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1542 wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1543 GAMTLBVDBOX7_CLKGATE_DIS |
1544 GAMTLBVDBOX6_CLKGATE_DIS |
1545 GAMTLBVDBOX5_CLKGATE_DIS |
1546 GAMTLBVDBOX4_CLKGATE_DIS |
1547 GAMTLBVDBOX3_CLKGATE_DIS |
1548 GAMTLBVDBOX2_CLKGATE_DIS |
1549 GAMTLBVDBOX1_CLKGATE_DIS |
1550 GAMTLBVDBOX0_CLKGATE_DIS |
1551 GAMTLBKCR_CLKGATE_DIS |
1552 GAMTLBGUC_CLKGATE_DIS |
1553 GAMTLBBLT_CLKGATE_DIS);
1554 wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1555 GAMTLBGFXA1_CLKGATE_DIS |
1556 GAMTLBCOMPA0_CLKGATE_DIS |
1557 GAMTLBCOMPA1_CLKGATE_DIS |
1558 GAMTLBCOMPB0_CLKGATE_DIS |
1559 GAMTLBCOMPB1_CLKGATE_DIS |
1560 GAMTLBCOMPC0_CLKGATE_DIS |
1561 GAMTLBCOMPC1_CLKGATE_DIS |
1562 GAMTLBCOMPD0_CLKGATE_DIS |
1563 GAMTLBCOMPD1_CLKGATE_DIS |
1564 GAMTLBMERT_CLKGATE_DIS |
1565 GAMTLBVEBOX3_CLKGATE_DIS |
1566 GAMTLBVEBOX2_CLKGATE_DIS |
1567 GAMTLBVEBOX1_CLKGATE_DIS |
1568 GAMTLBVEBOX0_CLKGATE_DIS);
1569 }
1570
1571 /* Wa_16012725990:xehpsdv */
1572 if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1573 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1574
1575 /* Wa_14011060649:xehpsdv */
1576 wa_14011060649(gt, wal);
1577
1578 /* Wa_14012362059:xehpsdv */
1579 wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
1580
1581 /* Wa_14014368820:xehpsdv */
1582 wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1583 INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1584
1585 /* Wa_14010670810:xehpsdv */
1586 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1587 }
1588
1589 static void
dg2_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1590 dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1591 {
1592 xehp_init_mcr(gt, wal);
1593
1594 /* Wa_14011060649:dg2 */
1595 wa_14011060649(gt, wal);
1596
1597 if (IS_DG2_G10(gt->i915)) {
1598 /* Wa_22010523718:dg2 */
1599 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1600 CG3DDISCFEG_CLKGATE_DIS);
1601
1602 /* Wa_14011006942:dg2 */
1603 wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1604 DSS_ROUTER_CLKGATE_DIS);
1605 }
1606
1607 /* Wa_14014830051:dg2 */
1608 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1609
1610 /* Wa_14015795083 */
1611 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1612
1613 /* Wa_18018781329 */
1614 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1615 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1616 wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1617 wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1618
1619 /* Wa_1509235366:dg2 */
1620 wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1621 INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1622
1623 /* Wa_14010648519:dg2 */
1624 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1625 }
1626
1627 static void
pvc_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1628 pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1629 {
1630 pvc_init_mcr(gt, wal);
1631
1632 /* Wa_14015795083 */
1633 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1634
1635 /* Wa_18018781329 */
1636 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1637 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1638 wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1639 wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1640
1641 /* Wa_16016694945 */
1642 wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
1643 }
1644
1645 static void
xelpg_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1646 xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1647 {
1648 /* Wa_14018575942 / Wa_18018781329 */
1649 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1650 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1651
1652 /* Wa_22016670082 */
1653 wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1654
1655 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1656 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1657 /* Wa_14014830051 */
1658 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1659
1660 /* Wa_14015795083 */
1661 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1662 }
1663
1664 /*
1665 * Unlike older platforms, we no longer setup implicit steering here;
1666 * all MCR accesses are explicitly steered.
1667 */
1668 debug_dump_steering(gt);
1669 }
1670
1671 static void
xelpmp_gt_workarounds_init(struct intel_gt * gt,struct i915_wa_list * wal)1672 xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1673 {
1674 /*
1675 * Wa_14018778641
1676 * Wa_18018781329
1677 *
1678 * Note that although these registers are MCR on the primary
1679 * GT, the media GT's versions are regular singleton registers.
1680 */
1681 wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1682
1683 debug_dump_steering(gt);
1684 }
1685
1686 /*
1687 * The bspec performance guide has recommended MMIO tuning settings. These
1688 * aren't truly "workarounds" but we want to program them through the
1689 * workaround infrastructure to make sure they're (re)applied at the proper
1690 * times.
1691 *
1692 * The programming in this function is for settings that persist through
1693 * engine resets and also are not part of any engine's register state context.
1694 * I.e., settings that only need to be re-applied in the event of a full GT
1695 * reset.
1696 */
gt_tuning_settings(struct intel_gt * gt,struct i915_wa_list * wal)1697 static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1698 {
1699 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1700 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1701 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1702 }
1703
1704 if (IS_PONTEVECCHIO(gt->i915)) {
1705 wa_mcr_write(wal, XEHPC_L3SCRUB,
1706 SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
1707 wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_HOSTCACHEEN);
1708 }
1709
1710 if (IS_DG2(gt->i915)) {
1711 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1712 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1713 }
1714 }
1715
1716 static void
gt_init_workarounds(struct intel_gt * gt,struct i915_wa_list * wal)1717 gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1718 {
1719 struct drm_i915_private *i915 = gt->i915;
1720
1721 gt_tuning_settings(gt, wal);
1722
1723 if (gt->type == GT_MEDIA) {
1724 if (MEDIA_VER(i915) >= 13)
1725 xelpmp_gt_workarounds_init(gt, wal);
1726 else
1727 MISSING_CASE(MEDIA_VER(i915));
1728
1729 return;
1730 }
1731
1732 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1733 xelpg_gt_workarounds_init(gt, wal);
1734 else if (IS_PONTEVECCHIO(i915))
1735 pvc_gt_workarounds_init(gt, wal);
1736 else if (IS_DG2(i915))
1737 dg2_gt_workarounds_init(gt, wal);
1738 else if (IS_XEHPSDV(i915))
1739 xehpsdv_gt_workarounds_init(gt, wal);
1740 else if (IS_DG1(i915))
1741 dg1_gt_workarounds_init(gt, wal);
1742 else if (GRAPHICS_VER(i915) == 12)
1743 gen12_gt_workarounds_init(gt, wal);
1744 else if (GRAPHICS_VER(i915) == 11)
1745 icl_gt_workarounds_init(gt, wal);
1746 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1747 cfl_gt_workarounds_init(gt, wal);
1748 else if (IS_GEMINILAKE(i915))
1749 glk_gt_workarounds_init(gt, wal);
1750 else if (IS_KABYLAKE(i915))
1751 kbl_gt_workarounds_init(gt, wal);
1752 else if (IS_BROXTON(i915))
1753 gen9_gt_workarounds_init(gt, wal);
1754 else if (IS_SKYLAKE(i915))
1755 skl_gt_workarounds_init(gt, wal);
1756 else if (IS_HASWELL(i915))
1757 hsw_gt_workarounds_init(gt, wal);
1758 else if (IS_VALLEYVIEW(i915))
1759 vlv_gt_workarounds_init(gt, wal);
1760 else if (IS_IVYBRIDGE(i915))
1761 ivb_gt_workarounds_init(gt, wal);
1762 else if (GRAPHICS_VER(i915) == 6)
1763 snb_gt_workarounds_init(gt, wal);
1764 else if (GRAPHICS_VER(i915) == 5)
1765 ilk_gt_workarounds_init(gt, wal);
1766 else if (IS_G4X(i915))
1767 g4x_gt_workarounds_init(gt, wal);
1768 else if (GRAPHICS_VER(i915) == 4)
1769 gen4_gt_workarounds_init(gt, wal);
1770 else if (GRAPHICS_VER(i915) <= 8)
1771 ;
1772 else
1773 MISSING_CASE(GRAPHICS_VER(i915));
1774 }
1775
intel_gt_init_workarounds(struct intel_gt * gt)1776 void intel_gt_init_workarounds(struct intel_gt *gt)
1777 {
1778 struct i915_wa_list *wal = >->wa_list;
1779
1780 wa_init_start(wal, gt, "GT", "global");
1781 gt_init_workarounds(gt, wal);
1782 wa_init_finish(wal);
1783 }
1784
1785 static bool
wa_verify(struct intel_gt * gt,const struct i915_wa * wa,u32 cur,const char * name,const char * from)1786 wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1787 const char *name, const char *from)
1788 {
1789 if ((cur ^ wa->set) & wa->read) {
1790 drm_err(>->i915->drm,
1791 "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1792 name, from, i915_mmio_reg_offset(wa->reg),
1793 cur, cur & wa->read, wa->set & wa->read);
1794
1795 return false;
1796 }
1797
1798 return true;
1799 }
1800
wa_list_apply(const struct i915_wa_list * wal)1801 static void wa_list_apply(const struct i915_wa_list *wal)
1802 {
1803 struct intel_gt *gt = wal->gt;
1804 struct intel_uncore *uncore = gt->uncore;
1805 enum forcewake_domains fw;
1806 unsigned long flags;
1807 struct i915_wa *wa;
1808 unsigned int i;
1809
1810 if (!wal->count)
1811 return;
1812
1813 fw = wal_get_fw_for_rmw(uncore, wal);
1814
1815 intel_gt_mcr_lock(gt, &flags);
1816 spin_lock(&uncore->lock);
1817 intel_uncore_forcewake_get__locked(uncore, fw);
1818
1819 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1820 u32 val, old = 0;
1821
1822 /* open-coded rmw due to steering */
1823 if (wa->clr)
1824 old = wa->is_mcr ?
1825 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1826 intel_uncore_read_fw(uncore, wa->reg);
1827 val = (old & ~wa->clr) | wa->set;
1828 if (val != old || !wa->clr) {
1829 if (wa->is_mcr)
1830 intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1831 else
1832 intel_uncore_write_fw(uncore, wa->reg, val);
1833 }
1834
1835 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1836 u32 val = wa->is_mcr ?
1837 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1838 intel_uncore_read_fw(uncore, wa->reg);
1839
1840 wa_verify(gt, wa, val, wal->name, "application");
1841 }
1842 }
1843
1844 intel_uncore_forcewake_put__locked(uncore, fw);
1845 spin_unlock(&uncore->lock);
1846 intel_gt_mcr_unlock(gt, flags);
1847 }
1848
intel_gt_apply_workarounds(struct intel_gt * gt)1849 void intel_gt_apply_workarounds(struct intel_gt *gt)
1850 {
1851 wa_list_apply(>->wa_list);
1852 }
1853
wa_list_verify(struct intel_gt * gt,const struct i915_wa_list * wal,const char * from)1854 static bool wa_list_verify(struct intel_gt *gt,
1855 const struct i915_wa_list *wal,
1856 const char *from)
1857 {
1858 struct intel_uncore *uncore = gt->uncore;
1859 struct i915_wa *wa;
1860 enum forcewake_domains fw;
1861 unsigned long flags;
1862 unsigned int i;
1863 bool ok = true;
1864
1865 fw = wal_get_fw_for_rmw(uncore, wal);
1866
1867 intel_gt_mcr_lock(gt, &flags);
1868 spin_lock(&uncore->lock);
1869 intel_uncore_forcewake_get__locked(uncore, fw);
1870
1871 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1872 ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1873 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1874 intel_uncore_read_fw(uncore, wa->reg),
1875 wal->name, from);
1876
1877 intel_uncore_forcewake_put__locked(uncore, fw);
1878 spin_unlock(&uncore->lock);
1879 intel_gt_mcr_unlock(gt, flags);
1880
1881 return ok;
1882 }
1883
intel_gt_verify_workarounds(struct intel_gt * gt,const char * from)1884 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1885 {
1886 return wa_list_verify(gt, >->wa_list, from);
1887 }
1888
1889 __maybe_unused
is_nonpriv_flags_valid(u32 flags)1890 static bool is_nonpriv_flags_valid(u32 flags)
1891 {
1892 /* Check only valid flag bits are set */
1893 if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1894 return false;
1895
1896 /* NB: Only 3 out of 4 enum values are valid for access field */
1897 if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1898 RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1899 return false;
1900
1901 return true;
1902 }
1903
1904 static void
whitelist_reg_ext(struct i915_wa_list * wal,i915_reg_t reg,u32 flags)1905 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1906 {
1907 struct i915_wa wa = {
1908 .reg = reg
1909 };
1910
1911 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1912 return;
1913
1914 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1915 return;
1916
1917 wa.reg.reg |= flags;
1918 _wa_add(wal, &wa);
1919 }
1920
1921 static void
whitelist_mcr_reg_ext(struct i915_wa_list * wal,i915_mcr_reg_t reg,u32 flags)1922 whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1923 {
1924 struct i915_wa wa = {
1925 .mcr_reg = reg,
1926 .is_mcr = 1,
1927 };
1928
1929 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1930 return;
1931
1932 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1933 return;
1934
1935 wa.mcr_reg.reg |= flags;
1936 _wa_add(wal, &wa);
1937 }
1938
1939 static void
whitelist_reg(struct i915_wa_list * wal,i915_reg_t reg)1940 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1941 {
1942 whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1943 }
1944
1945 static void
whitelist_mcr_reg(struct i915_wa_list * wal,i915_mcr_reg_t reg)1946 whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1947 {
1948 whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1949 }
1950
gen9_whitelist_build(struct i915_wa_list * w)1951 static void gen9_whitelist_build(struct i915_wa_list *w)
1952 {
1953 /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1954 whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1955
1956 /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1957 whitelist_reg(w, GEN8_CS_CHICKEN1);
1958
1959 /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1960 whitelist_reg(w, GEN8_HDC_CHICKEN1);
1961
1962 /* WaSendPushConstantsFromMMIO:skl,bxt */
1963 whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1964 }
1965
skl_whitelist_build(struct intel_engine_cs * engine)1966 static void skl_whitelist_build(struct intel_engine_cs *engine)
1967 {
1968 struct i915_wa_list *w = &engine->whitelist;
1969
1970 if (engine->class != RENDER_CLASS)
1971 return;
1972
1973 gen9_whitelist_build(w);
1974
1975 /* WaDisableLSQCROPERFforOCL:skl */
1976 whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1977 }
1978
bxt_whitelist_build(struct intel_engine_cs * engine)1979 static void bxt_whitelist_build(struct intel_engine_cs *engine)
1980 {
1981 if (engine->class != RENDER_CLASS)
1982 return;
1983
1984 gen9_whitelist_build(&engine->whitelist);
1985 }
1986
kbl_whitelist_build(struct intel_engine_cs * engine)1987 static void kbl_whitelist_build(struct intel_engine_cs *engine)
1988 {
1989 struct i915_wa_list *w = &engine->whitelist;
1990
1991 if (engine->class != RENDER_CLASS)
1992 return;
1993
1994 gen9_whitelist_build(w);
1995
1996 /* WaDisableLSQCROPERFforOCL:kbl */
1997 whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1998 }
1999
glk_whitelist_build(struct intel_engine_cs * engine)2000 static void glk_whitelist_build(struct intel_engine_cs *engine)
2001 {
2002 struct i915_wa_list *w = &engine->whitelist;
2003
2004 if (engine->class != RENDER_CLASS)
2005 return;
2006
2007 gen9_whitelist_build(w);
2008
2009 /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
2010 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2011 }
2012
cfl_whitelist_build(struct intel_engine_cs * engine)2013 static void cfl_whitelist_build(struct intel_engine_cs *engine)
2014 {
2015 struct i915_wa_list *w = &engine->whitelist;
2016
2017 if (engine->class != RENDER_CLASS)
2018 return;
2019
2020 gen9_whitelist_build(w);
2021
2022 /*
2023 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
2024 *
2025 * This covers 4 register which are next to one another :
2026 * - PS_INVOCATION_COUNT
2027 * - PS_INVOCATION_COUNT_UDW
2028 * - PS_DEPTH_COUNT
2029 * - PS_DEPTH_COUNT_UDW
2030 */
2031 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2032 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2033 RING_FORCE_TO_NONPRIV_RANGE_4);
2034 }
2035
allow_read_ctx_timestamp(struct intel_engine_cs * engine)2036 static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
2037 {
2038 struct i915_wa_list *w = &engine->whitelist;
2039
2040 if (engine->class != RENDER_CLASS)
2041 whitelist_reg_ext(w,
2042 RING_CTX_TIMESTAMP(engine->mmio_base),
2043 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2044 }
2045
/* CML whitelist: context-timestamp read access, then the CFL entries. */
static void cml_whitelist_build(struct intel_engine_cs *engine)
{
	allow_read_ctx_timestamp(engine);
	cfl_whitelist_build(engine);
}
2052
icl_whitelist_build(struct intel_engine_cs * engine)2053 static void icl_whitelist_build(struct intel_engine_cs *engine)
2054 {
2055 struct i915_wa_list *w = &engine->whitelist;
2056
2057 allow_read_ctx_timestamp(engine);
2058
2059 switch (engine->class) {
2060 case RENDER_CLASS:
2061 /* WaAllowUMDToModifyHalfSliceChicken7:icl */
2062 whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
2063
2064 /* WaAllowUMDToModifySamplerMode:icl */
2065 whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
2066
2067 /* WaEnableStateCacheRedirectToCS:icl */
2068 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2069
2070 /*
2071 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
2072 *
2073 * This covers 4 register which are next to one another :
2074 * - PS_INVOCATION_COUNT
2075 * - PS_INVOCATION_COUNT_UDW
2076 * - PS_DEPTH_COUNT
2077 * - PS_DEPTH_COUNT_UDW
2078 */
2079 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2080 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2081 RING_FORCE_TO_NONPRIV_RANGE_4);
2082 break;
2083
2084 case VIDEO_DECODE_CLASS:
2085 /* hucStatusRegOffset */
2086 whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2087 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2088 /* hucUKernelHdrInfoRegOffset */
2089 whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2090 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2091 /* hucStatus2RegOffset */
2092 whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2093 RING_FORCE_TO_NONPRIV_ACCESS_RD);
2094 break;
2095
2096 default:
2097 break;
2098 }
2099 }
2100
tgl_whitelist_build(struct intel_engine_cs * engine)2101 static void tgl_whitelist_build(struct intel_engine_cs *engine)
2102 {
2103 struct i915_wa_list *w = &engine->whitelist;
2104
2105 allow_read_ctx_timestamp(engine);
2106
2107 switch (engine->class) {
2108 case RENDER_CLASS:
2109 /*
2110 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2111 * Wa_1408556865:tgl
2112 *
2113 * This covers 4 registers which are next to one another :
2114 * - PS_INVOCATION_COUNT
2115 * - PS_INVOCATION_COUNT_UDW
2116 * - PS_DEPTH_COUNT
2117 * - PS_DEPTH_COUNT_UDW
2118 */
2119 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2120 RING_FORCE_TO_NONPRIV_ACCESS_RD |
2121 RING_FORCE_TO_NONPRIV_RANGE_4);
2122
2123 /*
2124 * Wa_1808121037:tgl
2125 * Wa_14012131227:dg1
2126 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2127 */
2128 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2129
2130 /* Wa_1806527549:tgl */
2131 whitelist_reg(w, HIZ_CHICKEN);
2132
2133 /* Required by recommended tuning setting (not a workaround) */
2134 whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2135
2136 break;
2137 default:
2138 break;
2139 }
2140 }
2141
dg2_whitelist_build(struct intel_engine_cs * engine)2142 static void dg2_whitelist_build(struct intel_engine_cs *engine)
2143 {
2144 struct i915_wa_list *w = &engine->whitelist;
2145
2146 switch (engine->class) {
2147 case RENDER_CLASS:
2148 /* Required by recommended tuning setting (not a workaround) */
2149 whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2150
2151 break;
2152 default:
2153 break;
2154 }
2155 }
2156
blacklist_trtt(struct intel_engine_cs * engine)2157 static void blacklist_trtt(struct intel_engine_cs *engine)
2158 {
2159 struct i915_wa_list *w = &engine->whitelist;
2160
2161 /*
2162 * Prevent read/write access to [0x4400, 0x4600) which covers
2163 * the TRTT range across all engines. Note that normally userspace
2164 * cannot access the other engines' trtt control, but for simplicity
2165 * we cover the entire range on each engine.
2166 */
2167 whitelist_reg_ext(w, _MMIO(0x4400),
2168 RING_FORCE_TO_NONPRIV_DENY |
2169 RING_FORCE_TO_NONPRIV_RANGE_64);
2170 whitelist_reg_ext(w, _MMIO(0x4500),
2171 RING_FORCE_TO_NONPRIV_DENY |
2172 RING_FORCE_TO_NONPRIV_RANGE_64);
2173 }
2174
/* PVC whitelist: only the TRTT deny entries (Wa_16014440446:pvc). */
static void pvc_whitelist_build(struct intel_engine_cs *engine)
{
	blacklist_trtt(engine);
}
2180
xelpg_whitelist_build(struct intel_engine_cs * engine)2181 static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2182 {
2183 struct i915_wa_list *w = &engine->whitelist;
2184
2185 switch (engine->class) {
2186 case RENDER_CLASS:
2187 /* Required by recommended tuning setting (not a workaround) */
2188 whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2189
2190 break;
2191 default:
2192 break;
2193 }
2194 }
2195
intel_engine_init_whitelist(struct intel_engine_cs * engine)2196 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2197 {
2198 struct drm_i915_private *i915 = engine->i915;
2199 struct i915_wa_list *w = &engine->whitelist;
2200
2201 wa_init_start(w, engine->gt, "whitelist", engine->name);
2202
2203 if (engine->gt->type == GT_MEDIA)
2204 ; /* none yet */
2205 else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2206 xelpg_whitelist_build(engine);
2207 else if (IS_PONTEVECCHIO(i915))
2208 pvc_whitelist_build(engine);
2209 else if (IS_DG2(i915))
2210 dg2_whitelist_build(engine);
2211 else if (IS_XEHPSDV(i915))
2212 ; /* none needed */
2213 else if (GRAPHICS_VER(i915) == 12)
2214 tgl_whitelist_build(engine);
2215 else if (GRAPHICS_VER(i915) == 11)
2216 icl_whitelist_build(engine);
2217 else if (IS_COMETLAKE(i915))
2218 cml_whitelist_build(engine);
2219 else if (IS_COFFEELAKE(i915))
2220 cfl_whitelist_build(engine);
2221 else if (IS_GEMINILAKE(i915))
2222 glk_whitelist_build(engine);
2223 else if (IS_KABYLAKE(i915))
2224 kbl_whitelist_build(engine);
2225 else if (IS_BROXTON(i915))
2226 bxt_whitelist_build(engine);
2227 else if (IS_SKYLAKE(i915))
2228 skl_whitelist_build(engine);
2229 else if (GRAPHICS_VER(i915) <= 8)
2230 ;
2231 else
2232 MISSING_CASE(GRAPHICS_VER(i915));
2233
2234 wa_init_finish(w);
2235 }
2236
intel_engine_apply_whitelist(struct intel_engine_cs * engine)2237 void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2238 {
2239 const struct i915_wa_list *wal = &engine->whitelist;
2240 struct intel_uncore *uncore = engine->uncore;
2241 const u32 base = engine->mmio_base;
2242 struct i915_wa *wa;
2243 unsigned int i;
2244
2245 if (!wal->count)
2246 return;
2247
2248 for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2249 intel_uncore_write(uncore,
2250 RING_FORCE_TO_NONPRIV(base, i),
2251 i915_mmio_reg_offset(wa->reg));
2252
2253 /* And clear the rest just in case of garbage */
2254 for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2255 intel_uncore_write(uncore,
2256 RING_FORCE_TO_NONPRIV(base, i),
2257 i915_mmio_reg_offset(RING_NOPID(base)));
2258 }
2259
2260 /*
2261 * engine_fake_wa_init(), a place holder to program the registers
2262 * which are not part of an official workaround defined by the
2263 * hardware team.
2264 * Adding programming of those register inside workaround will
2265 * allow utilizing wa framework to proper application and verification.
2266 */
2267 static void
engine_fake_wa_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)2268 engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2269 {
2270 u8 mocs_w, mocs_r;
2271
2272 /*
2273 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2274 * by the command streamer when executing commands that don't have
2275 * a way to explicitly specify a MOCS setting. The default should
2276 * usually reference whichever MOCS entry corresponds to uncached
2277 * behavior, although use of a WB cached entry is recommended by the
2278 * spec in certain circumstances on specific platforms.
2279 */
2280 if (GRAPHICS_VER(engine->i915) >= 12) {
2281 mocs_r = engine->gt->mocs.uc_index;
2282 mocs_w = engine->gt->mocs.uc_index;
2283
2284 if (HAS_L3_CCS_READ(engine->i915) &&
2285 engine->class == COMPUTE_CLASS) {
2286 mocs_r = engine->gt->mocs.wb_index;
2287
2288 /*
2289 * Even on the few platforms where MOCS 0 is a
2290 * legitimate table entry, it's never the correct
2291 * setting to use here; we can assume the MOCS init
2292 * just forgot to initialize wb_index.
2293 */
2294 drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2295 }
2296
2297 wa_masked_field_set(wal,
2298 RING_CMD_CCTL(engine->mmio_base),
2299 CMD_CCTL_MOCS_MASK,
2300 CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2301 }
2302 }
2303
2304 static void
rcs_engine_wa_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)2305 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2306 {
2307 struct drm_i915_private *i915 = engine->i915;
2308 struct intel_gt *gt = engine->gt;
2309
2310 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2311 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2312 /* Wa_22014600077 */
2313 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2314 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2315 }
2316
2317 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2318 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2319 IS_DG2(i915)) {
2320 /* Wa_1509727124 */
2321 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2322 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2323 }
2324
2325 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2326 IS_DG2(i915)) {
2327 /* Wa_22012856258 */
2328 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2329 GEN12_DISABLE_READ_SUPPRESSION);
2330 }
2331
2332 if (IS_DG2(i915)) {
2333 /*
2334 * Wa_22010960976:dg2
2335 * Wa_14013347512:dg2
2336 */
2337 wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2338 LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2339 }
2340
2341 if (IS_DG2_G11(i915) || IS_DG2_G10(i915)) {
2342 /* Wa_22014600077:dg2 */
2343 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2344 _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH),
2345 0 /* Wa_14012342262 write-only reg, so skip verification */,
2346 true);
2347 }
2348
2349 if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2350 IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2351 /*
2352 * Wa_1606700617:tgl,dg1,adl-p
2353 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2354 * Wa_14010826681:tgl,dg1,rkl,adl-p
2355 * Wa_18019627453:dg2
2356 */
2357 wa_masked_en(wal,
2358 GEN9_CS_DEBUG_MODE1,
2359 FF_DOP_CLOCK_GATE_DISABLE);
2360 }
2361
2362 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2363 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2364 /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2365 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2366
2367 /*
2368 * Wa_1407928979:tgl A*
2369 * Wa_18011464164:tgl[B0+],dg1[B0+]
2370 * Wa_22010931296:tgl[B0+],dg1[B0+]
2371 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2372 */
2373 wa_write_or(wal, GEN7_FF_THREAD_MODE,
2374 GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2375
2376 /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2377 wa_mcr_masked_en(wal,
2378 GEN10_SAMPLER_MODE,
2379 ENABLE_SMALLPL);
2380 }
2381
2382 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2383 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2384 /* Wa_1409804808 */
2385 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2386 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2387
2388 /* Wa_14010229206 */
2389 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2390 }
2391
2392 if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2393 /*
2394 * Wa_1607297627
2395 *
2396 * On TGL and RKL there are multiple entries for this WA in the
2397 * BSpec; some indicate this is an A0-only WA, others indicate
2398 * it applies to all steppings so we trust the "all steppings."
2399 */
2400 wa_masked_en(wal,
2401 RING_PSMI_CTL(RENDER_RING_BASE),
2402 GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2403 GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2404 }
2405
2406 if (GRAPHICS_VER(i915) == 11) {
2407 /* This is not an Wa. Enable for better image quality */
2408 wa_masked_en(wal,
2409 _3D_CHICKEN3,
2410 _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2411
2412 /*
2413 * Wa_1405543622:icl
2414 * Formerly known as WaGAPZPriorityScheme
2415 */
2416 wa_write_or(wal,
2417 GEN8_GARBCNTL,
2418 GEN11_ARBITRATION_PRIO_ORDER_MASK);
2419
2420 /*
2421 * Wa_1604223664:icl
2422 * Formerly known as WaL3BankAddressHashing
2423 */
2424 wa_write_clr_set(wal,
2425 GEN8_GARBCNTL,
2426 GEN11_HASH_CTRL_EXCL_MASK,
2427 GEN11_HASH_CTRL_EXCL_BIT0);
2428 wa_write_clr_set(wal,
2429 GEN11_GLBLINVL,
2430 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2431 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2432
2433 /*
2434 * Wa_1405733216:icl
2435 * Formerly known as WaDisableCleanEvicts
2436 */
2437 wa_mcr_write_or(wal,
2438 GEN8_L3SQCREG4,
2439 GEN11_LQSC_CLEAN_EVICT_DISABLE);
2440
2441 /* Wa_1606682166:icl */
2442 wa_write_or(wal,
2443 GEN7_SARCHKMD,
2444 GEN7_DISABLE_SAMPLER_PREFETCH);
2445
2446 /* Wa_1409178092:icl */
2447 wa_mcr_write_clr_set(wal,
2448 GEN11_SCRATCH2,
2449 GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2450 0);
2451
2452 /* WaEnable32PlaneMode:icl */
2453 wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2454 GEN11_ENABLE_32_PLANE_MODE);
2455
2456 /*
2457 * Wa_1408767742:icl[a2..forever],ehl[all]
2458 * Wa_1605460711:icl[a0..c0]
2459 */
2460 wa_write_or(wal,
2461 GEN7_FF_THREAD_MODE,
2462 GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2463
2464 /* Wa_22010271021 */
2465 wa_masked_en(wal,
2466 GEN9_CS_DEBUG_MODE1,
2467 FF_DOP_CLOCK_GATE_DISABLE);
2468 }
2469
2470 /*
2471 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2472 * beyond) allow the kernel-mode driver to choose between two different
2473 * options for controlling preemption granularity and behavior.
2474 *
2475 * Option 1 (hardware default):
2476 * Preemption settings are controlled in a global manner via
2477 * kernel-only register CS_DEBUG_MODE1 (0x20EC). Any granularity
2478 * and settings chosen by the kernel-mode driver will apply to all
2479 * userspace clients.
2480 *
2481 * Option 2:
2482 * Preemption settings are controlled on a per-context basis via
2483 * register CS_CHICKEN1 (0x2580). CS_CHICKEN1 is saved/restored on
2484 * context switch and is writable by userspace (e.g., via
2485 * MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2486 * which allows different userspace drivers/clients to select
2487 * different settings, or to change those settings on the fly in
2488 * response to runtime needs. This option was known by name
2489 * "FtrPerCtxtPreemptionGranularityControl" at one time, although
2490 * that name is somewhat misleading as other non-granularity
2491 * preemption settings are also impacted by this decision.
2492 *
2493 * On Linux, our policy has always been to let userspace drivers
2494 * control preemption granularity/settings (Option 2). This was
2495 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2496 * userspace developed before object-level preemption was enabled would
2497 * not behave well if i915 were to go with Option 1 and enable that
2498 * preemption in a global manner). On gen9 each context would have
2499 * object-level preemption disabled by default (see
2500 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2501 * userspace drivers could opt-in to object-level preemption as they
2502 * saw fit. For post-gen9 platforms, we continue to utilize Option 2;
2503 * even though it is no longer necessary for ABI compatibility when
2504 * enabling a new platform, it does ensure that userspace will be able
2505 * to implement any workarounds that show up requiring temporary
2506 * adjustments to preemption behavior at runtime.
2507 *
2508 * Notes/Workarounds:
2509 * - Wa_14015141709: On DG2 and early steppings of MTL,
2510 * CS_CHICKEN1[0] does not disable object-level preemption as
2511 * it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2512 * using Option 1). Effectively this means userspace is unable
2513 * to disable object-level preemption on these platforms/steppings
2514 * despite the setting here.
2515 *
2516 * - Wa_16013994831: May require that userspace program
2517 * CS_CHICKEN1[10] when certain runtime conditions are true.
2518 * Userspace requires Option 2 to be in effect for their update of
2519 * CS_CHICKEN1[10] to be effective.
2520 *
2521 * Other workarounds may appear in the future that will also require
2522 * Option 2 behavior to allow proper userspace implementation.
2523 */
2524 if (GRAPHICS_VER(i915) >= 9)
2525 wa_masked_en(wal,
2526 GEN7_FF_SLICE_CS_CHICKEN1,
2527 GEN9_FFSC_PERCTX_PREEMPT_CTRL);
2528
2529 if (IS_SKYLAKE(i915) ||
2530 IS_KABYLAKE(i915) ||
2531 IS_COFFEELAKE(i915) ||
2532 IS_COMETLAKE(i915)) {
2533 /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2534 wa_write_or(wal,
2535 GEN8_GARBCNTL,
2536 GEN9_GAPS_TSV_CREDIT_DISABLE);
2537 }
2538
2539 if (IS_BROXTON(i915)) {
2540 /* WaDisablePooledEuLoadBalancingFix:bxt */
2541 wa_masked_en(wal,
2542 FF_SLICE_CS_CHICKEN2,
2543 GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2544 }
2545
2546 if (GRAPHICS_VER(i915) == 9) {
2547 /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2548 wa_masked_en(wal,
2549 GEN9_CSFE_CHICKEN1_RCS,
2550 GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2551
2552 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2553 wa_mcr_write_or(wal,
2554 BDW_SCRATCH1,
2555 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2556
2557 /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2558 if (IS_GEN9_LP(i915))
2559 wa_mcr_write_clr_set(wal,
2560 GEN8_L3SQCREG1,
2561 L3_PRIO_CREDITS_MASK,
2562 L3_GENERAL_PRIO_CREDITS(62) |
2563 L3_HIGH_PRIO_CREDITS(2));
2564
2565 /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2566 wa_mcr_write_or(wal,
2567 GEN8_L3SQCREG4,
2568 GEN8_LQSC_FLUSH_COHERENT_LINES);
2569
2570 /* Disable atomics in L3 to prevent unrecoverable hangs */
2571 wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2572 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2573 wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2574 GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2575 wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2576 EVICTION_PERF_FIX_ENABLE, 0);
2577 }
2578
2579 if (IS_HASWELL(i915)) {
2580 /* WaSampleCChickenBitEnable:hsw */
2581 wa_masked_en(wal,
2582 HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2583
2584 wa_masked_dis(wal,
2585 CACHE_MODE_0_GEN7,
2586 /* enable HiZ Raw Stall Optimization */
2587 HIZ_RAW_STALL_OPT_DISABLE);
2588 }
2589
2590 if (IS_VALLEYVIEW(i915)) {
2591 /* WaDisableEarlyCull:vlv */
2592 wa_masked_en(wal,
2593 _3D_CHICKEN3,
2594 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2595
2596 /*
2597 * WaVSThreadDispatchOverride:ivb,vlv
2598 *
2599 * This actually overrides the dispatch
2600 * mode for all thread types.
2601 */
2602 wa_write_clr_set(wal,
2603 GEN7_FF_THREAD_MODE,
2604 GEN7_FF_SCHED_MASK,
2605 GEN7_FF_TS_SCHED_HW |
2606 GEN7_FF_VS_SCHED_HW |
2607 GEN7_FF_DS_SCHED_HW);
2608
2609 /* WaPsdDispatchEnable:vlv */
2610 /* WaDisablePSDDualDispatchEnable:vlv */
2611 wa_masked_en(wal,
2612 GEN7_HALF_SLICE_CHICKEN1,
2613 GEN7_MAX_PS_THREAD_DEP |
2614 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2615 }
2616
2617 if (IS_IVYBRIDGE(i915)) {
2618 /* WaDisableEarlyCull:ivb */
2619 wa_masked_en(wal,
2620 _3D_CHICKEN3,
2621 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2622
2623 if (0) { /* causes HiZ corruption on ivb:gt1 */
2624 /* enable HiZ Raw Stall Optimization */
2625 wa_masked_dis(wal,
2626 CACHE_MODE_0_GEN7,
2627 HIZ_RAW_STALL_OPT_DISABLE);
2628 }
2629
2630 /*
2631 * WaVSThreadDispatchOverride:ivb,vlv
2632 *
2633 * This actually overrides the dispatch
2634 * mode for all thread types.
2635 */
2636 wa_write_clr_set(wal,
2637 GEN7_FF_THREAD_MODE,
2638 GEN7_FF_SCHED_MASK,
2639 GEN7_FF_TS_SCHED_HW |
2640 GEN7_FF_VS_SCHED_HW |
2641 GEN7_FF_DS_SCHED_HW);
2642
2643 /* WaDisablePSDDualDispatchEnable:ivb */
2644 if (IS_IVB_GT1(i915))
2645 wa_masked_en(wal,
2646 GEN7_HALF_SLICE_CHICKEN1,
2647 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2648 }
2649
2650 if (GRAPHICS_VER(i915) == 7) {
2651 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2652 wa_masked_en(wal,
2653 RING_MODE_GEN7(RENDER_RING_BASE),
2654 GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2655
2656 /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2657 wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2658
2659 /*
2660 * BSpec says this must be set, even though
2661 * WaDisable4x2SubspanOptimization:ivb,hsw
2662 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2663 */
2664 wa_masked_en(wal,
2665 CACHE_MODE_1,
2666 PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2667
2668 /*
2669 * BSpec recommends 8x4 when MSAA is used,
2670 * however in practice 16x4 seems fastest.
2671 *
2672 * Note that PS/WM thread counts depend on the WIZ hashing
2673 * disable bit, which we don't touch here, but it's good
2674 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2675 */
2676 wa_masked_field_set(wal,
2677 GEN7_GT_MODE,
2678 GEN6_WIZ_HASHING_MASK,
2679 GEN6_WIZ_HASHING_16x4);
2680 }
2681
2682 if (IS_GRAPHICS_VER(i915, 6, 7))
2683 /*
2684 * We need to disable the AsyncFlip performance optimisations in
2685 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2686 * already be programmed to '1' on all products.
2687 *
2688 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2689 */
2690 wa_masked_en(wal,
2691 RING_MI_MODE(RENDER_RING_BASE),
2692 ASYNC_FLIP_PERF_DISABLE);
2693
2694 if (GRAPHICS_VER(i915) == 6) {
2695 /*
2696 * Required for the hardware to program scanline values for
2697 * waiting
2698 * WaEnableFlushTlbInvalidationMode:snb
2699 */
2700 wa_masked_en(wal,
2701 GFX_MODE,
2702 GFX_TLB_INVALIDATE_EXPLICIT);
2703
2704 /* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2705 wa_masked_en(wal,
2706 _3D_CHICKEN,
2707 _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2708
2709 wa_masked_en(wal,
2710 _3D_CHICKEN3,
2711 /* WaStripsFansDisableFastClipPerformanceFix:snb */
2712 _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2713 /*
2714 * Bspec says:
2715 * "This bit must be set if 3DSTATE_CLIP clip mode is set
2716 * to normal and 3DSTATE_SF number of SF output attributes
2717 * is more than 16."
2718 */
2719 _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2720
2721 /*
2722 * BSpec recommends 8x4 when MSAA is used,
2723 * however in practice 16x4 seems fastest.
2724 *
2725 * Note that PS/WM thread counts depend on the WIZ hashing
2726 * disable bit, which we don't touch here, but it's good
2727 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2728 */
2729 wa_masked_field_set(wal,
2730 GEN6_GT_MODE,
2731 GEN6_WIZ_HASHING_MASK,
2732 GEN6_WIZ_HASHING_16x4);
2733
2734 /* WaDisable_RenderCache_OperationalFlush:snb */
2735 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2736
2737 /*
2738 * From the Sandybridge PRM, volume 1 part 3, page 24:
2739 * "If this bit is set, STCunit will have LRA as replacement
2740 * policy. [...] This bit must be reset. LRA replacement
2741 * policy is not supported."
2742 */
2743 wa_masked_dis(wal,
2744 CACHE_MODE_0,
2745 CM0_STC_EVICT_DISABLE_LRA_SNB);
2746 }
2747
2748 if (IS_GRAPHICS_VER(i915, 4, 6))
2749 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2750 wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2751 0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2752 /* XXX bit doesn't stick on Broadwater */
2753 IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2754
2755 if (GRAPHICS_VER(i915) == 4)
2756 /*
2757 * Disable CONSTANT_BUFFER before it is loaded from the context
2758 * image. For as it is loaded, it is executed and the stored
2759 * address may no longer be valid, leading to a GPU hang.
2760 *
2761 * This imposes the requirement that userspace reload their
2762 * CONSTANT_BUFFER on every batch, fortunately a requirement
2763 * they are already accustomed to from before contexts were
2764 * enabled.
2765 */
2766 wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2767 0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2768 0 /* XXX bit doesn't stick on Broadwater */,
2769 true);
2770 }
2771
2772 static void
xcs_engine_wa_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)2773 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2774 {
2775 struct drm_i915_private *i915 = engine->i915;
2776
2777 /* WaKBLVECSSemaphoreWaitPoll:kbl */
2778 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2779 wa_write(wal,
2780 RING_SEMA_WAIT_POLL(engine->mmio_base),
2781 1);
2782 }
2783 }
2784
2785 static void
ccs_engine_wa_init(struct intel_engine_cs * engine,struct i915_wa_list * wal)2786 ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2787 {
2788 if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2789 /* Wa_14014999345:pvc */
2790 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2791 }
2792 }
2793
2794 /*
2795 * The bspec performance guide has recommended MMIO tuning settings. These
2796 * aren't truly "workarounds" but we want to program them with the same
2797 * workaround infrastructure to ensure that they're automatically added to
2798 * the GuC save/restore lists, re-applied at the right times, and checked for
2799 * any conflicting programming requested by real workarounds.
2800 *
2801 * Programming settings should be added here only if their registers are not
2802 * part of an engine's register state context. If a register is part of a
2803 * context, then any tuning settings should be programmed in an appropriate
2804 * function invoked by __intel_engine_init_ctx_wa().
2805 */
2806 static void
add_render_compute_tuning_settings(struct intel_gt * gt,struct i915_wa_list * wal)2807 add_render_compute_tuning_settings(struct intel_gt *gt,
2808 struct i915_wa_list *wal)
2809 {
2810 struct drm_i915_private *i915 = gt->i915;
2811
2812 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2813 wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2814
2815 /*
2816 * This tuning setting proves beneficial only on ATS-M designs; the
2817 * default "age based" setting is optimal on regular DG2 and other
2818 * platforms.
2819 */
2820 if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2821 wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2822 THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2823
2824 if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
2825 wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2826 }
2827
ccs_engine_wa_mode(struct intel_engine_cs * engine,struct i915_wa_list * wal)2828 static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2829 {
2830 struct intel_gt *gt = engine->gt;
2831 u32 mode;
2832
2833 if (!IS_DG2(gt->i915))
2834 return;
2835
2836 /*
2837 * Wa_14019159160: This workaround, along with others, leads to
2838 * significant challenges in utilizing load balancing among the
2839 * CCS slices. Consequently, an architectural decision has been
2840 * made to completely disable automatic CCS load balancing.
2841 */
2842 wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2843
2844 /*
2845 * After having disabled automatic load balancing we need to
2846 * assign all slices to a single CCS. We will call it CCS mode 1
2847 */
2848 mode = intel_gt_apply_ccs_mode(gt);
2849 wa_masked_en(wal, XEHP_CCS_MODE, mode);
2850 }
2851
/*
 * The workarounds in this function apply to shared registers in
 * the general render reset domain that aren't tied to a
 * specific engine. Since all render+compute engines get reset
 * together, and the contents of these registers are lost during
 * the shared render domain reset, we'll define such workarounds
 * here and then add them to just a single RCS or CCS engine's
 * workaround list (whichever engine has the
 * I915_ENGINE_FIRST_RENDER_COMPUTE flag).
 *
 * The tuning settings from add_render_compute_tuning_settings() are
 * added first, followed by the platform/stepping specific entries below.
 */
static void
general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;
	struct intel_gt *gt = engine->gt;

	add_render_compute_tuning_settings(gt, wal);

	if (GRAPHICS_VER(i915) >= 11) {
		/* This is not a Wa (although referred to as
		 * WaSetInidrectStateOverride in places), this allows
		 * applications that reference sampler states through
		 * the BindlessSamplerStateBaseAddress to have their
		 * border color relative to DynamicStateBaseAddress
		 * rather than BindlessSamplerStateBaseAddress.
		 *
		 * Otherwise SAMPLER_STATE border colors have to be
		 * copied in multiple heaps (DynamicStateBaseAddress &
		 * BindlessSamplerStateBaseAddress)
		 *
		 * BSpec: 46052
		 */
		wa_mcr_masked_en(wal,
				 GEN10_SAMPLER_MODE,
				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
	}

	/* IP 12.70/12.71 from stepping B0 onwards, and all of IP 12.74. */
	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
	    IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74)))
		/* Wa_14017856879 */
		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
		/*
		 * Wa_14017066071
		 * Wa_14017654203
		 */
		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
				 MTL_DISABLE_SAMPLER_SC_OOO);

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
		/* Wa_22015279794 */
		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
				 DISABLE_PREFETCH_INTO_IC);

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_DG2(i915)) {
		/* Wa_22013037850 */
		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
				DISABLE_128B_EVICTION_COMMAND_UDW);

		/* Wa_18017747507 */
		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
	}

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_PONTEVECCHIO(i915) ||
	    IS_DG2(i915)) {
		/* Wa_22014226127 */
		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
	}

	if (IS_PONTEVECCHIO(i915) || IS_DG2(i915)) {
		/* Wa_14015227452:dg2,pvc */
		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);

		/* Wa_16015675438:dg2,pvc */
		wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
	}

	if (IS_DG2(i915)) {
		/*
		 * Wa_16011620976:dg2_g11
		 * Wa_22015475538:dg2
		 */
		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
	}

	if (IS_DG2_G11(i915)) {
		/*
		 * Wa_22012826095:dg2
		 * Wa_22013059131:dg2
		 */
		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
				     MAXREQS_PER_BANK,
				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));

		/* Wa_22013059131:dg2 */
		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);

		/*
		 * Wa_22012654132
		 *
		 * Note that register 0xE420 is write-only and cannot be read
		 * back for verification on DG2 (due to Wa_14012342262), so
		 * we need to explicitly skip the readback.
		 */
		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
			   0 /* write-only, so skip validation */,
			   true);
	}

	if (IS_XEHPSDV(i915)) {
		/* Wa_1409954639 */
		wa_mcr_masked_en(wal,
				 GEN8_ROW_CHICKEN,
				 SYSTOLIC_DOP_CLOCK_GATING_DIS);

		/* Wa_1607196519 */
		wa_mcr_masked_en(wal,
				 GEN9_ROW_CHICKEN4,
				 GEN12_DISABLE_GRF_CLEAR);

		/* Wa_14010449647:xehpsdv */
		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
				 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
	}
}
2985
2986 static void
engine_init_workarounds(struct intel_engine_cs * engine,struct i915_wa_list * wal)2987 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2988 {
2989 if (GRAPHICS_VER(engine->i915) < 4)
2990 return;
2991
2992 engine_fake_wa_init(engine, wal);
2993
2994 /*
2995 * These are common workarounds that just need to applied
2996 * to a single RCS/CCS engine's workaround list since
2997 * they're reset as part of the general render domain reset.
2998 */
2999 if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
3000 general_render_compute_wa_init(engine, wal);
3001 ccs_engine_wa_mode(engine, wal);
3002 }
3003
3004 if (engine->class == COMPUTE_CLASS)
3005 ccs_engine_wa_init(engine, wal);
3006 else if (engine->class == RENDER_CLASS)
3007 rcs_engine_wa_init(engine, wal);
3008 else
3009 xcs_engine_wa_init(engine, wal);
3010 }
3011
intel_engine_init_workarounds(struct intel_engine_cs * engine)3012 void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3013 {
3014 struct i915_wa_list *wal = &engine->wa_list;
3015
3016 wa_init_start(wal, engine->gt, "engine", engine->name);
3017 engine_init_workarounds(engine, wal);
3018 wa_init_finish(wal);
3019 }
3020
intel_engine_apply_workarounds(struct intel_engine_cs * engine)3021 void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3022 {
3023 wa_list_apply(&engine->wa_list);
3024 }
3025
/*
 * Register offset ranges that mcr_range() treats as MCR on graphics
 * versions 8 up to (but not including) 12. Both bounds are inclusive.
 * The trailing empty entry (start == 0) terminates the table.
 */
static const struct i915_range mcr_ranges_gen8[] = {
	{ .start = 0x5500, .end = 0x55ff },
	{ .start = 0x7000, .end = 0x7fff },
	{ .start = 0x9400, .end = 0x97ff },
	{ .start = 0xb000, .end = 0xb3ff },
	{ .start = 0xe000, .end = 0xe7ff },
	{},
};
3034
/*
 * Register offset ranges that mcr_range() treats as MCR on graphics
 * version 12 and later parts whose full IP version is below 12.50.
 * Both bounds are inclusive; the empty entry terminates the table.
 */
static const struct i915_range mcr_ranges_gen12[] = {
	{ .start = 0x8150, .end = 0x815f },
	{ .start = 0x9520, .end = 0x955f },
	{ .start = 0xb100, .end = 0xb3ff },
	{ .start = 0xde80, .end = 0xe8ff },
	{ .start = 0x24a00, .end = 0x24a7f },
	{},
};
3043
/*
 * Register offset ranges that mcr_range() treats as MCR when the full
 * graphics IP version is 12.50 or newer. Both bounds are inclusive; the
 * empty entry terminates the table.
 */
static const struct i915_range mcr_ranges_xehp[] = {
	{ .start = 0x4000, .end = 0x4aff },
	{ .start = 0x5200, .end = 0x52ff },
	{ .start = 0x5400, .end = 0x7fff },
	{ .start = 0x8140, .end = 0x815f },
	{ .start = 0x8c80, .end = 0x8dff },
	{ .start = 0x94d0, .end = 0x955f },
	{ .start = 0x9680, .end = 0x96ff },
	{ .start = 0xb000, .end = 0xb3ff },
	{ .start = 0xc800, .end = 0xcfff },
	{ .start = 0xd800, .end = 0xd8ff },
	{ .start = 0xdc00, .end = 0xffff },
	{ .start = 0x17000, .end = 0x17fff },
	{ .start = 0x24a00, .end = 0x24a7f },
	{},
};
3060
mcr_range(struct drm_i915_private * i915,u32 offset)3061 static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3062 {
3063 const struct i915_range *mcr_ranges;
3064 int i;
3065
3066 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3067 mcr_ranges = mcr_ranges_xehp;
3068 else if (GRAPHICS_VER(i915) >= 12)
3069 mcr_ranges = mcr_ranges_gen12;
3070 else if (GRAPHICS_VER(i915) >= 8)
3071 mcr_ranges = mcr_ranges_gen8;
3072 else
3073 return false;
3074
3075 /*
3076 * Registers in these ranges are affected by the MCR selector
3077 * which only controls CPU initiated MMIO. Routing does not
3078 * work for CS access so we cannot verify them on this path.
3079 */
3080 for (i = 0; mcr_ranges[i].start; i++)
3081 if (offset >= mcr_ranges[i].start &&
3082 offset <= mcr_ranges[i].end)
3083 return true;
3084
3085 return false;
3086 }
3087
3088 static int
wa_list_srm(struct i915_request * rq,const struct i915_wa_list * wal,struct i915_vma * vma)3089 wa_list_srm(struct i915_request *rq,
3090 const struct i915_wa_list *wal,
3091 struct i915_vma *vma)
3092 {
3093 struct drm_i915_private *i915 = rq->i915;
3094 unsigned int i, count = 0;
3095 const struct i915_wa *wa;
3096 u32 srm, *cs;
3097
3098 srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
3099 if (GRAPHICS_VER(i915) >= 8)
3100 srm++;
3101
3102 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3103 if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3104 count++;
3105 }
3106
3107 cs = intel_ring_begin(rq, 4 * count);
3108 if (IS_ERR(cs))
3109 return PTR_ERR(cs);
3110
3111 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3112 u32 offset = i915_mmio_reg_offset(wa->reg);
3113
3114 if (mcr_range(i915, offset))
3115 continue;
3116
3117 *cs++ = srm;
3118 *cs++ = offset;
3119 *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3120 *cs++ = 0;
3121 }
3122 intel_ring_advance(rq, cs);
3123
3124 return 0;
3125 }
3126
/*
 * Read back the registers of @wal through a request submitted on @ce and
 * check each value with wa_verify().
 *
 * @ce:   context used to create the readback request
 * @wal:  workaround list to check; an empty list verifies trivially
 * @from: caller-supplied string handed through to wa_verify()
 *
 * Registers inside an MCR range (see mcr_range()) are neither read back
 * nor checked.
 *
 * Returns 0 when every checked workaround verifies, -ENXIO if at least one
 * does not, -ETIME if the request wait fails, or the error of whichever
 * setup step failed.
 */
static int engine_wa_list_verify(struct intel_context *ce,
				 const struct i915_wa_list * const wal,
				 const char *from)
{
	const struct i915_wa *wa;
	struct i915_request *rq;
	struct i915_vma *vma;
	struct i915_gem_ww_ctx ww;
	unsigned int i;
	u32 *results;
	int err;

	if (!wal->count)
		return 0;

	/* One u32 result slot per list entry, indexed like wal->list. */
	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
					   wal->count * sizeof(u32));
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	intel_engine_pm_get(ce->engine);
	i915_gem_ww_ctx_init(&ww, false);
retry:
	/* Re-entered from err_pm after a successful -EDEADLK backoff. */
	err = i915_gem_object_lock(vma->obj, &ww);
	if (err == 0)
		err = intel_context_pin_ww(ce, &ww);
	if (err)
		goto err_pm;

	err = i915_vma_pin_ww(vma, &ww, 0, 0,
			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
	if (err)
		goto err_unpin;

	rq = i915_request_create(ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_vma;
	}

	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
	if (err == 0)
		err = wa_list_srm(rq, wal, vma);

	/*
	 * Take our own reference before the request is added: rq is still
	 * used after i915_request_add() below and is released at err_rq.
	 * The request is added even on error, with the error recorded on it.
	 */
	i915_request_get(rq);
	if (err)
		i915_request_set_error_once(rq, err);
	i915_request_add(rq);

	if (err)
		goto err_rq;

	/* Any negative result from the HZ / 5 wait is reported as -ETIME. */
	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto err_rq;
	}

	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
	if (IS_ERR(results)) {
		err = PTR_ERR(results);
		goto err_rq;
	}

	err = 0;
	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
		/* wa_list_srm() emitted no store for these, nothing to check. */
		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
			continue;

		/* Keep going after a mismatch so every entry gets checked. */
		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
			err = -ENXIO;
	}

	i915_gem_object_unpin_map(vma->obj);

	/* Unwind in reverse order of acquisition. */
err_rq:
	i915_request_put(rq);
err_vma:
	i915_vma_unpin(vma);
err_unpin:
	intel_context_unpin(ce);
err_pm:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);
	intel_engine_pm_put(ce->engine);
	i915_vma_put(vma);
	return err;
}
3218
intel_engine_verify_workarounds(struct intel_engine_cs * engine,const char * from)3219 int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3220 const char *from)
3221 {
3222 return engine_wa_list_verify(engine->kernel_context,
3223 &engine->wa_list,
3224 from);
3225 }
3226
3227 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3228 #include "selftest_workarounds.c"
3229 #endif
3230