xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision ba485bc8)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
19 static void set_offsets(u32 *regs,
20 			const u8 *data,
21 			const struct intel_engine_cs *engine,
22 			bool close)
23 #define NOP(x) (BIT(7) | (x))
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
27 #define REG16(x) \
28 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29 	(((x) >> 2) & 0x7f)
30 #define END 0
31 {
32 	const u32 base = engine->mmio_base;
33 
34 	while (*data) {
35 		u8 count, flags;
36 
37 		if (*data & BIT(7)) { /* skip */
38 			count = *data++ & ~BIT(7);
39 			regs += count;
40 			continue;
41 		}
42 
43 		count = *data & 0x3f;
44 		flags = *data >> 6;
45 		data++;
46 
47 		*regs = MI_LOAD_REGISTER_IMM(count);
48 		if (flags & POSTED)
49 			*regs |= MI_LRI_FORCE_POSTED;
50 		if (INTEL_GEN(engine->i915) >= 11)
51 			*regs |= MI_LRI_LRM_CS_MMIO;
52 		regs++;
53 
54 		GEM_BUG_ON(!count);
55 		do {
56 			u32 offset = 0;
57 			u8 v;
58 
59 			do {
60 				v = *data++;
61 				offset <<= 7;
62 				offset |= v & ~BIT(7);
63 			} while (v & BIT(7));
64 
65 			regs[0] = base + (offset << 2);
66 			regs += 2;
67 		} while (--count);
68 	}
69 
70 	if (close) {
71 		/* Close the batch; used mainly by live_lrc_layout() */
72 		*regs = MI_BATCH_BUFFER_END;
73 		if (INTEL_GEN(engine->i915) >= 10)
74 			*regs |= BIT(0);
75 	}
76 }
77 
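/*
 * Illustrative decoding sketch (assuming a hypothetical mmio_base of
 * 0x2000): set_offsets() above would expand the first entries of
 * gen8_xcs_offsets below roughly as follows:
 *
 *   NOP(1)       -> skip one dword of the context image
 *   LRI(11, 0)   -> *regs = MI_LOAD_REGISTER_IMM(11)
 *   REG16(0x244) -> bytes { 0x81, 0x11 }, regs[0] = 0x2000 + 0x244
 *   REG(0x034)   -> byte 0x0d,            regs[0] = 0x2000 + 0x034
 *
 * Each register entry consumes two dwords in the image (offset, value);
 * only the offset is written here, the value slot is filled in from the
 * default context state or by the GPU on a context save.
 */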
78 static const u8 gen8_xcs_offsets[] = {
79 	NOP(1),
80 	LRI(11, 0),
81 	REG16(0x244),
82 	REG(0x034),
83 	REG(0x030),
84 	REG(0x038),
85 	REG(0x03c),
86 	REG(0x168),
87 	REG(0x140),
88 	REG(0x110),
89 	REG(0x11c),
90 	REG(0x114),
91 	REG(0x118),
92 
93 	NOP(9),
94 	LRI(9, 0),
95 	REG16(0x3a8),
96 	REG16(0x28c),
97 	REG16(0x288),
98 	REG16(0x284),
99 	REG16(0x280),
100 	REG16(0x27c),
101 	REG16(0x278),
102 	REG16(0x274),
103 	REG16(0x270),
104 
105 	NOP(13),
106 	LRI(2, 0),
107 	REG16(0x200),
108 	REG(0x028),
109 
110 	END
111 };
112 
113 static const u8 gen9_xcs_offsets[] = {
114 	NOP(1),
115 	LRI(14, POSTED),
116 	REG16(0x244),
117 	REG(0x034),
118 	REG(0x030),
119 	REG(0x038),
120 	REG(0x03c),
121 	REG(0x168),
122 	REG(0x140),
123 	REG(0x110),
124 	REG(0x11c),
125 	REG(0x114),
126 	REG(0x118),
127 	REG(0x1c0),
128 	REG(0x1c4),
129 	REG(0x1c8),
130 
131 	NOP(3),
132 	LRI(9, POSTED),
133 	REG16(0x3a8),
134 	REG16(0x28c),
135 	REG16(0x288),
136 	REG16(0x284),
137 	REG16(0x280),
138 	REG16(0x27c),
139 	REG16(0x278),
140 	REG16(0x274),
141 	REG16(0x270),
142 
143 	NOP(13),
144 	LRI(1, POSTED),
145 	REG16(0x200),
146 
147 	NOP(13),
148 	LRI(44, POSTED),
149 	REG(0x028),
150 	REG(0x09c),
151 	REG(0x0c0),
152 	REG(0x178),
153 	REG(0x17c),
154 	REG16(0x358),
155 	REG(0x170),
156 	REG(0x150),
157 	REG(0x154),
158 	REG(0x158),
159 	REG16(0x41c),
160 	REG16(0x600),
161 	REG16(0x604),
162 	REG16(0x608),
163 	REG16(0x60c),
164 	REG16(0x610),
165 	REG16(0x614),
166 	REG16(0x618),
167 	REG16(0x61c),
168 	REG16(0x620),
169 	REG16(0x624),
170 	REG16(0x628),
171 	REG16(0x62c),
172 	REG16(0x630),
173 	REG16(0x634),
174 	REG16(0x638),
175 	REG16(0x63c),
176 	REG16(0x640),
177 	REG16(0x644),
178 	REG16(0x648),
179 	REG16(0x64c),
180 	REG16(0x650),
181 	REG16(0x654),
182 	REG16(0x658),
183 	REG16(0x65c),
184 	REG16(0x660),
185 	REG16(0x664),
186 	REG16(0x668),
187 	REG16(0x66c),
188 	REG16(0x670),
189 	REG16(0x674),
190 	REG16(0x678),
191 	REG16(0x67c),
192 	REG(0x068),
193 
194 	END
195 };
196 
197 static const u8 gen12_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(13, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 
214 	NOP(5),
215 	LRI(9, POSTED),
216 	REG16(0x3a8),
217 	REG16(0x28c),
218 	REG16(0x288),
219 	REG16(0x284),
220 	REG16(0x280),
221 	REG16(0x27c),
222 	REG16(0x278),
223 	REG16(0x274),
224 	REG16(0x270),
225 
226 	END
227 };
228 
229 static const u8 gen8_rcs_offsets[] = {
230 	NOP(1),
231 	LRI(14, POSTED),
232 	REG16(0x244),
233 	REG(0x034),
234 	REG(0x030),
235 	REG(0x038),
236 	REG(0x03c),
237 	REG(0x168),
238 	REG(0x140),
239 	REG(0x110),
240 	REG(0x11c),
241 	REG(0x114),
242 	REG(0x118),
243 	REG(0x1c0),
244 	REG(0x1c4),
245 	REG(0x1c8),
246 
247 	NOP(3),
248 	LRI(9, POSTED),
249 	REG16(0x3a8),
250 	REG16(0x28c),
251 	REG16(0x288),
252 	REG16(0x284),
253 	REG16(0x280),
254 	REG16(0x27c),
255 	REG16(0x278),
256 	REG16(0x274),
257 	REG16(0x270),
258 
259 	NOP(13),
260 	LRI(1, 0),
261 	REG(0x0c8),
262 
263 	END
264 };
265 
266 static const u8 gen9_rcs_offsets[] = {
267 	NOP(1),
268 	LRI(14, POSTED),
269 	REG16(0x244),
270 	REG(0x34),
271 	REG(0x30),
272 	REG(0x38),
273 	REG(0x3c),
274 	REG(0x168),
275 	REG(0x140),
276 	REG(0x110),
277 	REG(0x11c),
278 	REG(0x114),
279 	REG(0x118),
280 	REG(0x1c0),
281 	REG(0x1c4),
282 	REG(0x1c8),
283 
284 	NOP(3),
285 	LRI(9, POSTED),
286 	REG16(0x3a8),
287 	REG16(0x28c),
288 	REG16(0x288),
289 	REG16(0x284),
290 	REG16(0x280),
291 	REG16(0x27c),
292 	REG16(0x278),
293 	REG16(0x274),
294 	REG16(0x270),
295 
296 	NOP(13),
297 	LRI(1, 0),
298 	REG(0xc8),
299 
300 	NOP(13),
301 	LRI(44, POSTED),
302 	REG(0x28),
303 	REG(0x9c),
304 	REG(0xc0),
305 	REG(0x178),
306 	REG(0x17c),
307 	REG16(0x358),
308 	REG(0x170),
309 	REG(0x150),
310 	REG(0x154),
311 	REG(0x158),
312 	REG16(0x41c),
313 	REG16(0x600),
314 	REG16(0x604),
315 	REG16(0x608),
316 	REG16(0x60c),
317 	REG16(0x610),
318 	REG16(0x614),
319 	REG16(0x618),
320 	REG16(0x61c),
321 	REG16(0x620),
322 	REG16(0x624),
323 	REG16(0x628),
324 	REG16(0x62c),
325 	REG16(0x630),
326 	REG16(0x634),
327 	REG16(0x638),
328 	REG16(0x63c),
329 	REG16(0x640),
330 	REG16(0x644),
331 	REG16(0x648),
332 	REG16(0x64c),
333 	REG16(0x650),
334 	REG16(0x654),
335 	REG16(0x658),
336 	REG16(0x65c),
337 	REG16(0x660),
338 	REG16(0x664),
339 	REG16(0x668),
340 	REG16(0x66c),
341 	REG16(0x670),
342 	REG16(0x674),
343 	REG16(0x678),
344 	REG16(0x67c),
345 	REG(0x68),
346 
347 	END
348 };
349 
350 static const u8 gen11_rcs_offsets[] = {
351 	NOP(1),
352 	LRI(15, POSTED),
353 	REG16(0x244),
354 	REG(0x034),
355 	REG(0x030),
356 	REG(0x038),
357 	REG(0x03c),
358 	REG(0x168),
359 	REG(0x140),
360 	REG(0x110),
361 	REG(0x11c),
362 	REG(0x114),
363 	REG(0x118),
364 	REG(0x1c0),
365 	REG(0x1c4),
366 	REG(0x1c8),
367 	REG(0x180),
368 
369 	NOP(1),
370 	LRI(9, POSTED),
371 	REG16(0x3a8),
372 	REG16(0x28c),
373 	REG16(0x288),
374 	REG16(0x284),
375 	REG16(0x280),
376 	REG16(0x27c),
377 	REG16(0x278),
378 	REG16(0x274),
379 	REG16(0x270),
380 
381 	LRI(1, POSTED),
382 	REG(0x1b0),
383 
384 	NOP(10),
385 	LRI(1, 0),
386 	REG(0x0c8),
387 
388 	END
389 };
390 
391 static const u8 gen12_rcs_offsets[] = {
392 	NOP(1),
393 	LRI(13, POSTED),
394 	REG16(0x244),
395 	REG(0x034),
396 	REG(0x030),
397 	REG(0x038),
398 	REG(0x03c),
399 	REG(0x168),
400 	REG(0x140),
401 	REG(0x110),
402 	REG(0x1c0),
403 	REG(0x1c4),
404 	REG(0x1c8),
405 	REG(0x180),
406 	REG16(0x2b4),
407 
408 	NOP(5),
409 	LRI(9, POSTED),
410 	REG16(0x3a8),
411 	REG16(0x28c),
412 	REG16(0x288),
413 	REG16(0x284),
414 	REG16(0x280),
415 	REG16(0x27c),
416 	REG16(0x278),
417 	REG16(0x274),
418 	REG16(0x270),
419 
420 	LRI(3, POSTED),
421 	REG(0x1b0),
422 	REG16(0x5a8),
423 	REG16(0x5ac),
424 
425 	NOP(6),
426 	LRI(1, 0),
427 	REG(0x0c8),
428 	NOP(3 + 9 + 1),
429 
430 	LRI(51, POSTED),
431 	REG16(0x588),
432 	REG16(0x588),
433 	REG16(0x588),
434 	REG16(0x588),
435 	REG16(0x588),
436 	REG16(0x588),
437 	REG(0x028),
438 	REG(0x09c),
439 	REG(0x0c0),
440 	REG(0x178),
441 	REG(0x17c),
442 	REG16(0x358),
443 	REG(0x170),
444 	REG(0x150),
445 	REG(0x154),
446 	REG(0x158),
447 	REG16(0x41c),
448 	REG16(0x600),
449 	REG16(0x604),
450 	REG16(0x608),
451 	REG16(0x60c),
452 	REG16(0x610),
453 	REG16(0x614),
454 	REG16(0x618),
455 	REG16(0x61c),
456 	REG16(0x620),
457 	REG16(0x624),
458 	REG16(0x628),
459 	REG16(0x62c),
460 	REG16(0x630),
461 	REG16(0x634),
462 	REG16(0x638),
463 	REG16(0x63c),
464 	REG16(0x640),
465 	REG16(0x644),
466 	REG16(0x648),
467 	REG16(0x64c),
468 	REG16(0x650),
469 	REG16(0x654),
470 	REG16(0x658),
471 	REG16(0x65c),
472 	REG16(0x660),
473 	REG16(0x664),
474 	REG16(0x668),
475 	REG16(0x66c),
476 	REG16(0x670),
477 	REG16(0x674),
478 	REG16(0x678),
479 	REG16(0x67c),
480 	REG(0x068),
481 	REG(0x084),
482 	NOP(1),
483 
484 	END
485 };
486 
487 #undef END
488 #undef REG16
489 #undef REG
490 #undef LRI
491 #undef NOP
492 
493 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
494 {
495 	/*
496 	 * The gen12+ lists only have the registers we program in the basic
497 	 * default state. We rely on the context image using relative
498 	 * addressing to automatically fix up the register state between the
499 	 * physical engines for a virtual engine.
500 	 */
501 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
502 		   !intel_engine_has_relative_mmio(engine));
503 
504 	if (engine->class == RENDER_CLASS) {
505 		if (INTEL_GEN(engine->i915) >= 12)
506 			return gen12_rcs_offsets;
507 		else if (INTEL_GEN(engine->i915) >= 11)
508 			return gen11_rcs_offsets;
509 		else if (INTEL_GEN(engine->i915) >= 9)
510 			return gen9_rcs_offsets;
511 		else
512 			return gen8_rcs_offsets;
513 	} else {
514 		if (INTEL_GEN(engine->i915) >= 12)
515 			return gen12_xcs_offsets;
516 		else if (INTEL_GEN(engine->i915) >= 9)
517 			return gen9_xcs_offsets;
518 		else
519 			return gen8_xcs_offsets;
520 	}
521 }
522 
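/*
 * Clarifying note: the lrc_ring_*() helpers below return the dword index of
 * a register's offset slot within the context register state (the regs[]
 * array at ce->lrc_reg_state); the register's value lives at index + 1,
 * which is why callers write regs[x + 1]. A return value of -1 means the
 * register is not present in the image for this engine/gen combination.
 */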
523 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
524 {
525 	if (INTEL_GEN(engine->i915) >= 12)
526 		return 0x60;
527 	else if (INTEL_GEN(engine->i915) >= 9)
528 		return 0x54;
529 	else if (engine->class == RENDER_CLASS)
530 		return 0x58;
531 	else
532 		return -1;
533 }
534 
535 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
536 {
537 	if (INTEL_GEN(engine->i915) >= 12)
538 		return 0x74;
539 	else if (INTEL_GEN(engine->i915) >= 9)
540 		return 0x68;
541 	else if (engine->class == RENDER_CLASS)
542 		return 0xd8;
543 	else
544 		return -1;
545 }
546 
547 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
548 {
549 	if (INTEL_GEN(engine->i915) >= 12)
550 		return 0x12;
551 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
552 		return 0x18;
553 	else
554 		return -1;
555 }
556 
557 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
558 {
559 	int x;
560 
561 	x = lrc_ring_wa_bb_per_ctx(engine);
562 	if (x < 0)
563 		return x;
564 
565 	return x + 2;
566 }
567 
568 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
569 {
570 	int x;
571 
572 	x = lrc_ring_indirect_ptr(engine);
573 	if (x < 0)
574 		return x;
575 
576 	return x + 2;
577 }
578 
579 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
580 {
581 	if (engine->class != RENDER_CLASS)
582 		return -1;
583 
584 	if (INTEL_GEN(engine->i915) >= 12)
585 		return 0xb6;
586 	else if (INTEL_GEN(engine->i915) >= 11)
587 		return 0xaa;
588 	else
589 		return -1;
590 }
591 
592 static u32
593 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
594 {
595 	switch (INTEL_GEN(engine->i915)) {
596 	default:
597 		MISSING_CASE(INTEL_GEN(engine->i915));
598 		fallthrough;
599 	case 12:
600 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
601 	case 11:
602 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
603 	case 10:
604 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
605 	case 9:
606 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
607 	case 8:
608 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
609 	}
610 }
611 
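/*
 * Clarifying note: the batch address handed to lrc_setup_indirect_ctx() is
 * expected to be cacheline aligned, so its low bits are free to carry the
 * batch length in cachelines; address and length are packed into the single
 * INDIRECT_CTX dword written below.
 */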
612 static void
613 lrc_setup_indirect_ctx(u32 *regs,
614 		       const struct intel_engine_cs *engine,
615 		       u32 ctx_bb_ggtt_addr,
616 		       u32 size)
617 {
618 	GEM_BUG_ON(!size);
619 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
620 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
621 	regs[lrc_ring_indirect_ptr(engine) + 1] =
622 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
623 
624 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
625 	regs[lrc_ring_indirect_offset(engine) + 1] =
626 		lrc_ring_indirect_offset_default(engine) << 6;
627 }
628 
629 static void init_common_regs(u32 * const regs,
630 			     const struct intel_context *ce,
631 			     const struct intel_engine_cs *engine,
632 			     bool inhibit)
633 {
634 	u32 ctl;
635 
636 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
637 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
638 	if (inhibit)
639 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
640 	if (INTEL_GEN(engine->i915) < 11)
641 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
642 					   CTX_CTRL_RS_CTX_ENABLE);
643 	regs[CTX_CONTEXT_CONTROL] = ctl;
644 
645 	regs[CTX_TIMESTAMP] = ce->runtime.last;
646 }
647 
648 static void init_wa_bb_regs(u32 * const regs,
649 			    const struct intel_engine_cs *engine)
650 {
651 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
652 
653 	if (wa_ctx->per_ctx.size) {
654 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
655 
656 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
657 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
658 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
659 	}
660 
661 	if (wa_ctx->indirect_ctx.size) {
662 		lrc_setup_indirect_ctx(regs, engine,
663 				       i915_ggtt_offset(wa_ctx->vma) +
664 				       wa_ctx->indirect_ctx.offset,
665 				       wa_ctx->indirect_ctx.size);
666 	}
667 }
668 
669 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
670 {
671 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
672 		/* 64b PPGTT (48bit canonical)
673 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
674 		 * other PDP Descriptors are ignored.
675 		 */
676 		ASSIGN_CTX_PML4(ppgtt, regs);
677 	} else {
678 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
679 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
680 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
681 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
682 	}
683 }
684 
685 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
686 {
687 	if (i915_is_ggtt(vm))
688 		return i915_vm_to_ggtt(vm)->alias;
689 	else
690 		return i915_vm_to_ppgtt(vm);
691 }
692 
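/*
 * Clarifying note: RING_MI_MODE is a masked register, i.e. the upper 16 bits
 * of the value written are a per-bit write enable. Clearing STOP_RING in the
 * low word while setting STOP_RING << 16 in the mask asks the hardware to
 * clear only that bit when the context image is loaded.
 */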
693 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
694 {
695 	int x;
696 
697 	x = lrc_ring_mi_mode(engine);
698 	if (x != -1) {
699 		regs[x + 1] &= ~STOP_RING;
700 		regs[x + 1] |= STOP_RING << 16;
701 	}
702 }
703 
704 static void __lrc_init_regs(u32 *regs,
705 			    const struct intel_context *ce,
706 			    const struct intel_engine_cs *engine,
707 			    bool inhibit)
708 {
709 	/*
710 	 * A context is actually a big batch buffer with several
711 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
712 	 * values we are setting here are only for the first context restore:
713 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
714 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
715 	 * we are not initializing here).
716 	 *
717 	 * Must be kept consistent with virtual_update_register_offsets().
718 	 */
719 
720 	if (inhibit)
721 		memset(regs, 0, PAGE_SIZE);
722 
723 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
724 
725 	init_common_regs(regs, ce, engine, inhibit);
726 	init_ppgtt_regs(regs, vm_alias(ce->vm));
727 
728 	init_wa_bb_regs(regs, engine);
729 
730 	__reset_stop_ring(regs, engine);
731 }
732 
733 void lrc_init_regs(const struct intel_context *ce,
734 		   const struct intel_engine_cs *engine,
735 		   bool inhibit)
736 {
737 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
738 }
739 
740 void lrc_reset_regs(const struct intel_context *ce,
741 		    const struct intel_engine_cs *engine)
742 {
743 	__reset_stop_ring(ce->lrc_reg_state, engine);
744 }
745 
746 static void
747 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
748 {
749 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
750 		return;
751 
752 	vaddr += engine->context_size;
753 
754 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
755 }
756 
757 static void
758 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
759 {
760 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
761 		return;
762 
763 	vaddr += engine->context_size;
764 
765 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
766 		drm_err_once(&engine->i915->drm,
767 			     "%s context redzone overwritten!\n",
768 			     engine->name);
769 }
770 
771 void lrc_init_state(struct intel_context *ce,
772 		    struct intel_engine_cs *engine,
773 		    void *state)
774 {
775 	bool inhibit = true;
776 
777 	set_redzone(state, engine);
778 
779 	if (engine->default_state) {
780 		shmem_read(engine->default_state, 0,
781 			   state, engine->context_size);
782 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
783 		inhibit = false;
784 	}
785 
786 	/* Clear the ppHWSP (inc. per-context counters) */
787 	memset(state, 0, PAGE_SIZE);
788 
789 	/*
790 	 * The second page of the context object contains some registers which
791 	 * must be set up prior to the first execution.
792 	 */
793 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
794 }
795 
796 static struct i915_vma *
797 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
798 {
799 	struct drm_i915_gem_object *obj;
800 	struct i915_vma *vma;
801 	u32 context_size;
802 
803 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
804 
805 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
806 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
807 
808 	if (INTEL_GEN(engine->i915) == 12) {
809 		ce->wa_bb_page = context_size / PAGE_SIZE;
810 		context_size += PAGE_SIZE;
811 	}
812 
813 	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
814 	if (IS_ERR(obj))
815 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
816 	if (IS_ERR(obj))
817 		return ERR_CAST(obj);
818 
819 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
820 	if (IS_ERR(vma)) {
821 		i915_gem_object_put(obj);
822 		return vma;
823 	}
824 
825 	return vma;
826 }
827 
828 static struct intel_timeline *
829 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
830 {
831 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
832 
833 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
834 }
835 
836 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
837 {
838 	struct intel_ring *ring;
839 	struct i915_vma *vma;
840 	int err;
841 
842 	GEM_BUG_ON(ce->state);
843 
844 	vma = __lrc_alloc_state(ce, engine);
845 	if (IS_ERR(vma))
846 		return PTR_ERR(vma);
847 
848 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
849 	if (IS_ERR(ring)) {
850 		err = PTR_ERR(ring);
851 		goto err_vma;
852 	}
853 
854 	if (!page_mask_bits(ce->timeline)) {
855 		struct intel_timeline *tl;
856 
857 		/*
858 		 * Use the static global HWSP for the kernel context, and
859 		 * a dynamically allocated cacheline for everyone else.
860 		 */
861 		if (unlikely(ce->timeline))
862 			tl = pinned_timeline(ce, engine);
863 		else
864 			tl = intel_timeline_create(engine->gt);
865 		if (IS_ERR(tl)) {
866 			err = PTR_ERR(tl);
867 			goto err_ring;
868 		}
869 
870 		ce->timeline = tl;
871 	}
872 
873 	ce->ring = ring;
874 	ce->state = vma;
875 
876 	return 0;
877 
878 err_ring:
879 	intel_ring_put(ring);
880 err_vma:
881 	i915_vma_put(vma);
882 	return err;
883 }
884 
885 void lrc_reset(struct intel_context *ce)
886 {
887 	GEM_BUG_ON(!intel_context_is_pinned(ce));
888 
889 	intel_ring_reset(ce->ring, ce->ring->emit);
890 
891 	/* Scrub away the garbage */
892 	lrc_init_regs(ce, ce->engine, true);
893 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
894 }
895 
896 int
897 lrc_pre_pin(struct intel_context *ce,
898 	    struct intel_engine_cs *engine,
899 	    struct i915_gem_ww_ctx *ww,
900 	    void **vaddr)
901 {
902 	GEM_BUG_ON(!ce->state);
903 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
904 
905 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
906 					 i915_coherent_map_type(ce->engine->i915) |
907 					 I915_MAP_OVERRIDE);
908 
909 	return PTR_ERR_OR_ZERO(*vaddr);
910 }
911 
912 int
913 lrc_pin(struct intel_context *ce,
914 	struct intel_engine_cs *engine,
915 	void *vaddr)
916 {
917 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
918 
919 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
920 		lrc_init_state(ce, engine, vaddr);
921 
922 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
923 	return 0;
924 }
925 
926 void lrc_unpin(struct intel_context *ce)
927 {
928 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
929 		      ce->engine);
930 }
931 
932 void lrc_post_unpin(struct intel_context *ce)
933 {
934 	i915_gem_object_unpin_map(ce->state->obj);
935 }
936 
937 void lrc_fini(struct intel_context *ce)
938 {
939 	if (!ce->state)
940 		return;
941 
942 	intel_ring_put(fetch_and_zero(&ce->ring));
943 	i915_vma_put(fetch_and_zero(&ce->state));
944 }
945 
946 void lrc_destroy(struct kref *kref)
947 {
948 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
949 
950 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
951 	GEM_BUG_ON(intel_context_is_pinned(ce));
952 
953 	lrc_fini(ce);
954 
955 	intel_context_fini(ce);
956 	intel_context_free(ce);
957 }
958 
959 static u32 *
960 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
961 {
962 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
963 		MI_SRM_LRM_GLOBAL_GTT |
964 		MI_LRI_LRM_CS_MMIO;
965 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
966 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
967 		CTX_TIMESTAMP * sizeof(u32);
968 	*cs++ = 0;
969 
970 	*cs++ = MI_LOAD_REGISTER_REG |
971 		MI_LRR_SOURCE_CS_MMIO |
972 		MI_LRI_LRM_CS_MMIO;
973 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
974 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
975 
976 	*cs++ = MI_LOAD_REGISTER_REG |
977 		MI_LRR_SOURCE_CS_MMIO |
978 		MI_LRI_LRM_CS_MMIO;
979 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
980 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
981 
982 	return cs;
983 }
984 
985 static u32 *
986 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
987 {
988 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
989 
990 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
991 		MI_SRM_LRM_GLOBAL_GTT |
992 		MI_LRI_LRM_CS_MMIO;
993 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
994 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
995 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
996 	*cs++ = 0;
997 
998 	return cs;
999 }
1000 
1001 static u32 *
1002 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1003 {
1004 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1005 
1006 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1007 		MI_SRM_LRM_GLOBAL_GTT |
1008 		MI_LRI_LRM_CS_MMIO;
1009 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1010 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1011 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1012 	*cs++ = 0;
1013 
1014 	*cs++ = MI_LOAD_REGISTER_REG |
1015 		MI_LRR_SOURCE_CS_MMIO |
1016 		MI_LRI_LRM_CS_MMIO;
1017 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1018 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1019 
1020 	return cs;
1021 }
1022 
1023 static u32 *
1024 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1025 {
1026 	cs = gen12_emit_timestamp_wa(ce, cs);
1027 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1028 	cs = gen12_emit_restore_scratch(ce, cs);
1029 
1030 	return cs;
1031 }
1032 
1033 static u32 *
1034 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1035 {
1036 	cs = gen12_emit_timestamp_wa(ce, cs);
1037 	cs = gen12_emit_restore_scratch(ce, cs);
1038 
1039 	return cs;
1040 }
1041 
1042 static u32 context_wa_bb_offset(const struct intel_context *ce)
1043 {
1044 	return PAGE_SIZE * ce->wa_bb_page;
1045 }
1046 
1047 static u32 *context_indirect_bb(const struct intel_context *ce)
1048 {
1049 	void *ptr;
1050 
1051 	GEM_BUG_ON(!ce->wa_bb_page);
1052 
1053 	ptr = ce->lrc_reg_state;
1054 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1055 	ptr += context_wa_bb_offset(ce);
1056 
1057 	return ptr;
1058 }
1059 
1060 static void
1061 setup_indirect_ctx_bb(const struct intel_context *ce,
1062 		      const struct intel_engine_cs *engine,
1063 		      u32 *(*emit)(const struct intel_context *, u32 *))
1064 {
1065 	u32 * const start = context_indirect_bb(ce);
1066 	u32 *cs;
1067 
1068 	cs = emit(ce, start);
1069 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1070 	while ((unsigned long)cs % CACHELINE_BYTES)
1071 		*cs++ = MI_NOOP;
1072 
1073 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1074 			       i915_ggtt_offset(ce->state) +
1075 			       context_wa_bb_offset(ce),
1076 			       (cs - start) * sizeof(*cs));
1077 }
1078 
1079 /*
1080  * The context descriptor encodes various attributes of a context,
1081  * including its GTT address and some flags. Because it's fairly
1082  * expensive to calculate, we'll just do it once and cache the result,
1083  * which remains valid until the context is unpinned.
1084  *
1085  * This is what a descriptor looks like, from LSB to MSB::
1086  *
1087  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1088  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1089  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1090  *      bits 53-54:    mbz, reserved for use by hardware
1091  *      bits 55-63:    group ID, currently unused and set to 0
1092  *
1093  * Starting from Gen11, the upper dword of the descriptor has a new format:
1094  *
1095  *      bits 32-36:    reserved
1096  *      bits 37-47:    SW context ID
1097  *      bits 48-53:    engine instance
1098  *      bit 54:        mbz, reserved for use by hardware
1099  *      bits 55-60:    SW counter
1100  *      bits 61-63:    engine class
1101  *
1102  * engine info, SW context ID and SW counter need to form a unique number
1103  * (Context ID) per lrc.
1104  */
1105 static u32 lrc_descriptor(const struct intel_context *ce)
1106 {
1107 	u32 desc;
1108 
1109 	desc = INTEL_LEGACY_32B_CONTEXT;
1110 	if (i915_vm_is_4lvl(ce->vm))
1111 		desc = INTEL_LEGACY_64B_CONTEXT;
1112 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1113 
1114 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1115 	if (IS_GEN(ce->vm->i915, 8))
1116 		desc |= GEN8_CTX_L3LLC_COHERENT;
1117 
1118 	return i915_ggtt_offset(ce->state) | desc;
1119 }
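/*
 * Note on lrc_descriptor() above: i915_ggtt_offset(ce->state) is page
 * aligned, so its low 12 bits are zero and OR-ing in the flag bits yields
 * exactly the layout documented above, i.e. bits 0-11 = flags and
 * bits 12-31 = LRCA.
 */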
1120 
1121 u32 lrc_update_regs(const struct intel_context *ce,
1122 		    const struct intel_engine_cs *engine,
1123 		    u32 head)
1124 {
1125 	struct intel_ring *ring = ce->ring;
1126 	u32 *regs = ce->lrc_reg_state;
1127 
1128 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1129 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1130 
1131 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1132 	regs[CTX_RING_HEAD] = head;
1133 	regs[CTX_RING_TAIL] = ring->tail;
1134 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1135 
1136 	/* RPCS */
1137 	if (engine->class == RENDER_CLASS) {
1138 		regs[CTX_R_PWR_CLK_STATE] =
1139 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1140 
1141 		i915_oa_init_reg_state(ce, engine);
1142 	}
1143 
1144 	if (ce->wa_bb_page) {
1145 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1146 
1147 		fn = gen12_emit_indirect_ctx_xcs;
1148 		if (ce->engine->class == RENDER_CLASS)
1149 			fn = gen12_emit_indirect_ctx_rcs;
1150 
1151 		/* Mutually exclusive with the global indirect bb */
1152 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1153 		setup_indirect_ctx_bb(ce, engine, fn);
1154 	}
1155 
1156 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1157 }
1158 
1159 void lrc_update_offsets(struct intel_context *ce,
1160 			struct intel_engine_cs *engine)
1161 {
1162 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1163 }
1164 
1165 void lrc_check_regs(const struct intel_context *ce,
1166 		    const struct intel_engine_cs *engine,
1167 		    const char *when)
1168 {
1169 	const struct intel_ring *ring = ce->ring;
1170 	u32 *regs = ce->lrc_reg_state;
1171 	bool valid = true;
1172 	int x;
1173 
1174 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1175 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1176 		       engine->name,
1177 		       regs[CTX_RING_START],
1178 		       i915_ggtt_offset(ring->vma));
1179 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1180 		valid = false;
1181 	}
1182 
1183 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1184 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1185 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1186 		       engine->name,
1187 		       regs[CTX_RING_CTL],
1188 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1189 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1190 		valid = false;
1191 	}
1192 
1193 	x = lrc_ring_mi_mode(engine);
1194 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1195 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1196 		       engine->name, regs[x + 1]);
1197 		regs[x + 1] &= ~STOP_RING;
1198 		regs[x + 1] |= STOP_RING << 16;
1199 		valid = false;
1200 	}
1201 
1202 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1203 }
1204 
1205 /*
1206  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1207  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1208  * but there is a slight complication as this is applied in a WA batch where the
1209  * values are only initialized once, so we cannot take the register value at the
1210  * beginning and reuse it further; hence we save its value to memory, upload a
1211  * constant value with bit21 set and then we restore it back with the saved value.
1212  * To simplify the WA, a constant value is formed by using the default value
1213  * of this register. This shouldn't be a problem because we are only modifying
1214  * it for a short period and this batch is non-preemptible. We can of course
1215  * use additional instructions that read the actual value of the register
1216  * at that time and set our bit of interest but it makes the WA complicated.
1217  *
1218  * This WA is also required for Gen9, so extracting it as a function avoids
1219  * code duplication.
1220  */
1221 static u32 *
1222 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1223 {
1224 	/* NB no one else is allowed to scribble over scratch + 256! */
1225 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1226 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1227 	*batch++ = intel_gt_scratch_offset(engine->gt,
1228 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1229 	*batch++ = 0;
1230 
1231 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1232 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1233 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1234 
1235 	batch = gen8_emit_pipe_control(batch,
1236 				       PIPE_CONTROL_CS_STALL |
1237 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1238 				       0);
1239 
1240 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1241 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1242 	*batch++ = intel_gt_scratch_offset(engine->gt,
1243 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1244 	*batch++ = 0;
1245 
1246 	return batch;
1247 }
1248 
1249 /*
1250  * Typically we only have one indirect_ctx and per_ctx batch buffer, which are
1251  * initialized at the beginning and shared across all contexts, but this field
1252  * helps us to have multiple batches at different offsets and select them based
1253  * on a criterion. At the moment this batch always starts at the beginning of the page
1254  * and at this point we don't have multiple wa_ctx batch buffers.
1255  *
1256  * The number of WAs applied is not known at the beginning; we use this field
1257  * to return the number of DWORDs written.
1258  *
1259  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1260  * so it adds NOOPs as padding to make it cacheline aligned.
1261  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
1262  * make a complete batch buffer.
1263  */
1264 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1265 {
1266 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1267 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1268 
1269 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1270 	if (IS_BROADWELL(engine->i915))
1271 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1272 
1273 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1274 	/* Actual scratch location is at 128 bytes offset */
1275 	batch = gen8_emit_pipe_control(batch,
1276 				       PIPE_CONTROL_FLUSH_L3 |
1277 				       PIPE_CONTROL_STORE_DATA_INDEX |
1278 				       PIPE_CONTROL_CS_STALL |
1279 				       PIPE_CONTROL_QW_WRITE,
1280 				       LRC_PPHWSP_SCRATCH_ADDR);
1281 
1282 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1283 
1284 	/* Pad to end of cacheline */
1285 	while ((unsigned long)batch % CACHELINE_BYTES)
1286 		*batch++ = MI_NOOP;
1287 
1288 	/*
1289 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1290 	 * execution depends on the length specified in terms of cache lines
1291 	 * in the register CTX_RCS_INDIRECT_CTX
1292 	 */
1293 
1294 	return batch;
1295 }
1296 
1297 struct lri {
1298 	i915_reg_t reg;
1299 	u32 value;
1300 };
1301 
1302 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1303 {
1304 	GEM_BUG_ON(!count || count > 63);
1305 
1306 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1307 	do {
1308 		*batch++ = i915_mmio_reg_offset(lri->reg);
1309 		*batch++ = lri->value;
1310 	} while (lri++, --count);
1311 	*batch++ = MI_NOOP;
1312 
1313 	return batch;
1314 }
1315 
1316 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1317 {
1318 	static const struct lri lri[] = {
1319 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1320 		{
1321 			COMMON_SLICE_CHICKEN2,
1322 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1323 				       0),
1324 		},
1325 
1326 		/* BSpec: 11391 */
1327 		{
1328 			FF_SLICE_CHICKEN,
1329 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1330 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1331 		},
1332 
1333 		/* BSpec: 11299 */
1334 		{
1335 			_3D_CHICKEN3,
1336 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1337 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1338 		}
1339 	};
1340 
1341 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1342 
1343 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1344 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1345 
1346 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1347 	batch = gen8_emit_pipe_control(batch,
1348 				       PIPE_CONTROL_FLUSH_L3 |
1349 				       PIPE_CONTROL_STORE_DATA_INDEX |
1350 				       PIPE_CONTROL_CS_STALL |
1351 				       PIPE_CONTROL_QW_WRITE,
1352 				       LRC_PPHWSP_SCRATCH_ADDR);
1353 
1354 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1355 
1356 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1357 	if (HAS_POOLED_EU(engine->i915)) {
1358 		/*
1359 		 * EU pool configuration is set up along with the golden context
1360 		 * during context initialization. This value depends on
1361 		 * device type (2x6 or 3x6) and needs to be updated based
1362 		 * on which subslice is disabled, especially for 2x6
1363 		 * devices; however, it is safe to load the default
1364 		 * configuration of a 3x6 device instead of masking off
1365 		 * corresponding bits because HW ignores bits of a disabled
1366 		 * subslice and drops down to appropriate config. Please
1367 		 * see render_state_setup() in i915_gem_render_state.c for
1368 		 * possible configurations, to avoid duplication they are
1369 		 * not shown here again.
1370 		 */
1371 		*batch++ = GEN9_MEDIA_POOL_STATE;
1372 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1373 		*batch++ = 0x00777000;
1374 		*batch++ = 0;
1375 		*batch++ = 0;
1376 		*batch++ = 0;
1377 	}
1378 
1379 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1380 
1381 	/* Pad to end of cacheline */
1382 	while ((unsigned long)batch % CACHELINE_BYTES)
1383 		*batch++ = MI_NOOP;
1384 
1385 	return batch;
1386 }
1387 
1388 static u32 *
1389 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1390 {
1391 	int i;
1392 
1393 	/*
1394 	 * WaPipeControlBefore3DStateSamplePattern: cnl
1395 	 *
1396 	 * Ensure the engine is idle prior to programming a
1397 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1398 	 */
1399 	batch = gen8_emit_pipe_control(batch,
1400 				       PIPE_CONTROL_CS_STALL,
1401 				       0);
1402 	/*
1403 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1404 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1405 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1406 	 * confusing. Since gen8_emit_pipe_control() already advances the
1407 	 * batch by 6 dwords, we advance the other 10 here, completing a
1408 	 * cacheline. It's not clear if the workaround requires this padding
1409 	 * before other commands, or if it's just the regular padding we would
1410 	 * already have for the workaround bb, so leave it here for now.
1411 	 */
1412 	for (i = 0; i < 10; i++)
1413 		*batch++ = MI_NOOP;
1414 
1415 	/* Pad to end of cacheline */
1416 	while ((unsigned long)batch % CACHELINE_BYTES)
1417 		*batch++ = MI_NOOP;
1418 
1419 	return batch;
1420 }
1421 
1422 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1423 
1424 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
1425 {
1426 	struct drm_i915_gem_object *obj;
1427 	struct i915_vma *vma;
1428 	int err;
1429 
1430 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1431 	if (IS_ERR(obj))
1432 		return PTR_ERR(obj);
1433 
1434 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1435 	if (IS_ERR(vma)) {
1436 		err = PTR_ERR(vma);
1437 		goto err;
1438 	}
1439 
1440 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
1441 	if (err)
1442 		goto err;
1443 
1444 	engine->wa_ctx.vma = vma;
1445 	return 0;
1446 
1447 err:
1448 	i915_gem_object_put(obj);
1449 	return err;
1450 }
1451 
1452 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1453 {
1454 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1455 
1456 	/* Called on error unwind, clear all flags to prevent further use */
1457 	memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
1458 }
1459 
1460 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1461 
1462 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1463 {
1464 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1465 	struct i915_wa_ctx_bb *wa_bb[] = {
1466 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1467 	};
1468 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1469 	void *batch, *batch_ptr;
1470 	unsigned int i;
1471 	int err;
1472 
1473 	if (engine->class != RENDER_CLASS)
1474 		return;
1475 
1476 	switch (INTEL_GEN(engine->i915)) {
1477 	case 12:
1478 	case 11:
1479 		return;
1480 	case 10:
1481 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
1482 		wa_bb_fn[1] = NULL;
1483 		break;
1484 	case 9:
1485 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1486 		wa_bb_fn[1] = NULL;
1487 		break;
1488 	case 8:
1489 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1490 		wa_bb_fn[1] = NULL;
1491 		break;
1492 	default:
1493 		MISSING_CASE(INTEL_GEN(engine->i915));
1494 		return;
1495 	}
1496 
1497 	err = lrc_setup_wa_ctx(engine);
1498 	if (err) {
1499 		/*
1500 		 * We continue even if we fail to initialize the WA batch
1501 		 * because we only expect rare glitches, nothing critical
1502 		 * enough to prevent us from using the GPU.
1503 		 */
1504 		drm_err(&engine->i915->drm,
1505 			"Ignoring context switch w/a allocation error:%d\n",
1506 			err);
1507 		return;
1508 	}
1509 
1510 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1511 
1512 	/*
1513 	 * Emit the two workaround batch buffers, recording the offset from the
1514 	 * start of the workaround batch buffer object for each and their
1515 	 * respective sizes.
1516 	 */
1517 	batch_ptr = batch;
1518 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1519 		wa_bb[i]->offset = batch_ptr - batch;
1520 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1521 						  CACHELINE_BYTES))) {
1522 			err = -EINVAL;
1523 			break;
1524 		}
1525 		if (wa_bb_fn[i])
1526 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1527 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1528 	}
1529 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1530 
1531 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1532 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1533 
1534 	/* Verify that we can handle failure to setup the wa_ctx */
1535 	if (err || i915_inject_probe_error(engine->i915, -ENODEV))
1536 		lrc_fini_wa_ctx(engine);
1537 }
1538 
1539 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1540 {
1541 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1542 	ce->runtime.num_underflow++;
1543 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1544 #endif
1545 }
1546 
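/*
 * Clarifying note: the delta in lrc_update_runtime() is computed in unsigned
 * 32-bit arithmetic and then interpreted as signed, so a wrap of the saved
 * CTX_TIMESTAMP between two samples still yields a small positive dt. A
 * negative dt therefore means the timestamp apparently ran backwards, which
 * is reported as an underflow below rather than accumulated.
 */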
1547 void lrc_update_runtime(struct intel_context *ce)
1548 {
1549 	u32 old;
1550 	s32 dt;
1551 
1552 	if (intel_context_is_barrier(ce))
1553 		return;
1554 
1555 	old = ce->runtime.last;
1556 	ce->runtime.last = lrc_get_runtime(ce);
1557 	dt = ce->runtime.last - old;
1558 
1559 	if (unlikely(dt < 0)) {
1560 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1561 			 old, ce->runtime.last, dt);
1562 		st_update_runtime_underflow(ce, dt);
1563 		return;
1564 	}
1565 
1566 	ewma_runtime_add(&ce->runtime.avg, dt);
1567 	ce->runtime.total += dt;
1568 }
1569 
1570 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1571 #include "selftest_lrc.c"
1572 #endif
1573