xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 359f608f66b4434fb83b74e23ad14631ea3efc4e)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22 
23 static void set_offsets(u32 *regs,
24 			const u8 *data,
25 			const struct intel_engine_cs *engine,
26 			bool close)
27 #define NOP(x) (BIT(7) | (x))
28 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
29 #define POSTED BIT(0)
30 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
31 #define REG16(x) \
32 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
33 	(((x) >> 2) & 0x7f)
34 #define END 0
35 {
36 	const u32 base = engine->mmio_base;
37 
38 	while (*data) {
39 		u8 count, flags;
40 
41 		if (*data & BIT(7)) { /* skip */
42 			count = *data++ & ~BIT(7);
43 			regs += count;
44 			continue;
45 		}
46 
47 		count = *data & 0x3f;
48 		flags = *data >> 6;
49 		data++;
50 
51 		*regs = MI_LOAD_REGISTER_IMM(count);
52 		if (flags & POSTED)
53 			*regs |= MI_LRI_FORCE_POSTED;
54 		if (GRAPHICS_VER(engine->i915) >= 11)
55 			*regs |= MI_LRI_LRM_CS_MMIO;
56 		regs++;
57 
58 		GEM_BUG_ON(!count);
59 		do {
60 			u32 offset = 0;
61 			u8 v;
62 
63 			do {
64 				v = *data++;
65 				offset <<= 7;
66 				offset |= v & ~BIT(7);
67 			} while (v & BIT(7));
68 
69 			regs[0] = base + (offset << 2);
70 			regs += 2;
71 		} while (--count);
72 	}
73 
74 	if (close) {
75 		/* Close the batch; used mainly by live_lrc_layout() */
76 		*regs = MI_BATCH_BUFFER_END;
77 		if (GRAPHICS_VER(engine->i915) >= 11)
78 			*regs |= BIT(0);
79 	}
80 }
81 
/*
 * Encoded register layout returned by reg_offsets() for engines without
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is below 9.
 * Entries are decoded by set_offsets(): NOP(n) skips n dwords, LRI(n, flags)
 * opens a load-register-immediate group of n registers, REG()/REG16() give
 * register offsets relative to the engine's mmio base.
 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};
116 
/*
 * Encoded register layout returned by reg_offsets() for engines without
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is at least 9 but
 * below 12. Decoded by set_offsets(); all LRI groups here are POSTED.
 */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	/* 32 consecutive dword registers, 0x600 through 0x67c */
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};
200 
/*
 * Encoded register layout returned by reg_offsets() for engines without
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is at least 12
 * and the full IP version is below 12.55. Decoded by set_offsets().
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
232 
/*
 * Encoded register layout returned by reg_offsets() for engines without
 * I915_ENGINE_HAS_RCS_REG_STATE when the full IP version is 12.55 or later.
 * Decoded by set_offsets().
 */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
266 
/*
 * Encoded register layout returned by reg_offsets() for engines with
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is below 9.
 * Decoded by set_offsets().
 */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* The only group in this table that is not POSTED */
	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};
303 
/*
 * Encoded register layout returned by reg_offsets() for engines with
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is at least 9 but
 * below 11. Decoded by set_offsets().
 */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* The only group in this table that is not POSTED */
	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	/* 32 consecutive dword registers, 0x600 through 0x67c */
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};
387 
/*
 * Encoded register layout returned by reg_offsets() for engines with
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is at least 11 but
 * below 12. Decoded by set_offsets().
 */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* This group follows the previous one directly, with no NOP gap */
	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};
428 
/*
 * Encoded register layout returned by reg_offsets() for engines with
 * I915_ENGINE_HAS_RCS_REG_STATE when the graphics version is at least 12
 * and the full IP version is below 12.50. Decoded by set_offsets().
 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* This group follows the previous one directly, with no NOP gap */
	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	/* The first six slots of this group all name the same offset, 0x588 */
	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	/* 32 consecutive dword registers, 0x600 through 0x67c */
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};
524 
/*
 * Encoded register layout returned by reg_offsets() for engines with
 * I915_ENGINE_HAS_RCS_REG_STATE when the full IP version is at least 12.50
 * but below 12.55. Decoded by set_offsets().
 */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* This group follows the previous one directly, with no NOP gap */
	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
565 
/*
 * Encoded register layout returned by reg_offsets() for engines with
 * I915_ENGINE_HAS_RCS_REG_STATE when the full IP version is 12.55 or later.
 * Decoded by set_offsets().
 */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* This group follows the previous one directly, with no NOP gap */
	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
608 
/*
 * The table-encoding helpers are not needed past this point. POSTED is
 * not undefined here and so stays visible to the rest of the file.
 */
#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP
614 
615 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
616 {
617 	/*
618 	 * The gen12+ lists only have the registers we program in the basic
619 	 * default state. We rely on the context image using relative
620 	 * addressing to automatic fixup the register state between the
621 	 * physical engines for virtual engine.
622 	 */
623 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
624 		   !intel_engine_has_relative_mmio(engine));
625 
626 	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
627 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
628 			return dg2_rcs_offsets;
629 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
630 			return xehp_rcs_offsets;
631 		else if (GRAPHICS_VER(engine->i915) >= 12)
632 			return gen12_rcs_offsets;
633 		else if (GRAPHICS_VER(engine->i915) >= 11)
634 			return gen11_rcs_offsets;
635 		else if (GRAPHICS_VER(engine->i915) >= 9)
636 			return gen9_rcs_offsets;
637 		else
638 			return gen8_rcs_offsets;
639 	} else {
640 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
641 			return dg2_xcs_offsets;
642 		else if (GRAPHICS_VER(engine->i915) >= 12)
643 			return gen12_xcs_offsets;
644 		else if (GRAPHICS_VER(engine->i915) >= 9)
645 			return gen9_xcs_offsets;
646 		else
647 			return gen8_xcs_offsets;
648 	}
649 }
650 
651 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
652 {
653 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
654 		return 0x70;
655 	else if (GRAPHICS_VER(engine->i915) >= 12)
656 		return 0x60;
657 	else if (GRAPHICS_VER(engine->i915) >= 9)
658 		return 0x54;
659 	else if (engine->class == RENDER_CLASS)
660 		return 0x58;
661 	else
662 		return -1;
663 }
664 
665 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
666 {
667 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
668 		return 0x80;
669 	else if (GRAPHICS_VER(engine->i915) >= 12)
670 		return 0x70;
671 	else if (GRAPHICS_VER(engine->i915) >= 9)
672 		return 0x64;
673 	else if (GRAPHICS_VER(engine->i915) >= 8 &&
674 		 engine->class == RENDER_CLASS)
675 		return 0xc4;
676 	else
677 		return -1;
678 }
679 
680 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
681 {
682 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
683 		return 0x84;
684 	else if (GRAPHICS_VER(engine->i915) >= 12)
685 		return 0x74;
686 	else if (GRAPHICS_VER(engine->i915) >= 9)
687 		return 0x68;
688 	else if (engine->class == RENDER_CLASS)
689 		return 0xd8;
690 	else
691 		return -1;
692 }
693 
694 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
695 {
696 	if (GRAPHICS_VER(engine->i915) >= 12)
697 		return 0x12;
698 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
699 		return 0x18;
700 	else
701 		return -1;
702 }
703 
/*
 * Dword index of the indirect context pointer entry: two dwords past the
 * per-context workaround batch entry. A negative result from that lookup
 * is passed through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	const int per_ctx = lrc_ring_wa_bb_per_ctx(engine);

	return per_ctx < 0 ? per_ctx : per_ctx + 2;
}
714 
/*
 * Dword index of the indirect context offset entry: two dwords past the
 * indirect context pointer entry. A negative result from that lookup is
 * passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	const int indirect_ptr = lrc_ring_indirect_ptr(engine);

	return indirect_ptr < 0 ? indirect_ptr : indirect_ptr + 2;
}
725 
726 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
727 {
728 
729 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
730 		/*
731 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
732 		 * simply to match the RCS context image layout.
733 		 */
734 		return 0xc6;
735 	else if (engine->class != RENDER_CLASS)
736 		return -1;
737 	else if (GRAPHICS_VER(engine->i915) >= 12)
738 		return 0xb6;
739 	else if (GRAPHICS_VER(engine->i915) >= 11)
740 		return 0xaa;
741 	else
742 		return -1;
743 }
744 
745 static u32
746 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
747 {
748 	switch (GRAPHICS_VER(engine->i915)) {
749 	default:
750 		MISSING_CASE(GRAPHICS_VER(engine->i915));
751 		fallthrough;
752 	case 12:
753 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
754 	case 11:
755 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
756 	case 9:
757 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
758 	case 8:
759 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
760 	}
761 }
762 
763 static void
764 lrc_setup_indirect_ctx(u32 *regs,
765 		       const struct intel_engine_cs *engine,
766 		       u32 ctx_bb_ggtt_addr,
767 		       u32 size)
768 {
769 	GEM_BUG_ON(!size);
770 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
771 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
772 	regs[lrc_ring_indirect_ptr(engine) + 1] =
773 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
774 
775 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
776 	regs[lrc_ring_indirect_offset(engine) + 1] =
777 		lrc_ring_indirect_offset_default(engine) << 6;
778 }
779 
780 static void init_common_regs(u32 * const regs,
781 			     const struct intel_context *ce,
782 			     const struct intel_engine_cs *engine,
783 			     bool inhibit)
784 {
785 	u32 ctl;
786 	int loc;
787 
788 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
789 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
790 	if (inhibit)
791 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
792 	if (GRAPHICS_VER(engine->i915) < 11)
793 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
794 					   CTX_CTRL_RS_CTX_ENABLE);
795 	regs[CTX_CONTEXT_CONTROL] = ctl;
796 
797 	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
798 
799 	loc = lrc_ring_bb_offset(engine);
800 	if (loc != -1)
801 		regs[loc + 1] = 0;
802 }
803 
804 static void init_wa_bb_regs(u32 * const regs,
805 			    const struct intel_engine_cs *engine)
806 {
807 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
808 
809 	if (wa_ctx->per_ctx.size) {
810 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
811 
812 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
813 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
814 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
815 	}
816 
817 	if (wa_ctx->indirect_ctx.size) {
818 		lrc_setup_indirect_ctx(regs, engine,
819 				       i915_ggtt_offset(wa_ctx->vma) +
820 				       wa_ctx->indirect_ctx.offset,
821 				       wa_ctx->indirect_ctx.size);
822 	}
823 }
824 
825 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
826 {
827 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
828 		/* 64b PPGTT (48bit canonical)
829 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
830 		 * other PDP Descriptors are ignored.
831 		 */
832 		ASSIGN_CTX_PML4(ppgtt, regs);
833 	} else {
834 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
835 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
836 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
837 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
838 	}
839 }
840 
841 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
842 {
843 	if (i915_is_ggtt(vm))
844 		return i915_vm_to_ggtt(vm)->alias;
845 	else
846 		return i915_vm_to_ppgtt(vm);
847 }
848 
849 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
850 {
851 	int x;
852 
853 	x = lrc_ring_mi_mode(engine);
854 	if (x != -1) {
855 		regs[x + 1] &= ~STOP_RING;
856 		regs[x + 1] |= STOP_RING << 16;
857 	}
858 }
859 
860 static void __lrc_init_regs(u32 *regs,
861 			    const struct intel_context *ce,
862 			    const struct intel_engine_cs *engine,
863 			    bool inhibit)
864 {
865 	/*
866 	 * A context is actually a big batch buffer with several
867 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
868 	 * values we are setting here are only for the first context restore:
869 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
870 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
871 	 * we are not initializing here).
872 	 *
873 	 * Must keep consistent with virtual_update_register_offsets().
874 	 */
875 
876 	if (inhibit)
877 		memset(regs, 0, PAGE_SIZE);
878 
879 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
880 
881 	init_common_regs(regs, ce, engine, inhibit);
882 	init_ppgtt_regs(regs, vm_alias(ce->vm));
883 
884 	init_wa_bb_regs(regs, engine);
885 
886 	__reset_stop_ring(regs, engine);
887 }
888 
889 void lrc_init_regs(const struct intel_context *ce,
890 		   const struct intel_engine_cs *engine,
891 		   bool inhibit)
892 {
893 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
894 }
895 
896 void lrc_reset_regs(const struct intel_context *ce,
897 		    const struct intel_engine_cs *engine)
898 {
899 	__reset_stop_ring(ce->lrc_reg_state, engine);
900 }
901 
902 static void
903 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
904 {
905 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
906 		return;
907 
908 	vaddr += engine->context_size;
909 
910 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
911 }
912 
913 static void
914 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
915 {
916 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
917 		return;
918 
919 	vaddr += engine->context_size;
920 
921 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
922 		drm_err_once(&engine->i915->drm,
923 			     "%s context redzone overwritten!\n",
924 			     engine->name);
925 }
926 
927 static u32 context_wa_bb_offset(const struct intel_context *ce)
928 {
929 	return PAGE_SIZE * ce->wa_bb_page;
930 }
931 
932 static u32 *context_indirect_bb(const struct intel_context *ce)
933 {
934 	void *ptr;
935 
936 	GEM_BUG_ON(!ce->wa_bb_page);
937 
938 	ptr = ce->lrc_reg_state;
939 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
940 	ptr += context_wa_bb_offset(ce);
941 
942 	return ptr;
943 }
944 
945 void lrc_init_state(struct intel_context *ce,
946 		    struct intel_engine_cs *engine,
947 		    void *state)
948 {
949 	bool inhibit = true;
950 
951 	set_redzone(state, engine);
952 
953 	if (engine->default_state) {
954 		shmem_read(engine->default_state, 0,
955 			   state, engine->context_size);
956 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
957 		inhibit = false;
958 	}
959 
960 	/* Clear the ppHWSP (inc. per-context counters) */
961 	memset(state, 0, PAGE_SIZE);
962 
963 	/* Clear the indirect wa and storage */
964 	if (ce->wa_bb_page)
965 		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
966 
967 	/*
968 	 * The second page of the context object contains some registers which
969 	 * must be set up prior to the first execution.
970 	 */
971 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
972 }
973 
974 u32 lrc_indirect_bb(const struct intel_context *ce)
975 {
976 	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
977 }
978 
979 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
980 {
981 	/* If predication is active, this will be noop'ed */
982 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
983 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
984 	*cs++ = 0;
985 	*cs++ = 0; /* No predication */
986 
987 	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
988 	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
989 	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
990 
991 	/* Instructions are no longer predicated (disabled), we can proceed */
992 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
993 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
994 	*cs++ = 0;
995 	*cs++ = 1; /* enable predication before the next BB */
996 
997 	*cs++ = MI_BATCH_BUFFER_END;
998 	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
999 
1000 	return cs;
1001 }
1002 
1003 static struct i915_vma *
1004 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1005 {
1006 	struct drm_i915_gem_object *obj;
1007 	struct i915_vma *vma;
1008 	u32 context_size;
1009 
1010 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1011 
1012 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1013 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1014 
1015 	if (GRAPHICS_VER(engine->i915) == 12) {
1016 		ce->wa_bb_page = context_size / PAGE_SIZE;
1017 		context_size += PAGE_SIZE;
1018 	}
1019 
1020 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1021 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1022 		context_size += PARENT_SCRATCH_SIZE;
1023 	}
1024 
1025 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1026 					  I915_BO_ALLOC_PM_VOLATILE);
1027 	if (IS_ERR(obj))
1028 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1029 	if (IS_ERR(obj))
1030 		return ERR_CAST(obj);
1031 
1032 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1033 	if (IS_ERR(vma)) {
1034 		i915_gem_object_put(obj);
1035 		return vma;
1036 	}
1037 
1038 	return vma;
1039 }
1040 
1041 static struct intel_timeline *
1042 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1043 {
1044 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1045 
1046 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1047 }
1048 
1049 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1050 {
1051 	struct intel_ring *ring;
1052 	struct i915_vma *vma;
1053 	int err;
1054 
1055 	GEM_BUG_ON(ce->state);
1056 
1057 	vma = __lrc_alloc_state(ce, engine);
1058 	if (IS_ERR(vma))
1059 		return PTR_ERR(vma);
1060 
1061 	ring = intel_engine_create_ring(engine, ce->ring_size);
1062 	if (IS_ERR(ring)) {
1063 		err = PTR_ERR(ring);
1064 		goto err_vma;
1065 	}
1066 
1067 	if (!page_mask_bits(ce->timeline)) {
1068 		struct intel_timeline *tl;
1069 
1070 		/*
1071 		 * Use the static global HWSP for the kernel context, and
1072 		 * a dynamically allocated cacheline for everyone else.
1073 		 */
1074 		if (unlikely(ce->timeline))
1075 			tl = pinned_timeline(ce, engine);
1076 		else
1077 			tl = intel_timeline_create(engine->gt);
1078 		if (IS_ERR(tl)) {
1079 			err = PTR_ERR(tl);
1080 			goto err_ring;
1081 		}
1082 
1083 		ce->timeline = tl;
1084 	}
1085 
1086 	ce->ring = ring;
1087 	ce->state = vma;
1088 
1089 	return 0;
1090 
1091 err_ring:
1092 	intel_ring_put(ring);
1093 err_vma:
1094 	i915_vma_put(vma);
1095 	return err;
1096 }
1097 
1098 void lrc_reset(struct intel_context *ce)
1099 {
1100 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1101 
1102 	intel_ring_reset(ce->ring, ce->ring->emit);
1103 
1104 	/* Scrub away the garbage */
1105 	lrc_init_regs(ce, ce->engine, true);
1106 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1107 }
1108 
1109 int
1110 lrc_pre_pin(struct intel_context *ce,
1111 	    struct intel_engine_cs *engine,
1112 	    struct i915_gem_ww_ctx *ww,
1113 	    void **vaddr)
1114 {
1115 	GEM_BUG_ON(!ce->state);
1116 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1117 
1118 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1119 					 i915_coherent_map_type(ce->engine->i915,
1120 								ce->state->obj,
1121 								false) |
1122 					 I915_MAP_OVERRIDE);
1123 
1124 	return PTR_ERR_OR_ZERO(*vaddr);
1125 }
1126 
1127 int
1128 lrc_pin(struct intel_context *ce,
1129 	struct intel_engine_cs *engine,
1130 	void *vaddr)
1131 {
1132 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1133 
1134 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1135 		lrc_init_state(ce, engine, vaddr);
1136 
1137 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1138 	return 0;
1139 }
1140 
1141 void lrc_unpin(struct intel_context *ce)
1142 {
1143 	if (unlikely(ce->parallel.last_rq)) {
1144 		i915_request_put(ce->parallel.last_rq);
1145 		ce->parallel.last_rq = NULL;
1146 	}
1147 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1148 		      ce->engine);
1149 }
1150 
1151 void lrc_post_unpin(struct intel_context *ce)
1152 {
1153 	i915_gem_object_unpin_map(ce->state->obj);
1154 }
1155 
1156 void lrc_fini(struct intel_context *ce)
1157 {
1158 	if (!ce->state)
1159 		return;
1160 
1161 	intel_ring_put(fetch_and_zero(&ce->ring));
1162 	i915_vma_put(fetch_and_zero(&ce->state));
1163 }
1164 
1165 void lrc_destroy(struct kref *kref)
1166 {
1167 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1168 
1169 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1170 	GEM_BUG_ON(intel_context_is_pinned(ce));
1171 
1172 	lrc_fini(ce);
1173 
1174 	intel_context_fini(ce);
1175 	intel_context_free(ce);
1176 }
1177 
1178 static u32 *
1179 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1180 {
1181 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1182 		MI_SRM_LRM_GLOBAL_GTT |
1183 		MI_LRI_LRM_CS_MMIO;
1184 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1185 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1186 		CTX_TIMESTAMP * sizeof(u32);
1187 	*cs++ = 0;
1188 
1189 	*cs++ = MI_LOAD_REGISTER_REG |
1190 		MI_LRR_SOURCE_CS_MMIO |
1191 		MI_LRI_LRM_CS_MMIO;
1192 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1193 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1194 
1195 	*cs++ = MI_LOAD_REGISTER_REG |
1196 		MI_LRR_SOURCE_CS_MMIO |
1197 		MI_LRI_LRM_CS_MMIO;
1198 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1199 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1200 
1201 	return cs;
1202 }
1203 
1204 static u32 *
1205 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1206 {
1207 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1208 
1209 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1210 		MI_SRM_LRM_GLOBAL_GTT |
1211 		MI_LRI_LRM_CS_MMIO;
1212 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1213 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1214 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1215 	*cs++ = 0;
1216 
1217 	return cs;
1218 }
1219 
1220 static u32 *
1221 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1222 {
1223 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1224 
1225 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1226 		MI_SRM_LRM_GLOBAL_GTT |
1227 		MI_LRI_LRM_CS_MMIO;
1228 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1229 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1230 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1231 	*cs++ = 0;
1232 
1233 	*cs++ = MI_LOAD_REGISTER_REG |
1234 		MI_LRR_SOURCE_CS_MMIO |
1235 		MI_LRI_LRM_CS_MMIO;
1236 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1237 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1238 
1239 	return cs;
1240 }
1241 
1242 /*
1243  * On DG2 during context restore of a preempted context in GPGPU mode,
1244  * RCS restore hang is detected. This is extremely timing dependent.
1245  * To address this below sw wabb is implemented for DG2 A steppings.
1246  */
1247 static u32 *
1248 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1249 {
1250 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1251 	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1252 	*cs++ = 0x21;
1253 
1254 	*cs++ = MI_LOAD_REGISTER_REG;
1255 	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1256 	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1257 
1258 	*cs++ = MI_LOAD_REGISTER_REG;
1259 	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1260 	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1261 
1262 	return cs;
1263 }
1264 
1265 /*
1266  * The bspec's tuning guide asks us to program a vertical watermark value of
1267  * 0x3FF.  However this register is not saved/restored properly by the
1268  * hardware, so we're required to apply the desired value via INDIRECT_CTX
1269  * batch buffer to ensure the value takes effect properly.  All other bits
1270  * in this register should remain at 0 (the hardware default).
1271  */
1272 static u32 *
1273 dg2_emit_draw_watermark_setting(u32 *cs)
1274 {
1275 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1276 	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1277 	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1278 
1279 	return cs;
1280 }
1281 
1282 static u32 *
1283 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1284 {
1285 	cs = gen12_emit_timestamp_wa(ce, cs);
1286 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1287 	cs = gen12_emit_restore_scratch(ce, cs);
1288 
1289 	/* Wa_22011450934:dg2 */
1290 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1291 	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1292 		cs = dg2_emit_rcs_hang_wabb(ce, cs);
1293 
1294 	/* Wa_16013000631:dg2 */
1295 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1296 	    IS_DG2_G11(ce->engine->i915))
1297 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1298 
1299 	/* hsdes: 1809175790 */
1300 	if (!HAS_FLAT_CCS(ce->engine->i915))
1301 		cs = gen12_emit_aux_table_inv(ce->engine->gt,
1302 					      cs, GEN12_GFX_CCS_AUX_NV);
1303 
1304 	/* Wa_16014892111 */
1305 	if (IS_DG2(ce->engine->i915))
1306 		cs = dg2_emit_draw_watermark_setting(cs);
1307 
1308 	return cs;
1309 }
1310 
1311 static u32 *
1312 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1313 {
1314 	cs = gen12_emit_timestamp_wa(ce, cs);
1315 	cs = gen12_emit_restore_scratch(ce, cs);
1316 
1317 	/* Wa_16013000631:dg2 */
1318 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1319 	    IS_DG2_G11(ce->engine->i915))
1320 		if (ce->engine->class == COMPUTE_CLASS)
1321 			cs = gen8_emit_pipe_control(cs,
1322 						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1323 						    0);
1324 
1325 	/* hsdes: 1809175790 */
1326 	if (!HAS_FLAT_CCS(ce->engine->i915)) {
1327 		if (ce->engine->class == VIDEO_DECODE_CLASS)
1328 			cs = gen12_emit_aux_table_inv(ce->engine->gt,
1329 						      cs, GEN12_VD0_AUX_NV);
1330 		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1331 			cs = gen12_emit_aux_table_inv(ce->engine->gt,
1332 						      cs, GEN12_VE0_AUX_NV);
1333 	}
1334 
1335 	return cs;
1336 }
1337 
1338 static void
1339 setup_indirect_ctx_bb(const struct intel_context *ce,
1340 		      const struct intel_engine_cs *engine,
1341 		      u32 *(*emit)(const struct intel_context *, u32 *))
1342 {
1343 	u32 * const start = context_indirect_bb(ce);
1344 	u32 *cs;
1345 
1346 	cs = emit(ce, start);
1347 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1348 	while ((unsigned long)cs % CACHELINE_BYTES)
1349 		*cs++ = MI_NOOP;
1350 
1351 	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1352 	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1353 
1354 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1355 			       lrc_indirect_bb(ce),
1356 			       (cs - start) * sizeof(*cs));
1357 }
1358 
1359 /*
1360  * The context descriptor encodes various attributes of a context,
1361  * including its GTT address and some flags. Because it's fairly
1362  * expensive to calculate, we'll just do it once and cache the result,
1363  * which remains valid until the context is unpinned.
1364  *
1365  * This is what a descriptor looks like, from LSB to MSB::
1366  *
1367  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1368  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1369  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1370  *      bits 53-54:    mbz, reserved for use by hardware
1371  *      bits 55-63:    group ID, currently unused and set to 0
1372  *
1373  * Starting from Gen11, the upper dword of the descriptor has a new format:
1374  *
1375  *      bits 32-36:    reserved
1376  *      bits 37-47:    SW context ID
1377  *      bits 48:53:    engine instance
1378  *      bit 54:        mbz, reserved for use by hardware
1379  *      bits 55-60:    SW counter
1380  *      bits 61-63:    engine class
1381  *
1382  * On Xe_HP, the upper dword of the descriptor has a new format:
1383  *
1384  *      bits 32-37:    virtual function number
1385  *      bit 38:        mbz, reserved for use by hardware
1386  *      bits 39-54:    SW context ID
1387  *      bits 55-57:    reserved
1388  *      bits 58-63:    SW counter
1389  *
1390  * engine info, SW context ID and SW counter need to form a unique number
1391  * (Context ID) per lrc.
1392  */
1393 static u32 lrc_descriptor(const struct intel_context *ce)
1394 {
1395 	u32 desc;
1396 
1397 	desc = INTEL_LEGACY_32B_CONTEXT;
1398 	if (i915_vm_is_4lvl(ce->vm))
1399 		desc = INTEL_LEGACY_64B_CONTEXT;
1400 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1401 
1402 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1403 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1404 		desc |= GEN8_CTX_L3LLC_COHERENT;
1405 
1406 	return i915_ggtt_offset(ce->state) | desc;
1407 }
1408 
1409 u32 lrc_update_regs(const struct intel_context *ce,
1410 		    const struct intel_engine_cs *engine,
1411 		    u32 head)
1412 {
1413 	struct intel_ring *ring = ce->ring;
1414 	u32 *regs = ce->lrc_reg_state;
1415 
1416 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1417 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1418 
1419 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1420 	regs[CTX_RING_HEAD] = head;
1421 	regs[CTX_RING_TAIL] = ring->tail;
1422 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1423 
1424 	/* RPCS */
1425 	if (engine->class == RENDER_CLASS) {
1426 		regs[CTX_R_PWR_CLK_STATE] =
1427 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1428 
1429 		i915_oa_init_reg_state(ce, engine);
1430 	}
1431 
1432 	if (ce->wa_bb_page) {
1433 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1434 
1435 		fn = gen12_emit_indirect_ctx_xcs;
1436 		if (ce->engine->class == RENDER_CLASS)
1437 			fn = gen12_emit_indirect_ctx_rcs;
1438 
1439 		/* Mutually exclusive wrt to global indirect bb */
1440 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1441 		setup_indirect_ctx_bb(ce, engine, fn);
1442 	}
1443 
1444 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1445 }
1446 
1447 void lrc_update_offsets(struct intel_context *ce,
1448 			struct intel_engine_cs *engine)
1449 {
1450 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1451 }
1452 
1453 void lrc_check_regs(const struct intel_context *ce,
1454 		    const struct intel_engine_cs *engine,
1455 		    const char *when)
1456 {
1457 	const struct intel_ring *ring = ce->ring;
1458 	u32 *regs = ce->lrc_reg_state;
1459 	bool valid = true;
1460 	int x;
1461 
1462 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1463 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1464 		       engine->name,
1465 		       regs[CTX_RING_START],
1466 		       i915_ggtt_offset(ring->vma));
1467 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1468 		valid = false;
1469 	}
1470 
1471 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1472 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1473 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1474 		       engine->name,
1475 		       regs[CTX_RING_CTL],
1476 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1477 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1478 		valid = false;
1479 	}
1480 
1481 	x = lrc_ring_mi_mode(engine);
1482 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1483 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1484 		       engine->name, regs[x + 1]);
1485 		regs[x + 1] &= ~STOP_RING;
1486 		regs[x + 1] |= STOP_RING << 16;
1487 		valid = false;
1488 	}
1489 
1490 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1491 }
1492 
1493 /*
1494  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1495  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1496  * but there is a slight complication as this is applied in WA batch where the
1497  * values are only initialized once so we cannot take register value at the
1498  * beginning and reuse it further; hence we save its value to memory, upload a
1499  * constant value with bit21 set and then we restore it back with the saved value.
1500  * To simplify the WA, a constant value is formed by using the default value
1501  * of this register. This shouldn't be a problem because we are only modifying
1502  * it for a short period and this batch in non-premptible. We can ofcourse
1503  * use additional instructions that read the actual value of the register
1504  * at that time and set our bit of interest but it makes the WA complicated.
1505  *
1506  * This WA is also required for Gen9 so extracting as a function avoids
1507  * code duplication.
1508  */
1509 static u32 *
1510 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1511 {
1512 	/* NB no one else is allowed to scribble over scratch + 256! */
1513 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1514 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1515 	*batch++ = intel_gt_scratch_offset(engine->gt,
1516 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1517 	*batch++ = 0;
1518 
1519 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1520 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1521 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1522 
1523 	batch = gen8_emit_pipe_control(batch,
1524 				       PIPE_CONTROL_CS_STALL |
1525 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1526 				       0);
1527 
1528 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1529 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1530 	*batch++ = intel_gt_scratch_offset(engine->gt,
1531 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1532 	*batch++ = 0;
1533 
1534 	return batch;
1535 }
1536 
1537 /*
1538  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1539  * initialized at the beginning and shared across all contexts but this field
1540  * helps us to have multiple batches at different offsets and select them based
1541  * on a criteria. At the moment this batch always start at the beginning of the page
1542  * and at this point we don't have multiple wa_ctx batch buffers.
1543  *
1544  * The number of WA applied are not known at the beginning; we use this field
1545  * to return the no of DWORDS written.
1546  *
1547  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1548  * so it adds NOOPs as padding to make it cacheline aligned.
1549  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1550  * makes a complete batch buffer.
1551  */
1552 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1553 {
1554 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1555 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1556 
1557 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1558 	if (IS_BROADWELL(engine->i915))
1559 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1560 
1561 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1562 	/* Actual scratch location is at 128 bytes offset */
1563 	batch = gen8_emit_pipe_control(batch,
1564 				       PIPE_CONTROL_FLUSH_L3 |
1565 				       PIPE_CONTROL_STORE_DATA_INDEX |
1566 				       PIPE_CONTROL_CS_STALL |
1567 				       PIPE_CONTROL_QW_WRITE,
1568 				       LRC_PPHWSP_SCRATCH_ADDR);
1569 
1570 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1571 
1572 	/* Pad to end of cacheline */
1573 	while ((unsigned long)batch % CACHELINE_BYTES)
1574 		*batch++ = MI_NOOP;
1575 
1576 	/*
1577 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1578 	 * execution depends on the length specified in terms of cache lines
1579 	 * in the register CTX_RCS_INDIRECT_CTX
1580 	 */
1581 
1582 	return batch;
1583 }
1584 
1585 struct lri {
1586 	i915_reg_t reg;
1587 	u32 value;
1588 };
1589 
1590 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1591 {
1592 	GEM_BUG_ON(!count || count > 63);
1593 
1594 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1595 	do {
1596 		*batch++ = i915_mmio_reg_offset(lri->reg);
1597 		*batch++ = lri->value;
1598 	} while (lri++, --count);
1599 	*batch++ = MI_NOOP;
1600 
1601 	return batch;
1602 }
1603 
1604 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1605 {
1606 	static const struct lri lri[] = {
1607 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1608 		{
1609 			COMMON_SLICE_CHICKEN2,
1610 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1611 				       0),
1612 		},
1613 
1614 		/* BSpec: 11391 */
1615 		{
1616 			FF_SLICE_CHICKEN,
1617 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1618 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1619 		},
1620 
1621 		/* BSpec: 11299 */
1622 		{
1623 			_3D_CHICKEN3,
1624 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1625 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1626 		}
1627 	};
1628 
1629 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1630 
1631 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1632 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1633 
1634 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1635 	batch = gen8_emit_pipe_control(batch,
1636 				       PIPE_CONTROL_FLUSH_L3 |
1637 				       PIPE_CONTROL_STORE_DATA_INDEX |
1638 				       PIPE_CONTROL_CS_STALL |
1639 				       PIPE_CONTROL_QW_WRITE,
1640 				       LRC_PPHWSP_SCRATCH_ADDR);
1641 
1642 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1643 
1644 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1645 	if (HAS_POOLED_EU(engine->i915)) {
1646 		/*
1647 		 * EU pool configuration is setup along with golden context
1648 		 * during context initialization. This value depends on
1649 		 * device type (2x6 or 3x6) and needs to be updated based
1650 		 * on which subslice is disabled especially for 2x6
1651 		 * devices, however it is safe to load default
1652 		 * configuration of 3x6 device instead of masking off
1653 		 * corresponding bits because HW ignores bits of a disabled
1654 		 * subslice and drops down to appropriate config. Please
1655 		 * see render_state_setup() in i915_gem_render_state.c for
1656 		 * possible configurations, to avoid duplication they are
1657 		 * not shown here again.
1658 		 */
1659 		*batch++ = GEN9_MEDIA_POOL_STATE;
1660 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1661 		*batch++ = 0x00777000;
1662 		*batch++ = 0;
1663 		*batch++ = 0;
1664 		*batch++ = 0;
1665 	}
1666 
1667 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1668 
1669 	/* Pad to end of cacheline */
1670 	while ((unsigned long)batch % CACHELINE_BYTES)
1671 		*batch++ = MI_NOOP;
1672 
1673 	return batch;
1674 }
1675 
/*
 * Size in bytes of the object backing the context workaround batch
 * buffers; lrc_init_wa_ctx() asserts that the emitted batches fit in it.
 */
#define CTX_WA_BB_SIZE (PAGE_SIZE)
1677 
1678 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1679 {
1680 	struct drm_i915_gem_object *obj;
1681 	struct i915_vma *vma;
1682 	int err;
1683 
1684 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1685 	if (IS_ERR(obj))
1686 		return PTR_ERR(obj);
1687 
1688 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1689 	if (IS_ERR(vma)) {
1690 		err = PTR_ERR(vma);
1691 		goto err;
1692 	}
1693 
1694 	engine->wa_ctx.vma = vma;
1695 	return 0;
1696 
1697 err:
1698 	i915_gem_object_put(obj);
1699 	return err;
1700 }
1701 
1702 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1703 {
1704 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1705 }
1706 
1707 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1708 
1709 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1710 {
1711 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1712 	struct i915_wa_ctx_bb *wa_bb[] = {
1713 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1714 	};
1715 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1716 	struct i915_gem_ww_ctx ww;
1717 	void *batch, *batch_ptr;
1718 	unsigned int i;
1719 	int err;
1720 
1721 	if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1722 		return;
1723 
1724 	switch (GRAPHICS_VER(engine->i915)) {
1725 	case 12:
1726 	case 11:
1727 		return;
1728 	case 9:
1729 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1730 		wa_bb_fn[1] = NULL;
1731 		break;
1732 	case 8:
1733 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1734 		wa_bb_fn[1] = NULL;
1735 		break;
1736 	default:
1737 		MISSING_CASE(GRAPHICS_VER(engine->i915));
1738 		return;
1739 	}
1740 
1741 	err = lrc_create_wa_ctx(engine);
1742 	if (err) {
1743 		/*
1744 		 * We continue even if we fail to initialize WA batch
1745 		 * because we only expect rare glitches but nothing
1746 		 * critical to prevent us from using GPU
1747 		 */
1748 		drm_err(&engine->i915->drm,
1749 			"Ignoring context switch w/a allocation error:%d\n",
1750 			err);
1751 		return;
1752 	}
1753 
1754 	if (!engine->wa_ctx.vma)
1755 		return;
1756 
1757 	i915_gem_ww_ctx_init(&ww, true);
1758 retry:
1759 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1760 	if (!err)
1761 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1762 	if (err)
1763 		goto err;
1764 
1765 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1766 	if (IS_ERR(batch)) {
1767 		err = PTR_ERR(batch);
1768 		goto err_unpin;
1769 	}
1770 
1771 	/*
1772 	 * Emit the two workaround batch buffers, recording the offset from the
1773 	 * start of the workaround batch buffer object for each and their
1774 	 * respective sizes.
1775 	 */
1776 	batch_ptr = batch;
1777 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1778 		wa_bb[i]->offset = batch_ptr - batch;
1779 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1780 						  CACHELINE_BYTES))) {
1781 			err = -EINVAL;
1782 			break;
1783 		}
1784 		if (wa_bb_fn[i])
1785 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1786 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1787 	}
1788 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1789 
1790 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1791 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1792 
1793 	/* Verify that we can handle failure to setup the wa_ctx */
1794 	if (!err)
1795 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1796 
1797 err_unpin:
1798 	if (err)
1799 		i915_vma_unpin(wa_ctx->vma);
1800 err:
1801 	if (err == -EDEADLK) {
1802 		err = i915_gem_ww_ctx_backoff(&ww);
1803 		if (!err)
1804 			goto retry;
1805 	}
1806 	i915_gem_ww_ctx_fini(&ww);
1807 
1808 	if (err) {
1809 		i915_vma_put(engine->wa_ctx.vma);
1810 
1811 		/* Clear all flags to prevent further use */
1812 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1813 	}
1814 }
1815 
1816 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1817 {
1818 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1819 	stats->runtime.num_underflow++;
1820 	stats->runtime.max_underflow =
1821 		max_t(u32, stats->runtime.max_underflow, -dt);
1822 #endif
1823 }
1824 
1825 static u32 lrc_get_runtime(const struct intel_context *ce)
1826 {
1827 	/*
1828 	 * We can use either ppHWSP[16] which is recorded before the context
1829 	 * switch (and so excludes the cost of context switches) or use the
1830 	 * value from the context image itself, which is saved/restored earlier
1831 	 * and so includes the cost of the save.
1832 	 */
1833 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1834 }
1835 
1836 void lrc_update_runtime(struct intel_context *ce)
1837 {
1838 	struct intel_context_stats *stats = &ce->stats;
1839 	u32 old;
1840 	s32 dt;
1841 
1842 	old = stats->runtime.last;
1843 	stats->runtime.last = lrc_get_runtime(ce);
1844 	dt = stats->runtime.last - old;
1845 	if (!dt)
1846 		return;
1847 
1848 	if (unlikely(dt < 0)) {
1849 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1850 			 old, stats->runtime.last, dt);
1851 		st_runtime_underflow(stats, dt);
1852 		return;
1853 	}
1854 
1855 	ewma_runtime_add(&stats->runtime.avg, dt);
1856 	stats->runtime.total += dt;
1857 }
1858 
1859 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1860 #include "selftest_lrc.c"
1861 #endif
1862