xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 25bcc828)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22 
23 static void set_offsets(u32 *regs,
24 			const u8 *data,
25 			const struct intel_engine_cs *engine,
26 			bool close)
27 #define NOP(x) (BIT(7) | (x))
28 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
29 #define POSTED BIT(0)
30 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
31 #define REG16(x) \
32 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
33 	(((x) >> 2) & 0x7f)
34 #define END 0
35 {
36 	const u32 base = engine->mmio_base;
37 
38 	while (*data) {
39 		u8 count, flags;
40 
41 		if (*data & BIT(7)) { /* skip */
42 			count = *data++ & ~BIT(7);
43 			regs += count;
44 			continue;
45 		}
46 
47 		count = *data & 0x3f;
48 		flags = *data >> 6;
49 		data++;
50 
51 		*regs = MI_LOAD_REGISTER_IMM(count);
52 		if (flags & POSTED)
53 			*regs |= MI_LRI_FORCE_POSTED;
54 		if (GRAPHICS_VER(engine->i915) >= 11)
55 			*regs |= MI_LRI_LRM_CS_MMIO;
56 		regs++;
57 
58 		GEM_BUG_ON(!count);
59 		do {
60 			u32 offset = 0;
61 			u8 v;
62 
63 			do {
64 				v = *data++;
65 				offset <<= 7;
66 				offset |= v & ~BIT(7);
67 			} while (v & BIT(7));
68 
69 			regs[0] = base + (offset << 2);
70 			regs += 2;
71 		} while (--count);
72 	}
73 
74 	if (close) {
75 		/* Close the batch; used mainly by live_lrc_layout() */
76 		*regs = MI_BATCH_BUFFER_END;
77 		if (GRAPHICS_VER(engine->i915) >= 11)
78 			*regs |= BIT(0);
79 	}
80 }
81 
82 static const u8 gen8_xcs_offsets[] = {
83 	NOP(1),
84 	LRI(11, 0),
85 	REG16(0x244),
86 	REG(0x034),
87 	REG(0x030),
88 	REG(0x038),
89 	REG(0x03c),
90 	REG(0x168),
91 	REG(0x140),
92 	REG(0x110),
93 	REG(0x11c),
94 	REG(0x114),
95 	REG(0x118),
96 
97 	NOP(9),
98 	LRI(9, 0),
99 	REG16(0x3a8),
100 	REG16(0x28c),
101 	REG16(0x288),
102 	REG16(0x284),
103 	REG16(0x280),
104 	REG16(0x27c),
105 	REG16(0x278),
106 	REG16(0x274),
107 	REG16(0x270),
108 
109 	NOP(13),
110 	LRI(2, 0),
111 	REG16(0x200),
112 	REG(0x028),
113 
114 	END
115 };
116 
117 static const u8 gen9_xcs_offsets[] = {
118 	NOP(1),
119 	LRI(14, POSTED),
120 	REG16(0x244),
121 	REG(0x034),
122 	REG(0x030),
123 	REG(0x038),
124 	REG(0x03c),
125 	REG(0x168),
126 	REG(0x140),
127 	REG(0x110),
128 	REG(0x11c),
129 	REG(0x114),
130 	REG(0x118),
131 	REG(0x1c0),
132 	REG(0x1c4),
133 	REG(0x1c8),
134 
135 	NOP(3),
136 	LRI(9, POSTED),
137 	REG16(0x3a8),
138 	REG16(0x28c),
139 	REG16(0x288),
140 	REG16(0x284),
141 	REG16(0x280),
142 	REG16(0x27c),
143 	REG16(0x278),
144 	REG16(0x274),
145 	REG16(0x270),
146 
147 	NOP(13),
148 	LRI(1, POSTED),
149 	REG16(0x200),
150 
151 	NOP(13),
152 	LRI(44, POSTED),
153 	REG(0x028),
154 	REG(0x09c),
155 	REG(0x0c0),
156 	REG(0x178),
157 	REG(0x17c),
158 	REG16(0x358),
159 	REG(0x170),
160 	REG(0x150),
161 	REG(0x154),
162 	REG(0x158),
163 	REG16(0x41c),
164 	REG16(0x600),
165 	REG16(0x604),
166 	REG16(0x608),
167 	REG16(0x60c),
168 	REG16(0x610),
169 	REG16(0x614),
170 	REG16(0x618),
171 	REG16(0x61c),
172 	REG16(0x620),
173 	REG16(0x624),
174 	REG16(0x628),
175 	REG16(0x62c),
176 	REG16(0x630),
177 	REG16(0x634),
178 	REG16(0x638),
179 	REG16(0x63c),
180 	REG16(0x640),
181 	REG16(0x644),
182 	REG16(0x648),
183 	REG16(0x64c),
184 	REG16(0x650),
185 	REG16(0x654),
186 	REG16(0x658),
187 	REG16(0x65c),
188 	REG16(0x660),
189 	REG16(0x664),
190 	REG16(0x668),
191 	REG16(0x66c),
192 	REG16(0x670),
193 	REG16(0x674),
194 	REG16(0x678),
195 	REG16(0x67c),
196 	REG(0x068),
197 
198 	END
199 };
200 
201 static const u8 gen12_xcs_offsets[] = {
202 	NOP(1),
203 	LRI(13, POSTED),
204 	REG16(0x244),
205 	REG(0x034),
206 	REG(0x030),
207 	REG(0x038),
208 	REG(0x03c),
209 	REG(0x168),
210 	REG(0x140),
211 	REG(0x110),
212 	REG(0x1c0),
213 	REG(0x1c4),
214 	REG(0x1c8),
215 	REG(0x180),
216 	REG16(0x2b4),
217 
218 	NOP(5),
219 	LRI(9, POSTED),
220 	REG16(0x3a8),
221 	REG16(0x28c),
222 	REG16(0x288),
223 	REG16(0x284),
224 	REG16(0x280),
225 	REG16(0x27c),
226 	REG16(0x278),
227 	REG16(0x274),
228 	REG16(0x270),
229 
230 	END
231 };
232 
233 static const u8 dg2_xcs_offsets[] = {
234 	NOP(1),
235 	LRI(15, POSTED),
236 	REG16(0x244),
237 	REG(0x034),
238 	REG(0x030),
239 	REG(0x038),
240 	REG(0x03c),
241 	REG(0x168),
242 	REG(0x140),
243 	REG(0x110),
244 	REG(0x1c0),
245 	REG(0x1c4),
246 	REG(0x1c8),
247 	REG(0x180),
248 	REG16(0x2b4),
249 	REG(0x120),
250 	REG(0x124),
251 
252 	NOP(1),
253 	LRI(9, POSTED),
254 	REG16(0x3a8),
255 	REG16(0x28c),
256 	REG16(0x288),
257 	REG16(0x284),
258 	REG16(0x280),
259 	REG16(0x27c),
260 	REG16(0x278),
261 	REG16(0x274),
262 	REG16(0x270),
263 
264 	END
265 };
266 
267 static const u8 gen8_rcs_offsets[] = {
268 	NOP(1),
269 	LRI(14, POSTED),
270 	REG16(0x244),
271 	REG(0x034),
272 	REG(0x030),
273 	REG(0x038),
274 	REG(0x03c),
275 	REG(0x168),
276 	REG(0x140),
277 	REG(0x110),
278 	REG(0x11c),
279 	REG(0x114),
280 	REG(0x118),
281 	REG(0x1c0),
282 	REG(0x1c4),
283 	REG(0x1c8),
284 
285 	NOP(3),
286 	LRI(9, POSTED),
287 	REG16(0x3a8),
288 	REG16(0x28c),
289 	REG16(0x288),
290 	REG16(0x284),
291 	REG16(0x280),
292 	REG16(0x27c),
293 	REG16(0x278),
294 	REG16(0x274),
295 	REG16(0x270),
296 
297 	NOP(13),
298 	LRI(1, 0),
299 	REG(0x0c8),
300 
301 	END
302 };
303 
304 static const u8 gen9_rcs_offsets[] = {
305 	NOP(1),
306 	LRI(14, POSTED),
307 	REG16(0x244),
308 	REG(0x34),
309 	REG(0x30),
310 	REG(0x38),
311 	REG(0x3c),
312 	REG(0x168),
313 	REG(0x140),
314 	REG(0x110),
315 	REG(0x11c),
316 	REG(0x114),
317 	REG(0x118),
318 	REG(0x1c0),
319 	REG(0x1c4),
320 	REG(0x1c8),
321 
322 	NOP(3),
323 	LRI(9, POSTED),
324 	REG16(0x3a8),
325 	REG16(0x28c),
326 	REG16(0x288),
327 	REG16(0x284),
328 	REG16(0x280),
329 	REG16(0x27c),
330 	REG16(0x278),
331 	REG16(0x274),
332 	REG16(0x270),
333 
334 	NOP(13),
335 	LRI(1, 0),
336 	REG(0xc8),
337 
338 	NOP(13),
339 	LRI(44, POSTED),
340 	REG(0x28),
341 	REG(0x9c),
342 	REG(0xc0),
343 	REG(0x178),
344 	REG(0x17c),
345 	REG16(0x358),
346 	REG(0x170),
347 	REG(0x150),
348 	REG(0x154),
349 	REG(0x158),
350 	REG16(0x41c),
351 	REG16(0x600),
352 	REG16(0x604),
353 	REG16(0x608),
354 	REG16(0x60c),
355 	REG16(0x610),
356 	REG16(0x614),
357 	REG16(0x618),
358 	REG16(0x61c),
359 	REG16(0x620),
360 	REG16(0x624),
361 	REG16(0x628),
362 	REG16(0x62c),
363 	REG16(0x630),
364 	REG16(0x634),
365 	REG16(0x638),
366 	REG16(0x63c),
367 	REG16(0x640),
368 	REG16(0x644),
369 	REG16(0x648),
370 	REG16(0x64c),
371 	REG16(0x650),
372 	REG16(0x654),
373 	REG16(0x658),
374 	REG16(0x65c),
375 	REG16(0x660),
376 	REG16(0x664),
377 	REG16(0x668),
378 	REG16(0x66c),
379 	REG16(0x670),
380 	REG16(0x674),
381 	REG16(0x678),
382 	REG16(0x67c),
383 	REG(0x68),
384 
385 	END
386 };
387 
388 static const u8 gen11_rcs_offsets[] = {
389 	NOP(1),
390 	LRI(15, POSTED),
391 	REG16(0x244),
392 	REG(0x034),
393 	REG(0x030),
394 	REG(0x038),
395 	REG(0x03c),
396 	REG(0x168),
397 	REG(0x140),
398 	REG(0x110),
399 	REG(0x11c),
400 	REG(0x114),
401 	REG(0x118),
402 	REG(0x1c0),
403 	REG(0x1c4),
404 	REG(0x1c8),
405 	REG(0x180),
406 
407 	NOP(1),
408 	LRI(9, POSTED),
409 	REG16(0x3a8),
410 	REG16(0x28c),
411 	REG16(0x288),
412 	REG16(0x284),
413 	REG16(0x280),
414 	REG16(0x27c),
415 	REG16(0x278),
416 	REG16(0x274),
417 	REG16(0x270),
418 
419 	LRI(1, POSTED),
420 	REG(0x1b0),
421 
422 	NOP(10),
423 	LRI(1, 0),
424 	REG(0x0c8),
425 
426 	END
427 };
428 
429 static const u8 gen12_rcs_offsets[] = {
430 	NOP(1),
431 	LRI(13, POSTED),
432 	REG16(0x244),
433 	REG(0x034),
434 	REG(0x030),
435 	REG(0x038),
436 	REG(0x03c),
437 	REG(0x168),
438 	REG(0x140),
439 	REG(0x110),
440 	REG(0x1c0),
441 	REG(0x1c4),
442 	REG(0x1c8),
443 	REG(0x180),
444 	REG16(0x2b4),
445 
446 	NOP(5),
447 	LRI(9, POSTED),
448 	REG16(0x3a8),
449 	REG16(0x28c),
450 	REG16(0x288),
451 	REG16(0x284),
452 	REG16(0x280),
453 	REG16(0x27c),
454 	REG16(0x278),
455 	REG16(0x274),
456 	REG16(0x270),
457 
458 	LRI(3, POSTED),
459 	REG(0x1b0),
460 	REG16(0x5a8),
461 	REG16(0x5ac),
462 
463 	NOP(6),
464 	LRI(1, 0),
465 	REG(0x0c8),
466 	NOP(3 + 9 + 1),
467 
468 	LRI(51, POSTED),
469 	REG16(0x588),
470 	REG16(0x588),
471 	REG16(0x588),
472 	REG16(0x588),
473 	REG16(0x588),
474 	REG16(0x588),
475 	REG(0x028),
476 	REG(0x09c),
477 	REG(0x0c0),
478 	REG(0x178),
479 	REG(0x17c),
480 	REG16(0x358),
481 	REG(0x170),
482 	REG(0x150),
483 	REG(0x154),
484 	REG(0x158),
485 	REG16(0x41c),
486 	REG16(0x600),
487 	REG16(0x604),
488 	REG16(0x608),
489 	REG16(0x60c),
490 	REG16(0x610),
491 	REG16(0x614),
492 	REG16(0x618),
493 	REG16(0x61c),
494 	REG16(0x620),
495 	REG16(0x624),
496 	REG16(0x628),
497 	REG16(0x62c),
498 	REG16(0x630),
499 	REG16(0x634),
500 	REG16(0x638),
501 	REG16(0x63c),
502 	REG16(0x640),
503 	REG16(0x644),
504 	REG16(0x648),
505 	REG16(0x64c),
506 	REG16(0x650),
507 	REG16(0x654),
508 	REG16(0x658),
509 	REG16(0x65c),
510 	REG16(0x660),
511 	REG16(0x664),
512 	REG16(0x668),
513 	REG16(0x66c),
514 	REG16(0x670),
515 	REG16(0x674),
516 	REG16(0x678),
517 	REG16(0x67c),
518 	REG(0x068),
519 	REG(0x084),
520 	NOP(1),
521 
522 	END
523 };
524 
525 static const u8 xehp_rcs_offsets[] = {
526 	NOP(1),
527 	LRI(13, POSTED),
528 	REG16(0x244),
529 	REG(0x034),
530 	REG(0x030),
531 	REG(0x038),
532 	REG(0x03c),
533 	REG(0x168),
534 	REG(0x140),
535 	REG(0x110),
536 	REG(0x1c0),
537 	REG(0x1c4),
538 	REG(0x1c8),
539 	REG(0x180),
540 	REG16(0x2b4),
541 
542 	NOP(5),
543 	LRI(9, POSTED),
544 	REG16(0x3a8),
545 	REG16(0x28c),
546 	REG16(0x288),
547 	REG16(0x284),
548 	REG16(0x280),
549 	REG16(0x27c),
550 	REG16(0x278),
551 	REG16(0x274),
552 	REG16(0x270),
553 
554 	LRI(3, POSTED),
555 	REG(0x1b0),
556 	REG16(0x5a8),
557 	REG16(0x5ac),
558 
559 	NOP(6),
560 	LRI(1, 0),
561 	REG(0x0c8),
562 
563 	END
564 };
565 
566 static const u8 dg2_rcs_offsets[] = {
567 	NOP(1),
568 	LRI(15, POSTED),
569 	REG16(0x244),
570 	REG(0x034),
571 	REG(0x030),
572 	REG(0x038),
573 	REG(0x03c),
574 	REG(0x168),
575 	REG(0x140),
576 	REG(0x110),
577 	REG(0x1c0),
578 	REG(0x1c4),
579 	REG(0x1c8),
580 	REG(0x180),
581 	REG16(0x2b4),
582 	REG(0x120),
583 	REG(0x124),
584 
585 	NOP(1),
586 	LRI(9, POSTED),
587 	REG16(0x3a8),
588 	REG16(0x28c),
589 	REG16(0x288),
590 	REG16(0x284),
591 	REG16(0x280),
592 	REG16(0x27c),
593 	REG16(0x278),
594 	REG16(0x274),
595 	REG16(0x270),
596 
597 	LRI(3, POSTED),
598 	REG(0x1b0),
599 	REG16(0x5a8),
600 	REG16(0x5ac),
601 
602 	NOP(6),
603 	LRI(1, 0),
604 	REG(0x0c8),
605 
606 	END
607 };
608 
609 #undef END
610 #undef REG16
611 #undef REG
612 #undef LRI
613 #undef NOP
614 
615 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
616 {
617 	/*
618 	 * The gen12+ lists only have the registers we program in the basic
619 	 * default state. We rely on the context image using relative
620 	 * addressing to automatic fixup the register state between the
621 	 * physical engines for virtual engine.
622 	 */
623 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
624 		   !intel_engine_has_relative_mmio(engine));
625 
626 	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
627 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
628 			return dg2_rcs_offsets;
629 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
630 			return xehp_rcs_offsets;
631 		else if (GRAPHICS_VER(engine->i915) >= 12)
632 			return gen12_rcs_offsets;
633 		else if (GRAPHICS_VER(engine->i915) >= 11)
634 			return gen11_rcs_offsets;
635 		else if (GRAPHICS_VER(engine->i915) >= 9)
636 			return gen9_rcs_offsets;
637 		else
638 			return gen8_rcs_offsets;
639 	} else {
640 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
641 			return dg2_xcs_offsets;
642 		else if (GRAPHICS_VER(engine->i915) >= 12)
643 			return gen12_xcs_offsets;
644 		else if (GRAPHICS_VER(engine->i915) >= 9)
645 			return gen9_xcs_offsets;
646 		else
647 			return gen8_xcs_offsets;
648 	}
649 }
650 
651 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
652 {
653 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
654 		return 0x70;
655 	else if (GRAPHICS_VER(engine->i915) >= 12)
656 		return 0x60;
657 	else if (GRAPHICS_VER(engine->i915) >= 9)
658 		return 0x54;
659 	else if (engine->class == RENDER_CLASS)
660 		return 0x58;
661 	else
662 		return -1;
663 }
664 
665 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
666 {
667 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
668 		return 0x84;
669 	else if (GRAPHICS_VER(engine->i915) >= 12)
670 		return 0x74;
671 	else if (GRAPHICS_VER(engine->i915) >= 9)
672 		return 0x68;
673 	else if (engine->class == RENDER_CLASS)
674 		return 0xd8;
675 	else
676 		return -1;
677 }
678 
679 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
680 {
681 	if (GRAPHICS_VER(engine->i915) >= 12)
682 		return 0x12;
683 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
684 		return 0x18;
685 	else
686 		return -1;
687 }
688 
/*
 * Dword index of the indirect context pointer entry: two dwords after
 * the per-context workaround batch entry. A negative result from
 * lrc_ring_wa_bb_per_ctx() is passed through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	const int per_ctx = lrc_ring_wa_bb_per_ctx(engine);

	return per_ctx < 0 ? per_ctx : per_ctx + 2;
}
699 
/*
 * Dword index of the indirect context offset entry: two dwords after
 * the indirect context pointer entry. A negative result from
 * lrc_ring_indirect_ptr() is passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	const int ptr = lrc_ring_indirect_ptr(engine);

	return ptr < 0 ? ptr : ptr + 2;
}
710 
711 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
712 {
713 
714 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
715 		/*
716 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
717 		 * simply to match the RCS context image layout.
718 		 */
719 		return 0xc6;
720 	else if (engine->class != RENDER_CLASS)
721 		return -1;
722 	else if (GRAPHICS_VER(engine->i915) >= 12)
723 		return 0xb6;
724 	else if (GRAPHICS_VER(engine->i915) >= 11)
725 		return 0xaa;
726 	else
727 		return -1;
728 }
729 
730 static u32
731 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
732 {
733 	switch (GRAPHICS_VER(engine->i915)) {
734 	default:
735 		MISSING_CASE(GRAPHICS_VER(engine->i915));
736 		fallthrough;
737 	case 12:
738 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
739 	case 11:
740 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
741 	case 9:
742 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
743 	case 8:
744 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
745 	}
746 }
747 
748 static void
749 lrc_setup_indirect_ctx(u32 *regs,
750 		       const struct intel_engine_cs *engine,
751 		       u32 ctx_bb_ggtt_addr,
752 		       u32 size)
753 {
754 	GEM_BUG_ON(!size);
755 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
756 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
757 	regs[lrc_ring_indirect_ptr(engine) + 1] =
758 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
759 
760 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
761 	regs[lrc_ring_indirect_offset(engine) + 1] =
762 		lrc_ring_indirect_offset_default(engine) << 6;
763 }
764 
765 static void init_common_regs(u32 * const regs,
766 			     const struct intel_context *ce,
767 			     const struct intel_engine_cs *engine,
768 			     bool inhibit)
769 {
770 	u32 ctl;
771 
772 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
773 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
774 	if (inhibit)
775 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
776 	if (GRAPHICS_VER(engine->i915) < 11)
777 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
778 					   CTX_CTRL_RS_CTX_ENABLE);
779 	regs[CTX_CONTEXT_CONTROL] = ctl;
780 
781 	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
782 }
783 
784 static void init_wa_bb_regs(u32 * const regs,
785 			    const struct intel_engine_cs *engine)
786 {
787 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
788 
789 	if (wa_ctx->per_ctx.size) {
790 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
791 
792 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
793 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
794 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
795 	}
796 
797 	if (wa_ctx->indirect_ctx.size) {
798 		lrc_setup_indirect_ctx(regs, engine,
799 				       i915_ggtt_offset(wa_ctx->vma) +
800 				       wa_ctx->indirect_ctx.offset,
801 				       wa_ctx->indirect_ctx.size);
802 	}
803 }
804 
805 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
806 {
807 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
808 		/* 64b PPGTT (48bit canonical)
809 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
810 		 * other PDP Descriptors are ignored.
811 		 */
812 		ASSIGN_CTX_PML4(ppgtt, regs);
813 	} else {
814 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
815 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
816 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
817 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
818 	}
819 }
820 
821 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
822 {
823 	if (i915_is_ggtt(vm))
824 		return i915_vm_to_ggtt(vm)->alias;
825 	else
826 		return i915_vm_to_ppgtt(vm);
827 }
828 
829 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
830 {
831 	int x;
832 
833 	x = lrc_ring_mi_mode(engine);
834 	if (x != -1) {
835 		regs[x + 1] &= ~STOP_RING;
836 		regs[x + 1] |= STOP_RING << 16;
837 	}
838 }
839 
840 static void __lrc_init_regs(u32 *regs,
841 			    const struct intel_context *ce,
842 			    const struct intel_engine_cs *engine,
843 			    bool inhibit)
844 {
845 	/*
846 	 * A context is actually a big batch buffer with several
847 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
848 	 * values we are setting here are only for the first context restore:
849 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
850 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
851 	 * we are not initializing here).
852 	 *
853 	 * Must keep consistent with virtual_update_register_offsets().
854 	 */
855 
856 	if (inhibit)
857 		memset(regs, 0, PAGE_SIZE);
858 
859 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
860 
861 	init_common_regs(regs, ce, engine, inhibit);
862 	init_ppgtt_regs(regs, vm_alias(ce->vm));
863 
864 	init_wa_bb_regs(regs, engine);
865 
866 	__reset_stop_ring(regs, engine);
867 }
868 
869 void lrc_init_regs(const struct intel_context *ce,
870 		   const struct intel_engine_cs *engine,
871 		   bool inhibit)
872 {
873 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
874 }
875 
876 void lrc_reset_regs(const struct intel_context *ce,
877 		    const struct intel_engine_cs *engine)
878 {
879 	__reset_stop_ring(ce->lrc_reg_state, engine);
880 }
881 
882 static void
883 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
884 {
885 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
886 		return;
887 
888 	vaddr += engine->context_size;
889 
890 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
891 }
892 
893 static void
894 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
895 {
896 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
897 		return;
898 
899 	vaddr += engine->context_size;
900 
901 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
902 		drm_err_once(&engine->i915->drm,
903 			     "%s context redzone overwritten!\n",
904 			     engine->name);
905 }
906 
907 static u32 context_wa_bb_offset(const struct intel_context *ce)
908 {
909 	return PAGE_SIZE * ce->wa_bb_page;
910 }
911 
912 static u32 *context_indirect_bb(const struct intel_context *ce)
913 {
914 	void *ptr;
915 
916 	GEM_BUG_ON(!ce->wa_bb_page);
917 
918 	ptr = ce->lrc_reg_state;
919 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
920 	ptr += context_wa_bb_offset(ce);
921 
922 	return ptr;
923 }
924 
925 void lrc_init_state(struct intel_context *ce,
926 		    struct intel_engine_cs *engine,
927 		    void *state)
928 {
929 	bool inhibit = true;
930 
931 	set_redzone(state, engine);
932 
933 	if (engine->default_state) {
934 		shmem_read(engine->default_state, 0,
935 			   state, engine->context_size);
936 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
937 		inhibit = false;
938 	}
939 
940 	/* Clear the ppHWSP (inc. per-context counters) */
941 	memset(state, 0, PAGE_SIZE);
942 
943 	/* Clear the indirect wa and storage */
944 	if (ce->wa_bb_page)
945 		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
946 
947 	/*
948 	 * The second page of the context object contains some registers which
949 	 * must be set up prior to the first execution.
950 	 */
951 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
952 }
953 
954 u32 lrc_indirect_bb(const struct intel_context *ce)
955 {
956 	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
957 }
958 
959 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
960 {
961 	/* If predication is active, this will be noop'ed */
962 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
963 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
964 	*cs++ = 0;
965 	*cs++ = 0; /* No predication */
966 
967 	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
968 	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
969 	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
970 
971 	/* Instructions are no longer predicated (disabled), we can proceed */
972 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
973 	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
974 	*cs++ = 0;
975 	*cs++ = 1; /* enable predication before the next BB */
976 
977 	*cs++ = MI_BATCH_BUFFER_END;
978 	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
979 
980 	return cs;
981 }
982 
983 static struct i915_vma *
984 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
985 {
986 	struct drm_i915_gem_object *obj;
987 	struct i915_vma *vma;
988 	u32 context_size;
989 
990 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
991 
992 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
993 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
994 
995 	if (GRAPHICS_VER(engine->i915) == 12) {
996 		ce->wa_bb_page = context_size / PAGE_SIZE;
997 		context_size += PAGE_SIZE;
998 	}
999 
1000 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1001 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1002 		context_size += PARENT_SCRATCH_SIZE;
1003 	}
1004 
1005 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1006 					  I915_BO_ALLOC_PM_VOLATILE);
1007 	if (IS_ERR(obj))
1008 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1009 	if (IS_ERR(obj))
1010 		return ERR_CAST(obj);
1011 
1012 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1013 	if (IS_ERR(vma)) {
1014 		i915_gem_object_put(obj);
1015 		return vma;
1016 	}
1017 
1018 	return vma;
1019 }
1020 
1021 static struct intel_timeline *
1022 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1023 {
1024 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1025 
1026 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1027 }
1028 
1029 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1030 {
1031 	struct intel_ring *ring;
1032 	struct i915_vma *vma;
1033 	int err;
1034 
1035 	GEM_BUG_ON(ce->state);
1036 
1037 	vma = __lrc_alloc_state(ce, engine);
1038 	if (IS_ERR(vma))
1039 		return PTR_ERR(vma);
1040 
1041 	ring = intel_engine_create_ring(engine, ce->ring_size);
1042 	if (IS_ERR(ring)) {
1043 		err = PTR_ERR(ring);
1044 		goto err_vma;
1045 	}
1046 
1047 	if (!page_mask_bits(ce->timeline)) {
1048 		struct intel_timeline *tl;
1049 
1050 		/*
1051 		 * Use the static global HWSP for the kernel context, and
1052 		 * a dynamically allocated cacheline for everyone else.
1053 		 */
1054 		if (unlikely(ce->timeline))
1055 			tl = pinned_timeline(ce, engine);
1056 		else
1057 			tl = intel_timeline_create(engine->gt);
1058 		if (IS_ERR(tl)) {
1059 			err = PTR_ERR(tl);
1060 			goto err_ring;
1061 		}
1062 
1063 		ce->timeline = tl;
1064 	}
1065 
1066 	ce->ring = ring;
1067 	ce->state = vma;
1068 
1069 	return 0;
1070 
1071 err_ring:
1072 	intel_ring_put(ring);
1073 err_vma:
1074 	i915_vma_put(vma);
1075 	return err;
1076 }
1077 
1078 void lrc_reset(struct intel_context *ce)
1079 {
1080 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1081 
1082 	intel_ring_reset(ce->ring, ce->ring->emit);
1083 
1084 	/* Scrub away the garbage */
1085 	lrc_init_regs(ce, ce->engine, true);
1086 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1087 }
1088 
1089 int
1090 lrc_pre_pin(struct intel_context *ce,
1091 	    struct intel_engine_cs *engine,
1092 	    struct i915_gem_ww_ctx *ww,
1093 	    void **vaddr)
1094 {
1095 	GEM_BUG_ON(!ce->state);
1096 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1097 
1098 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1099 					 i915_coherent_map_type(ce->engine->i915,
1100 								ce->state->obj,
1101 								false) |
1102 					 I915_MAP_OVERRIDE);
1103 
1104 	return PTR_ERR_OR_ZERO(*vaddr);
1105 }
1106 
1107 int
1108 lrc_pin(struct intel_context *ce,
1109 	struct intel_engine_cs *engine,
1110 	void *vaddr)
1111 {
1112 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1113 
1114 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1115 		lrc_init_state(ce, engine, vaddr);
1116 
1117 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1118 	return 0;
1119 }
1120 
1121 void lrc_unpin(struct intel_context *ce)
1122 {
1123 	if (unlikely(ce->parallel.last_rq)) {
1124 		i915_request_put(ce->parallel.last_rq);
1125 		ce->parallel.last_rq = NULL;
1126 	}
1127 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1128 		      ce->engine);
1129 }
1130 
1131 void lrc_post_unpin(struct intel_context *ce)
1132 {
1133 	i915_gem_object_unpin_map(ce->state->obj);
1134 }
1135 
1136 void lrc_fini(struct intel_context *ce)
1137 {
1138 	if (!ce->state)
1139 		return;
1140 
1141 	intel_ring_put(fetch_and_zero(&ce->ring));
1142 	i915_vma_put(fetch_and_zero(&ce->state));
1143 }
1144 
1145 void lrc_destroy(struct kref *kref)
1146 {
1147 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1148 
1149 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1150 	GEM_BUG_ON(intel_context_is_pinned(ce));
1151 
1152 	lrc_fini(ce);
1153 
1154 	intel_context_fini(ce);
1155 	intel_context_free(ce);
1156 }
1157 
1158 static u32 *
1159 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1160 {
1161 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1162 		MI_SRM_LRM_GLOBAL_GTT |
1163 		MI_LRI_LRM_CS_MMIO;
1164 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1165 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1166 		CTX_TIMESTAMP * sizeof(u32);
1167 	*cs++ = 0;
1168 
1169 	*cs++ = MI_LOAD_REGISTER_REG |
1170 		MI_LRR_SOURCE_CS_MMIO |
1171 		MI_LRI_LRM_CS_MMIO;
1172 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1173 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1174 
1175 	*cs++ = MI_LOAD_REGISTER_REG |
1176 		MI_LRR_SOURCE_CS_MMIO |
1177 		MI_LRI_LRM_CS_MMIO;
1178 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1179 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1180 
1181 	return cs;
1182 }
1183 
1184 static u32 *
1185 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1186 {
1187 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1188 
1189 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1190 		MI_SRM_LRM_GLOBAL_GTT |
1191 		MI_LRI_LRM_CS_MMIO;
1192 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1193 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1194 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1195 	*cs++ = 0;
1196 
1197 	return cs;
1198 }
1199 
1200 static u32 *
1201 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1202 {
1203 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1204 
1205 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1206 		MI_SRM_LRM_GLOBAL_GTT |
1207 		MI_LRI_LRM_CS_MMIO;
1208 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1209 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1210 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1211 	*cs++ = 0;
1212 
1213 	*cs++ = MI_LOAD_REGISTER_REG |
1214 		MI_LRR_SOURCE_CS_MMIO |
1215 		MI_LRI_LRM_CS_MMIO;
1216 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1217 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1218 
1219 	return cs;
1220 }
1221 
1222 /*
1223  * On DG2 during context restore of a preempted context in GPGPU mode,
1224  * RCS restore hang is detected. This is extremely timing dependent.
1225  * To address this below sw wabb is implemented for DG2 A steppings.
1226  */
1227 static u32 *
1228 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1229 {
1230 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1231 	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1232 	*cs++ = 0x21;
1233 
1234 	*cs++ = MI_LOAD_REGISTER_REG;
1235 	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1236 	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1237 
1238 	*cs++ = MI_LOAD_REGISTER_REG;
1239 	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1240 	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1241 
1242 	return cs;
1243 }
1244 
1245 /*
1246  * The bspec's tuning guide asks us to program a vertical watermark value of
1247  * 0x3FF.  However this register is not saved/restored properly by the
1248  * hardware, so we're required to apply the desired value via INDIRECT_CTX
1249  * batch buffer to ensure the value takes effect properly.  All other bits
1250  * in this register should remain at 0 (the hardware default).
1251  */
1252 static u32 *
1253 dg2_emit_draw_watermark_setting(u32 *cs)
1254 {
1255 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1256 	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1257 	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1258 
1259 	return cs;
1260 }
1261 
1262 static u32 *
1263 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1264 {
1265 	cs = gen12_emit_timestamp_wa(ce, cs);
1266 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1267 	cs = gen12_emit_restore_scratch(ce, cs);
1268 
1269 	/* Wa_22011450934:dg2 */
1270 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1271 	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1272 		cs = dg2_emit_rcs_hang_wabb(ce, cs);
1273 
1274 	/* Wa_16013000631:dg2 */
1275 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1276 	    IS_DG2_G11(ce->engine->i915))
1277 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1278 
1279 	/* hsdes: 1809175790 */
1280 	if (!HAS_FLAT_CCS(ce->engine->i915))
1281 		cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);
1282 
1283 	/* Wa_16014892111 */
1284 	if (IS_DG2(ce->engine->i915))
1285 		cs = dg2_emit_draw_watermark_setting(cs);
1286 
1287 	return cs;
1288 }
1289 
1290 static u32 *
1291 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1292 {
1293 	cs = gen12_emit_timestamp_wa(ce, cs);
1294 	cs = gen12_emit_restore_scratch(ce, cs);
1295 
1296 	/* Wa_16013000631:dg2 */
1297 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1298 	    IS_DG2_G11(ce->engine->i915))
1299 		if (ce->engine->class == COMPUTE_CLASS)
1300 			cs = gen8_emit_pipe_control(cs,
1301 						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1302 						    0);
1303 
1304 	/* hsdes: 1809175790 */
1305 	if (!HAS_FLAT_CCS(ce->engine->i915)) {
1306 		if (ce->engine->class == VIDEO_DECODE_CLASS)
1307 			cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
1308 		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1309 			cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
1310 	}
1311 
1312 	return cs;
1313 }
1314 
1315 static void
1316 setup_indirect_ctx_bb(const struct intel_context *ce,
1317 		      const struct intel_engine_cs *engine,
1318 		      u32 *(*emit)(const struct intel_context *, u32 *))
1319 {
1320 	u32 * const start = context_indirect_bb(ce);
1321 	u32 *cs;
1322 
1323 	cs = emit(ce, start);
1324 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1325 	while ((unsigned long)cs % CACHELINE_BYTES)
1326 		*cs++ = MI_NOOP;
1327 
1328 	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1329 	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1330 
1331 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1332 			       lrc_indirect_bb(ce),
1333 			       (cs - start) * sizeof(*cs));
1334 }
1335 
1336 /*
1337  * The context descriptor encodes various attributes of a context,
1338  * including its GTT address and some flags. Because it's fairly
1339  * expensive to calculate, we'll just do it once and cache the result,
1340  * which remains valid until the context is unpinned.
1341  *
1342  * This is what a descriptor looks like, from LSB to MSB::
1343  *
1344  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1345  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1346  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1347  *      bits 53-54:    mbz, reserved for use by hardware
1348  *      bits 55-63:    group ID, currently unused and set to 0
1349  *
1350  * Starting from Gen11, the upper dword of the descriptor has a new format:
1351  *
1352  *      bits 32-36:    reserved
1353  *      bits 37-47:    SW context ID
1354  *      bits 48-53:    engine instance
1355  *      bit 54:        mbz, reserved for use by hardware
1356  *      bits 55-60:    SW counter
1357  *      bits 61-63:    engine class
1358  *
1359  * On Xe_HP, the upper dword of the descriptor has a new format:
1360  *
1361  *      bits 32-37:    virtual function number
1362  *      bit 38:        mbz, reserved for use by hardware
1363  *      bits 39-54:    SW context ID
1364  *      bits 55-57:    reserved
1365  *      bits 58-63:    SW counter
1366  *
1367  * engine info, SW context ID and SW counter need to form a unique number
1368  * (Context ID) per lrc.
1369  */
1370 static u32 lrc_descriptor(const struct intel_context *ce)
1371 {
1372 	u32 desc;
1373 
1374 	desc = INTEL_LEGACY_32B_CONTEXT;
1375 	if (i915_vm_is_4lvl(ce->vm))
1376 		desc = INTEL_LEGACY_64B_CONTEXT;
1377 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1378 
1379 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1380 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1381 		desc |= GEN8_CTX_L3LLC_COHERENT;
1382 
1383 	return i915_ggtt_offset(ce->state) | desc;
1384 }
1385 
1386 u32 lrc_update_regs(const struct intel_context *ce,
1387 		    const struct intel_engine_cs *engine,
1388 		    u32 head)
1389 {
1390 	struct intel_ring *ring = ce->ring;
1391 	u32 *regs = ce->lrc_reg_state;
1392 
1393 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1394 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1395 
1396 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1397 	regs[CTX_RING_HEAD] = head;
1398 	regs[CTX_RING_TAIL] = ring->tail;
1399 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1400 
1401 	/* RPCS */
1402 	if (engine->class == RENDER_CLASS) {
1403 		regs[CTX_R_PWR_CLK_STATE] =
1404 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1405 
1406 		i915_oa_init_reg_state(ce, engine);
1407 	}
1408 
1409 	if (ce->wa_bb_page) {
1410 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1411 
1412 		fn = gen12_emit_indirect_ctx_xcs;
1413 		if (ce->engine->class == RENDER_CLASS)
1414 			fn = gen12_emit_indirect_ctx_rcs;
1415 
1416 		/* Mutually exclusive wrt to global indirect bb */
1417 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1418 		setup_indirect_ctx_bb(ce, engine, fn);
1419 	}
1420 
1421 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1422 }
1423 
1424 void lrc_update_offsets(struct intel_context *ce,
1425 			struct intel_engine_cs *engine)
1426 {
1427 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1428 }
1429 
1430 void lrc_check_regs(const struct intel_context *ce,
1431 		    const struct intel_engine_cs *engine,
1432 		    const char *when)
1433 {
1434 	const struct intel_ring *ring = ce->ring;
1435 	u32 *regs = ce->lrc_reg_state;
1436 	bool valid = true;
1437 	int x;
1438 
1439 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1440 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1441 		       engine->name,
1442 		       regs[CTX_RING_START],
1443 		       i915_ggtt_offset(ring->vma));
1444 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1445 		valid = false;
1446 	}
1447 
1448 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1449 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1450 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1451 		       engine->name,
1452 		       regs[CTX_RING_CTL],
1453 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1454 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1455 		valid = false;
1456 	}
1457 
1458 	x = lrc_ring_mi_mode(engine);
1459 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1460 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1461 		       engine->name, regs[x + 1]);
1462 		regs[x + 1] &= ~STOP_RING;
1463 		regs[x + 1] |= STOP_RING << 16;
1464 		valid = false;
1465 	}
1466 
1467 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1468 }
1469 
1470 /*
1471  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1472  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1473  * but there is a slight complication as this is applied in WA batch where the
1474  * values are only initialized once so we cannot take register value at the
1475  * beginning and reuse it further; hence we save its value to memory, upload a
1476  * constant value with bit21 set and then we restore it back with the saved value.
1477  * To simplify the WA, a constant value is formed by using the default value
1478  * of this register. This shouldn't be a problem because we are only modifying
1479  * it for a short period and this batch is non-preemptible. We can of course
1480  * use additional instructions that read the actual value of the register
1481  * at that time and set our bit of interest but it makes the WA complicated.
1482  *
1483  * This WA is also required for Gen9 so extracting as a function avoids
1484  * code duplication.
1485  */
1486 static u32 *
1487 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1488 {
1489 	/* NB no one else is allowed to scribble over scratch + 256! */
1490 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1491 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1492 	*batch++ = intel_gt_scratch_offset(engine->gt,
1493 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1494 	*batch++ = 0;
1495 
1496 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1497 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1498 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1499 
1500 	batch = gen8_emit_pipe_control(batch,
1501 				       PIPE_CONTROL_CS_STALL |
1502 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1503 				       0);
1504 
1505 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1506 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1507 	*batch++ = intel_gt_scratch_offset(engine->gt,
1508 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1509 	*batch++ = 0;
1510 
1511 	return batch;
1512 }
1513 
1514 /*
1515  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1516  * initialized at the beginning and shared across all contexts but this field
1517  * helps us to have multiple batches at different offsets and select them based
1518  * on a criterion. At the moment this batch always starts at the beginning of the page
1519  * and at this point we don't have multiple wa_ctx batch buffers.
1520  *
1521  * The number of WAs applied is not known at the beginning; we use this field
1522  * to return the number of DWORDS written.
1523  *
1524  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1525  * so it adds NOOPs as padding to make it cacheline aligned.
1526  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1527  * makes a complete batch buffer.
1528  */
1529 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1530 {
1531 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1532 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1533 
1534 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1535 	if (IS_BROADWELL(engine->i915))
1536 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1537 
1538 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1539 	/* Actual scratch location is at 128 bytes offset */
1540 	batch = gen8_emit_pipe_control(batch,
1541 				       PIPE_CONTROL_FLUSH_L3 |
1542 				       PIPE_CONTROL_STORE_DATA_INDEX |
1543 				       PIPE_CONTROL_CS_STALL |
1544 				       PIPE_CONTROL_QW_WRITE,
1545 				       LRC_PPHWSP_SCRATCH_ADDR);
1546 
1547 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1548 
1549 	/* Pad to end of cacheline */
1550 	while ((unsigned long)batch % CACHELINE_BYTES)
1551 		*batch++ = MI_NOOP;
1552 
1553 	/*
1554 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1555 	 * execution depends on the length specified in terms of cache lines
1556 	 * in the register CTX_RCS_INDIRECT_CTX
1557 	 */
1558 
1559 	return batch;
1560 }
1561 
1562 struct lri {
1563 	i915_reg_t reg;
1564 	u32 value;
1565 };
1566 
1567 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1568 {
1569 	GEM_BUG_ON(!count || count > 63);
1570 
1571 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1572 	do {
1573 		*batch++ = i915_mmio_reg_offset(lri->reg);
1574 		*batch++ = lri->value;
1575 	} while (lri++, --count);
1576 	*batch++ = MI_NOOP;
1577 
1578 	return batch;
1579 }
1580 
1581 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1582 {
1583 	static const struct lri lri[] = {
1584 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1585 		{
1586 			COMMON_SLICE_CHICKEN2,
1587 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1588 				       0),
1589 		},
1590 
1591 		/* BSpec: 11391 */
1592 		{
1593 			FF_SLICE_CHICKEN,
1594 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1595 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1596 		},
1597 
1598 		/* BSpec: 11299 */
1599 		{
1600 			_3D_CHICKEN3,
1601 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1602 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1603 		}
1604 	};
1605 
1606 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1607 
1608 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1609 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1610 
1611 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1612 	batch = gen8_emit_pipe_control(batch,
1613 				       PIPE_CONTROL_FLUSH_L3 |
1614 				       PIPE_CONTROL_STORE_DATA_INDEX |
1615 				       PIPE_CONTROL_CS_STALL |
1616 				       PIPE_CONTROL_QW_WRITE,
1617 				       LRC_PPHWSP_SCRATCH_ADDR);
1618 
1619 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1620 
1621 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1622 	if (HAS_POOLED_EU(engine->i915)) {
1623 		/*
1624 		 * EU pool configuration is setup along with golden context
1625 		 * during context initialization. This value depends on
1626 		 * device type (2x6 or 3x6) and needs to be updated based
1627 		 * on which subslice is disabled especially for 2x6
1628 		 * devices, however it is safe to load default
1629 		 * configuration of 3x6 device instead of masking off
1630 		 * corresponding bits because HW ignores bits of a disabled
1631 		 * subslice and drops down to appropriate config. Please
1632 		 * see render_state_setup() in i915_gem_render_state.c for
1633 		 * possible configurations, to avoid duplication they are
1634 		 * not shown here again.
1635 		 */
1636 		*batch++ = GEN9_MEDIA_POOL_STATE;
1637 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1638 		*batch++ = 0x00777000;
1639 		*batch++ = 0;
1640 		*batch++ = 0;
1641 		*batch++ = 0;
1642 	}
1643 
1644 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1645 
1646 	/* Pad to end of cacheline */
1647 	while ((unsigned long)batch % CACHELINE_BYTES)
1648 		*batch++ = MI_NOOP;
1649 
1650 	return batch;
1651 }
1652 
1653 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1654 
1655 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1656 {
1657 	struct drm_i915_gem_object *obj;
1658 	struct i915_vma *vma;
1659 	int err;
1660 
1661 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1662 	if (IS_ERR(obj))
1663 		return PTR_ERR(obj);
1664 
1665 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1666 	if (IS_ERR(vma)) {
1667 		err = PTR_ERR(vma);
1668 		goto err;
1669 	}
1670 
1671 	engine->wa_ctx.vma = vma;
1672 	return 0;
1673 
1674 err:
1675 	i915_gem_object_put(obj);
1676 	return err;
1677 }
1678 
1679 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1680 {
1681 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1682 }
1683 
1684 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1685 
1686 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1687 {
1688 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1689 	struct i915_wa_ctx_bb *wa_bb[] = {
1690 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1691 	};
1692 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1693 	struct i915_gem_ww_ctx ww;
1694 	void *batch, *batch_ptr;
1695 	unsigned int i;
1696 	int err;
1697 
1698 	if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1699 		return;
1700 
1701 	switch (GRAPHICS_VER(engine->i915)) {
1702 	case 12:
1703 	case 11:
1704 		return;
1705 	case 9:
1706 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1707 		wa_bb_fn[1] = NULL;
1708 		break;
1709 	case 8:
1710 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1711 		wa_bb_fn[1] = NULL;
1712 		break;
1713 	default:
1714 		MISSING_CASE(GRAPHICS_VER(engine->i915));
1715 		return;
1716 	}
1717 
1718 	err = lrc_create_wa_ctx(engine);
1719 	if (err) {
1720 		/*
1721 		 * We continue even if we fail to initialize WA batch
1722 		 * because we only expect rare glitches but nothing
1723 		 * critical to prevent us from using GPU
1724 		 */
1725 		drm_err(&engine->i915->drm,
1726 			"Ignoring context switch w/a allocation error:%d\n",
1727 			err);
1728 		return;
1729 	}
1730 
1731 	if (!engine->wa_ctx.vma)
1732 		return;
1733 
1734 	i915_gem_ww_ctx_init(&ww, true);
1735 retry:
1736 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1737 	if (!err)
1738 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1739 	if (err)
1740 		goto err;
1741 
1742 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1743 	if (IS_ERR(batch)) {
1744 		err = PTR_ERR(batch);
1745 		goto err_unpin;
1746 	}
1747 
1748 	/*
1749 	 * Emit the two workaround batch buffers, recording the offset from the
1750 	 * start of the workaround batch buffer object for each and their
1751 	 * respective sizes.
1752 	 */
1753 	batch_ptr = batch;
1754 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1755 		wa_bb[i]->offset = batch_ptr - batch;
1756 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1757 						  CACHELINE_BYTES))) {
1758 			err = -EINVAL;
1759 			break;
1760 		}
1761 		if (wa_bb_fn[i])
1762 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1763 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1764 	}
1765 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1766 
1767 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1768 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1769 
1770 	/* Verify that we can handle failure to setup the wa_ctx */
1771 	if (!err)
1772 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1773 
1774 err_unpin:
1775 	if (err)
1776 		i915_vma_unpin(wa_ctx->vma);
1777 err:
1778 	if (err == -EDEADLK) {
1779 		err = i915_gem_ww_ctx_backoff(&ww);
1780 		if (!err)
1781 			goto retry;
1782 	}
1783 	i915_gem_ww_ctx_fini(&ww);
1784 
1785 	if (err) {
1786 		i915_vma_put(engine->wa_ctx.vma);
1787 
1788 		/* Clear all flags to prevent further use */
1789 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1790 	}
1791 }
1792 
1793 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1794 {
1795 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1796 	stats->runtime.num_underflow++;
1797 	stats->runtime.max_underflow =
1798 		max_t(u32, stats->runtime.max_underflow, -dt);
1799 #endif
1800 }
1801 
1802 static u32 lrc_get_runtime(const struct intel_context *ce)
1803 {
1804 	/*
1805 	 * We can use either ppHWSP[16] which is recorded before the context
1806 	 * switch (and so excludes the cost of context switches) or use the
1807 	 * value from the context image itself, which is saved/restored earlier
1808 	 * and so includes the cost of the save.
1809 	 */
1810 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1811 }
1812 
1813 void lrc_update_runtime(struct intel_context *ce)
1814 {
1815 	struct intel_context_stats *stats = &ce->stats;
1816 	u32 old;
1817 	s32 dt;
1818 
1819 	old = stats->runtime.last;
1820 	stats->runtime.last = lrc_get_runtime(ce);
1821 	dt = stats->runtime.last - old;
1822 	if (!dt)
1823 		return;
1824 
1825 	if (unlikely(dt < 0)) {
1826 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1827 			 old, stats->runtime.last, dt);
1828 		st_runtime_underflow(stats, dt);
1829 		return;
1830 	}
1831 
1832 	ewma_runtime_add(&stats->runtime.avg, dt);
1833 	stats->runtime.total += dt;
1834 }
1835 
1836 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1837 #include "selftest_lrc.c"
1838 #endif
1839