xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 0d8ee5ba)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
/*
 * Expand a compact, byte-encoded table of register offsets (see the
 * NOP/LRI/REG/REG16 encodings below) into MI_LOAD_REGISTER_IMM command
 * headers and register offsets inside the context image @regs.  The
 * register *values* are not written here; only every other dword (the
 * offsets) is filled in, leaving the value slots untouched.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
/* Skip (leave untouched) @x dwords of the context image. */
#define NOP(x) (BIT(7) | (x))
/* Start an MI_LRI block of @count registers; @flags may include POSTED. */
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
/* One-byte encoding for register offsets below 0x200. */
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
/* Two-byte, 7-bits-per-byte encoding for offsets below 0x10000. */
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
/* Table terminator. */
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI header: low 6 bits = register count, top 2 bits = flags */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/* Reassemble a possibly multi-byte offset, MSB first. */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2; /* skip over the (unwritten) value slot */
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}
77 
/*
 * Context-image layout table for gen8 non-render (xCS) engines, consumed
 * by set_offsets().  Offsets are relative to the engine's mmio_base.
 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};
112 
/* Context-image layout table for gen9-11 non-render engines. */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};
196 
/* Context-image layout table for gen12 non-render engines. */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
228 
/* Context-image layout table for DG2 non-render engines. */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
262 
/* Context-image layout table for the gen8 render engine. */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};
299 
/* Context-image layout table for the gen9/gen10 render engine. */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};
383 
/* Context-image layout table for the gen11 render engine. */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};
424 
/* Context-image layout table for the gen12 render engine. */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	/*
	 * NOTE(review): 0x588 appears six times in a row below; this matches
	 * the hardware context layout, presumably a register with multiple
	 * consecutive slots — confirm against bspec before changing.
	 */
	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};
520 
/* Context-image layout table for the Xe_HP (12.50+) render engine. */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
561 
/* Context-image layout table for the DG2 (12.55+) render engine. */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
604 
605 #undef END
606 #undef REG16
607 #undef REG
608 #undef LRI
609 #undef NOP
610 
/*
 * Select the context-image offset table matching @engine's class and
 * graphics IP version.  The version checks must stay in descending
 * order: each branch catches everything the earlier ones did not.
 */
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatic fixup the register state between the
	 * physical engines for virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}
646 
647 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
648 {
649 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
650 		return 0x70;
651 	else if (GRAPHICS_VER(engine->i915) >= 12)
652 		return 0x60;
653 	else if (GRAPHICS_VER(engine->i915) >= 9)
654 		return 0x54;
655 	else if (engine->class == RENDER_CLASS)
656 		return 0x58;
657 	else
658 		return -1;
659 }
660 
661 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
662 {
663 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
664 		return 0x84;
665 	else if (GRAPHICS_VER(engine->i915) >= 12)
666 		return 0x74;
667 	else if (GRAPHICS_VER(engine->i915) >= 9)
668 		return 0x68;
669 	else if (engine->class == RENDER_CLASS)
670 		return 0xd8;
671 	else
672 		return -1;
673 }
674 
675 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
676 {
677 	if (GRAPHICS_VER(engine->i915) >= 12)
678 		return 0x12;
679 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
680 		return 0x18;
681 	else
682 		return -1;
683 }
684 
/*
 * Dword index of the INDIRECT_CTX pointer slot: it sits one register
 * pair (two dwords) after the per-context wa_bb slot.  Propagates -1
 * when the base slot is absent.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int base = lrc_ring_wa_bb_per_ctx(engine);

	return base < 0 ? base : base + 2;
}
695 
/*
 * Dword index of the INDIRECT_CTX_OFFSET slot: one register pair after
 * the INDIRECT_CTX pointer.  Propagates -1 when the base slot is absent.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int base = lrc_ring_indirect_ptr(engine);

	return base < 0 ? base : base + 2;
}
706 
707 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
708 {
709 
710 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
711 		/*
712 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
713 		 * simply to match the RCS context image layout.
714 		 */
715 		return 0xc6;
716 	else if (engine->class != RENDER_CLASS)
717 		return -1;
718 	else if (GRAPHICS_VER(engine->i915) >= 12)
719 		return 0xb6;
720 	else if (GRAPHICS_VER(engine->i915) >= 11)
721 		return 0xaa;
722 	else
723 		return -1;
724 }
725 
/*
 * Default hardware value for the INDIRECT_CTX_OFFSET field, per
 * graphics version.  Unknown versions warn and fall through to the
 * gen12 default.
 */
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}
743 
/*
 * Point the context image's INDIRECT_CTX slot at a batch buffer.
 *
 * @regs: start of the context register state
 * @ctx_bb_ggtt_addr: GGTT address of the indirect batch (cacheline aligned)
 * @size: batch size in bytes; encoded in cachelines in the low bits of
 *        the pointer register, hence the alignment requirement.
 */
static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	/* +1: the value dword of the (offset, value) register pair */
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}
760 
/*
 * Program CTX_CONTEXT_CONTROL and CTX_TIMESTAMP in the context image.
 * When @inhibit is set, context restore is inhibited so the engine runs
 * from the freshly initialised image rather than restoring stale state.
 */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	/* Carry over the last accumulated runtime for busyness tracking */
	regs[CTX_TIMESTAMP] = ce->runtime.last;
}
779 
/*
 * Wire the engine's global workaround batch buffers (per-context and
 * indirect) into the context image, when they exist.
 */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* bit 0 set marks the per-ctx pointer as valid */
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}
800 
/* Program the PPGTT page-directory pointers into the context image. */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		/* 32b PPGTT: all four PDP descriptors are used */
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}
816 
817 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
818 {
819 	if (i915_is_ggtt(vm))
820 		return i915_vm_to_ggtt(vm)->alias;
821 	else
822 		return i915_vm_to_ppgtt(vm);
823 }
824 
825 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
826 {
827 	int x;
828 
829 	x = lrc_ring_mi_mode(engine);
830 	if (x != -1) {
831 		regs[x + 1] &= ~STOP_RING;
832 		regs[x + 1] |= STOP_RING << 16;
833 	}
834 }
835 
/*
 * Initialise the register-state portion of a context image: command
 * headers + register offsets, then the values this driver cares about.
 */
static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	/* Scrub first when no valid default state will be restored over us */
	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}
864 
/* Public wrapper: (re)initialise @ce's mapped register state. */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}
871 
/* Clear a stale STOP_RING from @ce's register state after a reset. */
void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}
877 
/*
 * Fill the debug-only guard page following the context image with a
 * known pattern so overruns can be detected in check_redzone().
 */
static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}
888 
/*
 * Verify the debug-only guard page is still intact; complain (once) if
 * anything wrote past the end of the context image.
 */
static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}
902 
/*
 * Populate a freshly mapped context image @state for @ce: copy the
 * engine's recorded default state if available, then initialise the
 * register page.  Ordering matters: the ppHWSP (first page) is cleared
 * *after* the default-state copy so its counters always start from zero.
 */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		/* Default state provides valid content; no need to inhibit */
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}
927 
/*
 * Allocate the backing object + GGTT vma for a context image.  Prefers
 * local memory, falling back to shmem.  On gen12 an extra page is
 * appended for the per-context indirect workaround batch (wa_bb_page).
 * Returns the vma or an ERR_PTR; the object reference is owned by the
 * returned vma (and dropped here on vma failure).
 */
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		/* Record the page index before growing the allocation */
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}
960 
/*
 * Replace a pre-assigned (pinned) timeline on @ce with a fresh one
 * derived from the same engine.  The old pointer is consumed
 * (fetch_and_zero) and its page-masked bits are passed through.
 */
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}
968 
/*
 * Allocate everything a logical ring context needs: state vma, ring and
 * (if not already provided) a timeline.  On success ce->state/ce->ring
 * are populated; on failure all partial allocations are released via
 * the goto-cleanup chain.  Returns 0 or a negative errno.
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}
1017 
/*
 * Reset a pinned context after a GPU hang: rewind the ring and rebuild
 * the register state from scratch (inhibit=true scrubs the image).
 */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}
1028 
/*
 * Map the context state object into the kernel address space ahead of
 * pinning; the mapping pointer is returned via @vaddr.  Returns 0 or a
 * negative errno from the pin_map attempt.
 */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}
1046 
/*
 * Finish pinning: cache the register-state pointer, initialise the
 * image on first pin only (CONTEXT_INIT_BIT guards re-entry) and
 * refresh the ring registers/descriptor.  Always returns 0.
 */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}
1060 
/* On unpin, verify nothing scribbled past the context image (debug). */
void lrc_unpin(struct intel_context *ce)
{
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}
1066 
/* Drop the kernel mapping created by lrc_pre_pin(). */
void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}
1071 
/* Release the ring and state vma; safe to call when never allocated. */
void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	/* fetch_and_zero prevents any double-put on a later lrc_fini() */
	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}
1080 
/* Final kref release callback: tear down and free the context. */
void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}
1093 
/*
 * Emit the gen12 CTX_TIMESTAMP workaround: load the saved timestamp
 * from the context image into CS_GPR0, then copy it back into the
 * RING_CTX_TIMESTAMP register.  Returns the advanced command pointer.
 */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	/*
	 * NOTE(review): the register-to-register copy is emitted twice;
	 * presumably deliberate as part of the workaround — confirm
	 * against the workaround database before deduplicating.
	 */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}
1119 
/*
 * Restore CS_GPR0 (used as scratch by the preceding workarounds) from
 * its saved slot in the context image.  Returns the advanced pointer.
 */
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}
1135 
/*
 * Emit the gen12 CMD_BUF_CCTL workaround: reload the register's value
 * from the context image via CS_GPR0.  Returns the advanced pointer.
 */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}
1157 
/* Build the per-context indirect wa batch for render engines. */
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}
1167 
/* Build the per-context indirect wa batch for non-render engines. */
static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}
1176 
/* Byte offset of the per-context wa batch page within the state object. */
static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}
1181 
/* CPU pointer to the per-context wa batch page of the mapped image. */
static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}
1194 
/*
 * Emit the per-context indirect batch via @emit into the wa_bb page,
 * pad it to a cacheline with MI_NOOPs (the hardware size field is in
 * cachelines) and point the context image at it.
 */
static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}
1213 
1214 /*
1215  * The context descriptor encodes various attributes of a context,
1216  * including its GTT address and some flags. Because it's fairly
1217  * expensive to calculate, we'll just do it once and cache the result,
1218  * which remains valid until the context is unpinned.
1219  *
1220  * This is what a descriptor looks like, from LSB to MSB::
1221  *
1222  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1223  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1224  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1225  *      bits 53-54:    mbz, reserved for use by hardware
1226  *      bits 55-63:    group ID, currently unused and set to 0
1227  *
1228  * Starting from Gen11, the upper dword of the descriptor has a new format:
1229  *
1230  *      bits 32-36:    reserved
1231  *      bits 37-47:    SW context ID
1232  *      bits 48-53:    engine instance
1233  *      bit 54:        mbz, reserved for use by hardware
1234  *      bits 55-60:    SW counter
1235  *      bits 61-63:    engine class
1236  *
1237  * On Xe_HP, the upper dword of the descriptor has a new format:
1238  *
1239  *      bits 32-37:    virtual function number
1240  *      bit 38:        mbz, reserved for use by hardware
1241  *      bits 39-54:    SW context ID
1242  *      bits 55-57:    reserved
1243  *      bits 58-63:    SW counter
1244  *
1245  * engine info, SW context ID and SW counter need to form a unique number
1246  * (Context ID) per lrc.
1247  */
/*
 * Build the lower dword of the context descriptor (flags | LRCA); see
 * the layout comment above.  The GGTT address of the state object fills
 * bits 12-31, the low bits carry the GEN8_CTX_* flags.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	/* L3LLC coherence is only advertised this way on gen8 */
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}
1263 
/*
 * Refresh the mutable parts of the context image (ring registers, RPCS,
 * OA state, per-context wa batch) and return the context descriptor
 * with FORCE_RESTORE set so the hardware reloads the updated image.
 */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}
1301 
/* Rewrite the register offsets in @ce's image for (a new) @engine. */
void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}
1307 
/*
 * Sanity-check (and repair) the critical ring registers in @ce's image
 * before/after submission; @when labels the call site in the warning.
 * Each mismatch is reported, fixed in place and flagged via WARN_ONCE.
 */
void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	/* WAIT bits are hardware status; mask them out of the comparison */
	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	/* STOP_RING is only pending if both the mask and value bits are set */
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}
1347 
1348 /*
1349  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1350  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1351  * but there is a slight complication as this is applied in WA batch where the
1352  * values are only initialized once so we cannot take register value at the
1353  * beginning and reuse it further; hence we save its value to memory, upload a
1354  * constant value with bit21 set and then we restore it back with the saved value.
1355  * To simplify the WA, a constant value is formed by using the default value
1356  * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
1358  * use additional instructions that read the actual value of the register
1359  * at that time and set our bit of interest but it makes the WA complicated.
1360  *
1361  * This WA is also required for Gen9 so extracting as a function avoids
1362  * code duplication.
1363  */
/*
 * Emit the WaFlushCoherentL3CacheLinesAtContextSwitch sequence: save
 * GEN8_L3SQCREG4 to scratch memory, load the register with a constant
 * that has the flush-coherent-lines bit set, flush with a stalling
 * PIPE_CONTROL, then restore the saved register value.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	/* Save the current GEN8_L3SQCREG4 value into the GT scratch page */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	/* Load the register default with bit21 (coherent-lines flush) set */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	/* Stall the CS and flush so the coherent-line flush completes */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	/* Restore the original GEN8_L3SQCREG4 value from scratch */
	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}
1391 
1392 /*
1393  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1394  * initialized at the beginning and shared across all contexts but this field
1395  * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the page
1397  * and at this point we don't have multiple wa_ctx batch buffers.
1398  *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
1401  *
1402  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1403  * so it adds NOOPs as padding to make it cacheline aligned.
1404  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1405  * makes a complete batch buffer.
1406  */
/*
 * Build the gen8 (bdw/chv) indirect-context workaround batch: with
 * arbitration disabled, flush coherent L3 lines (bdw only) and clear SLM
 * space via PIPE_CONTROL, then re-enable arbitration and pad the batch to
 * a cacheline boundary with MI_NOOPs.  Returns the advanced batch pointer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}
1439 
/* A single register/value pair, emitted via MI_LOAD_REGISTER_IMM (emit_lri) */
struct lri {
	i915_reg_t reg;
	u32 value;
};
1444 
1445 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1446 {
1447 	GEM_BUG_ON(!count || count > 63);
1448 
1449 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1450 	do {
1451 		*batch++ = i915_mmio_reg_offset(lri->reg);
1452 		*batch++ = lri->value;
1453 	} while (lri++, --count);
1454 	*batch++ = MI_NOOP;
1455 
1456 	return batch;
1457 }
1458 
/*
 * Build the gen9 indirect-context workaround batch: with arbitration
 * disabled, flush coherent L3 lines, clear SLM space, load the chicken
 * register workarounds via a single LRI and, on pooled-EU parts, emit
 * MEDIA_POOL_STATE with the default 3x6 pool configuration.  The batch is
 * padded to a cacheline with MI_NOOPs; no MI_BATCH_BUFFER_END is needed.
 */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}
1530 
1531 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1532 
1533 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1534 {
1535 	struct drm_i915_gem_object *obj;
1536 	struct i915_vma *vma;
1537 	int err;
1538 
1539 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1540 	if (IS_ERR(obj))
1541 		return PTR_ERR(obj);
1542 
1543 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1544 	if (IS_ERR(vma)) {
1545 		err = PTR_ERR(vma);
1546 		goto err;
1547 	}
1548 
1549 	engine->wa_ctx.vma = vma;
1550 	return 0;
1551 
1552 err:
1553 	i915_gem_object_put(obj);
1554 	return err;
1555 }
1556 
/* Unpin and release the workaround-context vma created by lrc_init_wa_ctx() */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}
1561 
/* Emitter for one workaround batch; returns the advanced batch pointer */
typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

/*
 * Allocate, pin and populate the per-engine context workaround batch
 * buffers (indirect_ctx and per_ctx) for render engines on gen8/gen9.
 * Failure is non-fatal: the error is logged and wa_ctx is cleared so the
 * engine simply runs without the workaround batches.
 */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	/* Only the render engine carries these workaround batches */
	if (engine->class != RENDER_CLASS)
		return;

	/* Select the per-gen emitters; gen11+ needs no wa batch here */
	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	/* ww transaction: lock the object, pin it in the GGTT and map it */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		/* Each batch must start on a fresh cacheline */
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	/* -EDEADLK from the ww transaction: back off and retry the locking */
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}
1670 
/*
 * Record a runtime underflow (negative delta) for the selftests; compiles
 * to a no-op when CONFIG_DRM_I915_SELFTEST is disabled.
 */
static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}
1678 
1679 void lrc_update_runtime(struct intel_context *ce)
1680 {
1681 	u32 old;
1682 	s32 dt;
1683 
1684 	if (intel_context_is_barrier(ce))
1685 		return;
1686 
1687 	old = ce->runtime.last;
1688 	ce->runtime.last = lrc_get_runtime(ce);
1689 	dt = ce->runtime.last - old;
1690 
1691 	if (unlikely(dt < 0)) {
1692 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1693 			 old, ce->runtime.last, dt);
1694 		st_update_runtime_underflow(ce, dt);
1695 		return;
1696 	}
1697 
1698 	ewma_runtime_add(&ce->runtime.avg, dt);
1699 	ce->runtime.total += dt;
1700 }
1701 
1702 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1703 #include "selftest_lrc.c"
1704 #endif
1705