xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision dd21bfa4)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
/*
 * set_offsets() - expand a compressed register-offset table into a
 * context image.
 *
 * @regs:   dword array of the context register state to fill in
 * @data:   byte-encoded table built from the NOP/LRI/REG/REG16 macros
 *          below; a zero byte (END) terminates the table
 * @engine: engine whose mmio_base is added to every register offset
 * @close:  if true, append MI_BATCH_BUFFER_END after the last entry
 *          (used mainly by live_lrc_layout())
 *
 * Each table entry either skips dwords (NOP) or emits an
 * MI_LOAD_REGISTER_IMM header followed by the absolute register offsets
 * for that LRI block. Only the offset dwords are written here; the value
 * dwords are left untouched for the hardware/caller to populate.
 */
19 static void set_offsets(u32 *regs,
20 			const u8 *data,
21 			const struct intel_engine_cs *engine,
22 			bool close)
/* Skip @x dwords of the image (encoded as a single byte with bit7 set). */
23 #define NOP(x) (BIT(7) | (x))
/* LRI header: register count in bits 0-5, flags (POSTED) in bits 6-7. */
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
/* One-byte encoding of a dword-aligned register offset below 0x200. */
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
/* Two-byte varint encoding for offsets up to 0x10000 (7 bits per byte). */
27 #define REG16(x) \
28 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29 	(((x) >> 2) & 0x7f)
30 #define END 0
31 {
32 	const u32 base = engine->mmio_base;
33 
34 	while (*data) {
35 		u8 count, flags;
36 
37 		if (*data & BIT(7)) { /* skip */
38 			count = *data++ & ~BIT(7);
39 			regs += count;
40 			continue;
41 		}
42 
		/* LRI header byte: count in bits 0-5, flags in bits 6-7 */
43 		count = *data & 0x3f;
44 		flags = *data >> 6;
45 		data++;
46 
47 		*regs = MI_LOAD_REGISTER_IMM(count);
48 		if (flags & POSTED)
49 			*regs |= MI_LRI_FORCE_POSTED;
50 		if (GRAPHICS_VER(engine->i915) >= 11)
51 			*regs |= MI_LRI_LRM_CS_MMIO;
52 		regs++;
53 
54 		GEM_BUG_ON(!count);
55 		do {
56 			u32 offset = 0;
57 			u8 v;
58 
			/* offsets are varint encoded, 7 bits per byte, bit7 = continue */
59 			do {
60 				v = *data++;
61 				offset <<= 7;
62 				offset |= v & ~BIT(7);
63 			} while (v & BIT(7));
64 
			/* regs[1] (the value dword) is deliberately left untouched */
65 			regs[0] = base + (offset << 2);
66 			regs += 2;
67 		} while (--count);
68 	}
69 
70 	if (close) {
71 		/* Close the batch; used mainly by live_lrc_layout() */
72 		*regs = MI_BATCH_BUFFER_END;
		/* NOTE(review): bit0 appears to be the gen11+ BB_END qualifier — confirm against bspec */
73 		if (GRAPHICS_VER(engine->i915) >= 11)
74 			*regs |= BIT(0);
75 	}
76 }
77 
/* Context-image register layout for gen8 non-render (xcs) engines. */
78 static const u8 gen8_xcs_offsets[] = {
79 	NOP(1),
80 	LRI(11, 0),
81 	REG16(0x244),
82 	REG(0x034),
83 	REG(0x030),
84 	REG(0x038),
85 	REG(0x03c),
86 	REG(0x168),
87 	REG(0x140),
88 	REG(0x110),
89 	REG(0x11c),
90 	REG(0x114),
91 	REG(0x118),
92 
93 	NOP(9),
94 	LRI(9, 0),
95 	REG16(0x3a8),
96 	REG16(0x28c),
97 	REG16(0x288),
98 	REG16(0x284),
99 	REG16(0x280),
100 	REG16(0x27c),
101 	REG16(0x278),
102 	REG16(0x274),
103 	REG16(0x270),
104 
105 	NOP(13),
106 	LRI(2, 0),
107 	REG16(0x200),
108 	REG(0x028),
109 
110 	END
111 };
112 
/* Context-image register layout for gen9 non-render (xcs) engines. */
113 static const u8 gen9_xcs_offsets[] = {
114 	NOP(1),
115 	LRI(14, POSTED),
116 	REG16(0x244),
117 	REG(0x034),
118 	REG(0x030),
119 	REG(0x038),
120 	REG(0x03c),
121 	REG(0x168),
122 	REG(0x140),
123 	REG(0x110),
124 	REG(0x11c),
125 	REG(0x114),
126 	REG(0x118),
127 	REG(0x1c0),
128 	REG(0x1c4),
129 	REG(0x1c8),
130 
131 	NOP(3),
132 	LRI(9, POSTED),
133 	REG16(0x3a8),
134 	REG16(0x28c),
135 	REG16(0x288),
136 	REG16(0x284),
137 	REG16(0x280),
138 	REG16(0x27c),
139 	REG16(0x278),
140 	REG16(0x274),
141 	REG16(0x270),
142 
143 	NOP(13),
144 	LRI(1, POSTED),
145 	REG16(0x200),
146 
147 	NOP(13),
148 	LRI(44, POSTED),
149 	REG(0x028),
150 	REG(0x09c),
151 	REG(0x0c0),
152 	REG(0x178),
153 	REG(0x17c),
154 	REG16(0x358),
155 	REG(0x170),
156 	REG(0x150),
157 	REG(0x154),
158 	REG(0x158),
159 	REG16(0x41c),
160 	REG16(0x600),
161 	REG16(0x604),
162 	REG16(0x608),
163 	REG16(0x60c),
164 	REG16(0x610),
165 	REG16(0x614),
166 	REG16(0x618),
167 	REG16(0x61c),
168 	REG16(0x620),
169 	REG16(0x624),
170 	REG16(0x628),
171 	REG16(0x62c),
172 	REG16(0x630),
173 	REG16(0x634),
174 	REG16(0x638),
175 	REG16(0x63c),
176 	REG16(0x640),
177 	REG16(0x644),
178 	REG16(0x648),
179 	REG16(0x64c),
180 	REG16(0x650),
181 	REG16(0x654),
182 	REG16(0x658),
183 	REG16(0x65c),
184 	REG16(0x660),
185 	REG16(0x664),
186 	REG16(0x668),
187 	REG16(0x66c),
188 	REG16(0x670),
189 	REG16(0x674),
190 	REG16(0x678),
191 	REG16(0x67c),
192 	REG(0x068),
193 
194 	END
195 };
196 
/* Context-image register layout for gen12 non-render (xcs) engines. */
197 static const u8 gen12_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(13, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 
214 	NOP(5),
215 	LRI(9, POSTED),
216 	REG16(0x3a8),
217 	REG16(0x28c),
218 	REG16(0x288),
219 	REG16(0x284),
220 	REG16(0x280),
221 	REG16(0x27c),
222 	REG16(0x278),
223 	REG16(0x274),
224 	REG16(0x270),
225 
226 	END
227 };
228 
/* Context-image register layout for DG2 non-render (xcs) engines. */
229 static const u8 dg2_xcs_offsets[] = {
230 	NOP(1),
231 	LRI(15, POSTED),
232 	REG16(0x244),
233 	REG(0x034),
234 	REG(0x030),
235 	REG(0x038),
236 	REG(0x03c),
237 	REG(0x168),
238 	REG(0x140),
239 	REG(0x110),
240 	REG(0x1c0),
241 	REG(0x1c4),
242 	REG(0x1c8),
243 	REG(0x180),
244 	REG16(0x2b4),
245 	REG(0x120),
246 	REG(0x124),
247 
248 	NOP(1),
249 	LRI(9, POSTED),
250 	REG16(0x3a8),
251 	REG16(0x28c),
252 	REG16(0x288),
253 	REG16(0x284),
254 	REG16(0x280),
255 	REG16(0x27c),
256 	REG16(0x278),
257 	REG16(0x274),
258 	REG16(0x270),
259 
260 	END
261 };
262 
/* Context-image register layout for the gen8 render (rcs) engine. */
263 static const u8 gen8_rcs_offsets[] = {
264 	NOP(1),
265 	LRI(14, POSTED),
266 	REG16(0x244),
267 	REG(0x034),
268 	REG(0x030),
269 	REG(0x038),
270 	REG(0x03c),
271 	REG(0x168),
272 	REG(0x140),
273 	REG(0x110),
274 	REG(0x11c),
275 	REG(0x114),
276 	REG(0x118),
277 	REG(0x1c0),
278 	REG(0x1c4),
279 	REG(0x1c8),
280 
281 	NOP(3),
282 	LRI(9, POSTED),
283 	REG16(0x3a8),
284 	REG16(0x28c),
285 	REG16(0x288),
286 	REG16(0x284),
287 	REG16(0x280),
288 	REG16(0x27c),
289 	REG16(0x278),
290 	REG16(0x274),
291 	REG16(0x270),
292 
293 	NOP(13),
294 	LRI(1, 0),
295 	REG(0x0c8),
296 
297 	END
298 };
299 
/* Context-image register layout for the gen9 render (rcs) engine. */
300 static const u8 gen9_rcs_offsets[] = {
301 	NOP(1),
302 	LRI(14, POSTED),
303 	REG16(0x244),
304 	REG(0x34),
305 	REG(0x30),
306 	REG(0x38),
307 	REG(0x3c),
308 	REG(0x168),
309 	REG(0x140),
310 	REG(0x110),
311 	REG(0x11c),
312 	REG(0x114),
313 	REG(0x118),
314 	REG(0x1c0),
315 	REG(0x1c4),
316 	REG(0x1c8),
317 
318 	NOP(3),
319 	LRI(9, POSTED),
320 	REG16(0x3a8),
321 	REG16(0x28c),
322 	REG16(0x288),
323 	REG16(0x284),
324 	REG16(0x280),
325 	REG16(0x27c),
326 	REG16(0x278),
327 	REG16(0x274),
328 	REG16(0x270),
329 
330 	NOP(13),
331 	LRI(1, 0),
332 	REG(0xc8),
333 
334 	NOP(13),
335 	LRI(44, POSTED),
336 	REG(0x28),
337 	REG(0x9c),
338 	REG(0xc0),
339 	REG(0x178),
340 	REG(0x17c),
341 	REG16(0x358),
342 	REG(0x170),
343 	REG(0x150),
344 	REG(0x154),
345 	REG(0x158),
346 	REG16(0x41c),
347 	REG16(0x600),
348 	REG16(0x604),
349 	REG16(0x608),
350 	REG16(0x60c),
351 	REG16(0x610),
352 	REG16(0x614),
353 	REG16(0x618),
354 	REG16(0x61c),
355 	REG16(0x620),
356 	REG16(0x624),
357 	REG16(0x628),
358 	REG16(0x62c),
359 	REG16(0x630),
360 	REG16(0x634),
361 	REG16(0x638),
362 	REG16(0x63c),
363 	REG16(0x640),
364 	REG16(0x644),
365 	REG16(0x648),
366 	REG16(0x64c),
367 	REG16(0x650),
368 	REG16(0x654),
369 	REG16(0x658),
370 	REG16(0x65c),
371 	REG16(0x660),
372 	REG16(0x664),
373 	REG16(0x668),
374 	REG16(0x66c),
375 	REG16(0x670),
376 	REG16(0x674),
377 	REG16(0x678),
378 	REG16(0x67c),
379 	REG(0x68),
380 
381 	END
382 };
383 
/* Context-image register layout for the gen11 render (rcs) engine. */
384 static const u8 gen11_rcs_offsets[] = {
385 	NOP(1),
386 	LRI(15, POSTED),
387 	REG16(0x244),
388 	REG(0x034),
389 	REG(0x030),
390 	REG(0x038),
391 	REG(0x03c),
392 	REG(0x168),
393 	REG(0x140),
394 	REG(0x110),
395 	REG(0x11c),
396 	REG(0x114),
397 	REG(0x118),
398 	REG(0x1c0),
399 	REG(0x1c4),
400 	REG(0x1c8),
401 	REG(0x180),
402 
403 	NOP(1),
404 	LRI(9, POSTED),
405 	REG16(0x3a8),
406 	REG16(0x28c),
407 	REG16(0x288),
408 	REG16(0x284),
409 	REG16(0x280),
410 	REG16(0x27c),
411 	REG16(0x278),
412 	REG16(0x274),
413 	REG16(0x270),
414 
415 	LRI(1, POSTED),
416 	REG(0x1b0),
417 
418 	NOP(10),
419 	LRI(1, 0),
420 	REG(0x0c8),
421 
422 	END
423 };
424 
/* Context-image register layout for the gen12 render (rcs) engine. */
425 static const u8 gen12_rcs_offsets[] = {
426 	NOP(1),
427 	LRI(13, POSTED),
428 	REG16(0x244),
429 	REG(0x034),
430 	REG(0x030),
431 	REG(0x038),
432 	REG(0x03c),
433 	REG(0x168),
434 	REG(0x140),
435 	REG(0x110),
436 	REG(0x1c0),
437 	REG(0x1c4),
438 	REG(0x1c8),
439 	REG(0x180),
440 	REG16(0x2b4),
441 
442 	NOP(5),
443 	LRI(9, POSTED),
444 	REG16(0x3a8),
445 	REG16(0x28c),
446 	REG16(0x288),
447 	REG16(0x284),
448 	REG16(0x280),
449 	REG16(0x27c),
450 	REG16(0x278),
451 	REG16(0x274),
452 	REG16(0x270),
453 
454 	LRI(3, POSTED),
455 	REG(0x1b0),
456 	REG16(0x5a8),
457 	REG16(0x5ac),
458 
459 	NOP(6),
460 	LRI(1, 0),
461 	REG(0x0c8),
462 	NOP(3 + 9 + 1),
463 
464 	LRI(51, POSTED),
	/* NOTE(review): 0x588 repeated six times — matches the HW image layout; confirm against bspec */
465 	REG16(0x588),
466 	REG16(0x588),
467 	REG16(0x588),
468 	REG16(0x588),
469 	REG16(0x588),
470 	REG16(0x588),
471 	REG(0x028),
472 	REG(0x09c),
473 	REG(0x0c0),
474 	REG(0x178),
475 	REG(0x17c),
476 	REG16(0x358),
477 	REG(0x170),
478 	REG(0x150),
479 	REG(0x154),
480 	REG(0x158),
481 	REG16(0x41c),
482 	REG16(0x600),
483 	REG16(0x604),
484 	REG16(0x608),
485 	REG16(0x60c),
486 	REG16(0x610),
487 	REG16(0x614),
488 	REG16(0x618),
489 	REG16(0x61c),
490 	REG16(0x620),
491 	REG16(0x624),
492 	REG16(0x628),
493 	REG16(0x62c),
494 	REG16(0x630),
495 	REG16(0x634),
496 	REG16(0x638),
497 	REG16(0x63c),
498 	REG16(0x640),
499 	REG16(0x644),
500 	REG16(0x648),
501 	REG16(0x64c),
502 	REG16(0x650),
503 	REG16(0x654),
504 	REG16(0x658),
505 	REG16(0x65c),
506 	REG16(0x660),
507 	REG16(0x664),
508 	REG16(0x668),
509 	REG16(0x66c),
510 	REG16(0x670),
511 	REG16(0x674),
512 	REG16(0x678),
513 	REG16(0x67c),
514 	REG(0x068),
515 	REG(0x084),
516 	NOP(1),
517 
518 	END
519 };
520 
/* Context-image register layout for the Xe_HP render (rcs) engine. */
521 static const u8 xehp_rcs_offsets[] = {
522 	NOP(1),
523 	LRI(13, POSTED),
524 	REG16(0x244),
525 	REG(0x034),
526 	REG(0x030),
527 	REG(0x038),
528 	REG(0x03c),
529 	REG(0x168),
530 	REG(0x140),
531 	REG(0x110),
532 	REG(0x1c0),
533 	REG(0x1c4),
534 	REG(0x1c8),
535 	REG(0x180),
536 	REG16(0x2b4),
537 
538 	NOP(5),
539 	LRI(9, POSTED),
540 	REG16(0x3a8),
541 	REG16(0x28c),
542 	REG16(0x288),
543 	REG16(0x284),
544 	REG16(0x280),
545 	REG16(0x27c),
546 	REG16(0x278),
547 	REG16(0x274),
548 	REG16(0x270),
549 
550 	LRI(3, POSTED),
551 	REG(0x1b0),
552 	REG16(0x5a8),
553 	REG16(0x5ac),
554 
555 	NOP(6),
556 	LRI(1, 0),
557 	REG(0x0c8),
558 
559 	END
560 };
561 
/* Context-image register layout for the DG2 render (rcs) engine. */
562 static const u8 dg2_rcs_offsets[] = {
563 	NOP(1),
564 	LRI(15, POSTED),
565 	REG16(0x244),
566 	REG(0x034),
567 	REG(0x030),
568 	REG(0x038),
569 	REG(0x03c),
570 	REG(0x168),
571 	REG(0x140),
572 	REG(0x110),
573 	REG(0x1c0),
574 	REG(0x1c4),
575 	REG(0x1c8),
576 	REG(0x180),
577 	REG16(0x2b4),
578 	REG(0x120),
579 	REG(0x124),
580 
581 	NOP(1),
582 	LRI(9, POSTED),
583 	REG16(0x3a8),
584 	REG16(0x28c),
585 	REG16(0x288),
586 	REG16(0x284),
587 	REG16(0x280),
588 	REG16(0x27c),
589 	REG16(0x278),
590 	REG16(0x274),
591 	REG16(0x270),
592 
593 	LRI(3, POSTED),
594 	REG(0x1b0),
595 	REG16(0x5a8),
596 	REG16(0x5ac),
597 
598 	NOP(6),
599 	LRI(1, 0),
600 	REG(0x0c8),
601 
602 	END
603 };
604 
605 #undef END
606 #undef REG16
607 #undef REG
608 #undef LRI
609 #undef NOP
610 
/*
 * reg_offsets() - select the context-image layout table for @engine.
 *
 * Dispatches on engine class (render vs everything else) and then on
 * graphics IP version, newest first. Returns a pointer to the matching
 * static table above; never NULL.
 */
611 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
612 {
613 	/*
614 	 * The gen12+ lists only have the registers we program in the basic
615 	 * default state. We rely on the context image using relative
616 	 * addressing to automatic fixup the register state between the
617 	 * physical engines for virtual engine.
618 	 */
619 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
620 		   !intel_engine_has_relative_mmio(engine));
621 
622 	if (engine->class == RENDER_CLASS) {
623 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
624 			return dg2_rcs_offsets;
625 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
626 			return xehp_rcs_offsets;
627 		else if (GRAPHICS_VER(engine->i915) >= 12)
628 			return gen12_rcs_offsets;
629 		else if (GRAPHICS_VER(engine->i915) >= 11)
630 			return gen11_rcs_offsets;
631 		else if (GRAPHICS_VER(engine->i915) >= 9)
632 			return gen9_rcs_offsets;
633 		else
634 			return gen8_rcs_offsets;
635 	} else {
636 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
637 			return dg2_xcs_offsets;
638 		else if (GRAPHICS_VER(engine->i915) >= 12)
639 			return gen12_xcs_offsets;
640 		else if (GRAPHICS_VER(engine->i915) >= 9)
641 			return gen9_xcs_offsets;
642 		else
643 			return gen8_xcs_offsets;
644 	}
645 }
646 
/*
 * Dword index of the RING_MI_MODE slot in the context image, or -1 if
 * the platform's image has no such slot.
 */
647 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
648 {
649 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
650 		return 0x70;
651 	else if (GRAPHICS_VER(engine->i915) >= 12)
652 		return 0x60;
653 	else if (GRAPHICS_VER(engine->i915) >= 9)
654 		return 0x54;
655 	else if (engine->class == RENDER_CLASS)
656 		return 0x58;
657 	else
658 		return -1;
659 }
660 
/*
 * Dword index of the CS_GPR0 slot in the context image, or -1 if the
 * platform's image has no such slot.
 */
661 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
662 {
663 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
664 		return 0x84;
665 	else if (GRAPHICS_VER(engine->i915) >= 12)
666 		return 0x74;
667 	else if (GRAPHICS_VER(engine->i915) >= 9)
668 		return 0x68;
669 	else if (engine->class == RENDER_CLASS)
670 		return 0xd8;
671 	else
672 		return -1;
673 }
674 
/*
 * Dword index of the per-context workaround batch-buffer pointer slot,
 * or -1 if the platform's image has no such slot.
 */
675 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
676 {
677 	if (GRAPHICS_VER(engine->i915) >= 12)
678 		return 0x12;
679 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
680 		return 0x18;
681 	else
682 		return -1;
683 }
684 
/*
 * Dword index of the INDIRECT_CTX pointer slot: two dwords past the
 * per-context WA BB slot. Propagates -1 when the base slot is absent.
 */
685 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
686 {
687 	int x;
688 
689 	x = lrc_ring_wa_bb_per_ctx(engine);
690 	if (x < 0)
691 		return x;
692 
693 	return x + 2;
694 }
695 
/*
 * Dword index of the INDIRECT_CTX_OFFSET slot: two dwords past the
 * INDIRECT_CTX pointer slot. Propagates -1 when the base slot is absent.
 */
696 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
697 {
698 	int x;
699 
700 	x = lrc_ring_indirect_ptr(engine);
701 	if (x < 0)
702 		return x;
703 
704 	return x + 2;
705 }
706 
/*
 * Dword index of the CMD_BUF_CCTL slot in the context image, or -1 if
 * the platform/engine combination has no such slot.
 */
707 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
708 {
709 
710 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
711 		/*
712 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
713 		 * simply to match the RCS context image layout.
714 		 */
715 		return 0xc6;
716 	else if (engine->class != RENDER_CLASS)
717 		return -1;
718 	else if (GRAPHICS_VER(engine->i915) >= 12)
719 		return 0xb6;
720 	else if (GRAPHICS_VER(engine->i915) >= 11)
721 		return 0xaa;
722 	else
723 		return -1;
724 }
725 
/*
 * Per-generation default value for the INDIRECT_CTX_OFFSET field.
 * Unknown versions warn via MISSING_CASE and fall through to the
 * gen12 default.
 */
726 static u32
727 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
728 {
729 	switch (GRAPHICS_VER(engine->i915)) {
730 	default:
731 		MISSING_CASE(GRAPHICS_VER(engine->i915));
732 		fallthrough;
733 	case 12:
734 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
735 	case 11:
736 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
737 	case 9:
738 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
739 	case 8:
740 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
741 	}
742 }
743 
/*
 * lrc_setup_indirect_ctx() - point the context image at an indirect
 * context batch buffer.
 *
 * @regs:             context register state to patch
 * @engine:           engine the image belongs to (selects slot indices)
 * @ctx_bb_ggtt_addr: GGTT address of the batch; must leave the low bits
 *                    free for the size field
 * @size:             batch size in bytes; must be a non-zero multiple of
 *                    CACHELINE_BYTES (size is encoded in cachelines)
 */
744 static void
745 lrc_setup_indirect_ctx(u32 *regs,
746 		       const struct intel_engine_cs *engine,
747 		       u32 ctx_bb_ggtt_addr,
748 		       u32 size)
749 {
750 	GEM_BUG_ON(!size);
751 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
752 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	/* +1: the value dword follows the register-offset dword */
753 	regs[lrc_ring_indirect_ptr(engine) + 1] =
754 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
755 
756 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
757 	regs[lrc_ring_indirect_offset(engine) + 1] =
758 		lrc_ring_indirect_offset_default(engine) << 6;
759 }
760 
/*
 * init_common_regs() - program CTX_CONTEXT_CONTROL and CTX_TIMESTAMP.
 *
 * @inhibit selects whether the engine restores context state on the
 * first submission (restore-inhibit set when we have no valid default
 * state to restore from). The CTX_TIMESTAMP is seeded from the last
 * recorded runtime so busyness accounting continues across re-init.
 */
761 static void init_common_regs(u32 * const regs,
762 			     const struct intel_context *ce,
763 			     const struct intel_engine_cs *engine,
764 			     bool inhibit)
765 {
766 	u32 ctl;
767 
768 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
769 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
770 	if (inhibit)
771 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
772 	if (GRAPHICS_VER(engine->i915) < 11)
773 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
774 					   CTX_CTRL_RS_CTX_ENABLE);
775 	regs[CTX_CONTEXT_CONTROL] = ctl;
776 
777 	regs[CTX_TIMESTAMP] = ce->runtime.last;
778 }
779 
/*
 * init_wa_bb_regs() - wire the engine's workaround batch buffers into
 * the context image: the per-context WA BB pointer (bit0 = enable) and,
 * if present, the indirect context WA BB via lrc_setup_indirect_ctx().
 */
780 static void init_wa_bb_regs(u32 * const regs,
781 			    const struct intel_engine_cs *engine)
782 {
783 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
784 
785 	if (wa_ctx->per_ctx.size) {
786 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
787 
788 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* | 0x01 enables the per-context WA batch */
789 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
790 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
791 	}
792 
793 	if (wa_ctx->indirect_ctx.size) {
794 		lrc_setup_indirect_ctx(regs, engine,
795 				       i915_ggtt_offset(wa_ctx->vma) +
796 				       wa_ctx->indirect_ctx.offset,
797 				       wa_ctx->indirect_ctx.size);
798 	}
799 }
800 
/*
 * init_ppgtt_regs() - program the page-directory pointers into the
 * context image: a single PML4 pointer for 4-level PPGTT, otherwise all
 * four PDP descriptors for the 3-level (32b) layout.
 */
801 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
802 {
803 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
804 		/* 64b PPGTT (48bit canonical)
805 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
806 		 * other PDP Descriptors are ignored.
807 		 */
808 		ASSIGN_CTX_PML4(ppgtt, regs);
809 	} else {
810 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
811 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
812 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
813 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
814 	}
815 }
816 
/*
 * vm_alias() - resolve an address space to the PPGTT used for the
 * context: the GGTT's aliasing PPGTT when @vm is the GGTT, otherwise
 * the full PPGTT itself.
 */
817 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
818 {
819 	if (i915_is_ggtt(vm))
820 		return i915_vm_to_ggtt(vm)->alias;
821 	else
822 		return i915_vm_to_ppgtt(vm);
823 }
824 
/*
 * __reset_stop_ring() - clear STOP_RING in the image's RING_MI_MODE
 * slot (masked-write: value bit cleared, mask bit set so the write
 * takes effect). No-op on platforms without an MI_MODE slot.
 */
825 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
826 {
827 	int x;
828 
829 	x = lrc_ring_mi_mode(engine);
830 	if (x != -1) {
831 		regs[x + 1] &= ~STOP_RING;
832 		regs[x + 1] |= STOP_RING << 16;
833 	}
834 }
835 
/*
 * __lrc_init_regs() - populate the register-state page of a context
 * image: LRI headers + register offsets, context control, PPGTT
 * pointers, WA batch pointers, and a cleared STOP_RING. When @inhibit
 * the page is zeroed first since there is no default state to build on.
 */
836 static void __lrc_init_regs(u32 *regs,
837 			    const struct intel_context *ce,
838 			    const struct intel_engine_cs *engine,
839 			    bool inhibit)
840 {
841 	/*
842 	 * A context is actually a big batch buffer with several
843 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
844 	 * values we are setting here are only for the first context restore:
845 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
846 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
847 	 * we are not initializing here).
848 	 *
849 	 * Must keep consistent with virtual_update_register_offsets().
850 	 */
851 
852 	if (inhibit)
853 		memset(regs, 0, PAGE_SIZE);
854 
855 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
856 
857 	init_common_regs(regs, ce, engine, inhibit);
858 	init_ppgtt_regs(regs, vm_alias(ce->vm));
859 
860 	init_wa_bb_regs(regs, engine);
861 
862 	__reset_stop_ring(regs, engine);
863 }
864 
/* Public wrapper: (re)initialise @ce's mapped register state page. */
865 void lrc_init_regs(const struct intel_context *ce,
866 		   const struct intel_engine_cs *engine,
867 		   bool inhibit)
868 {
869 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
870 }
871 
/* Clear STOP_RING in @ce's image after a reset, leaving all else intact. */
872 void lrc_reset_regs(const struct intel_context *ce,
873 		    const struct intel_engine_cs *engine)
874 {
875 	__reset_stop_ring(ce->lrc_reg_state, engine);
876 }
877 
/*
 * set_redzone() - fill the debug-only guard page after the context
 * image with CONTEXT_REDZONE so overruns can be detected at unpin.
 * Compiled out unless CONFIG_DRM_I915_DEBUG_GEM.
 */
877 static void
878 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
879 {
880 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
881 		return;
882 
883 	vaddr += engine->context_size;
884 
885 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
886 }
888 
/*
 * check_redzone() - verify the guard page written by set_redzone() is
 * untouched; logs (once per engine) if the context overran its image.
 */
888 static void
889 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
890 {
891 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
892 		return;
893 
894 	vaddr += engine->context_size;
895 
896 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
897 		drm_err_once(&engine->i915->drm,
898 			     "%s context redzone overwritten!\n",
899 			     engine->name);
900 }
902 
/*
 * lrc_init_state() - initialise a freshly mapped context image.
 *
 * Copies the engine's golden default state when available (and marks
 * the context valid); otherwise the image will be built with context
 * restore inhibited. The ppHWSP page is always cleared afterwards, and
 * the register state page is (re)programmed.
 */
903 void lrc_init_state(struct intel_context *ce,
904 		    struct intel_engine_cs *engine,
905 		    void *state)
906 {
907 	bool inhibit = true;
908 
909 	set_redzone(state, engine);
910 
911 	if (engine->default_state) {
912 		shmem_read(engine->default_state, 0,
913 			   state, engine->context_size);
914 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
915 		inhibit = false;
916 	}
917 
918 	/* Clear the ppHWSP (inc. per-context counters) */
919 	memset(state, 0, PAGE_SIZE);
920 
921 	/*
922 	 * The second page of the context object contains some registers which
923 	 * must be set up prior to the first execution.
924 	 */
925 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
926 }
927 
/*
 * __lrc_alloc_state() - allocate the backing object + GGTT vma for a
 * context image.
 *
 * Sizes the object from the engine's context size, plus an optional
 * debug redzone page, a gen12 per-context WA BB page, and a GuC parent
 * scratch area for parallel contexts. Prefers local memory, falling
 * back to shmem. Returns the vma or an ERR_PTR; the object reference is
 * owned by the vma on success.
 */
928 static struct i915_vma *
929 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
930 {
931 	struct drm_i915_gem_object *obj;
932 	struct i915_vma *vma;
933 	u32 context_size;
934 
935 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
936 
937 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
938 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
939 
940 	if (GRAPHICS_VER(engine->i915) == 12) {
		/* record the page index so the WA BB can be located later */
941 		ce->wa_bb_page = context_size / PAGE_SIZE;
942 		context_size += PAGE_SIZE;
943 	}
944 
945 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
946 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
947 		context_size += PARENT_SCRATCH_SIZE;
948 	}
949 
	/* try device-local memory first, fall back to system shmem */
950 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
951 					  I915_BO_ALLOC_PM_VOLATILE);
952 	if (IS_ERR(obj))
953 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
954 	if (IS_ERR(obj))
955 		return ERR_CAST(obj);
956 
957 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
958 	if (IS_ERR(vma)) {
959 		i915_gem_object_put(obj);
960 		return vma;
961 	}
962 
963 	return vma;
964 }
965 
/*
 * pinned_timeline() - replace a pre-seeded (pinned) timeline pointer on
 * @ce with a real timeline created from the engine.
 * NOTE(review): ce->timeline here carries encoded bits unmasked by
 * page_unmask_bits() — presumably a pinned-HWSP offset; confirm against
 * the intel_timeline_create_from_engine() contract.
 */
966 static struct intel_timeline *
967 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
968 {
969 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
970 
971 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
972 }
973 
/*
 * lrc_alloc() - allocate everything a logical ring context needs:
 * state object/vma, ring, and (if not already provided) a timeline.
 *
 * Returns 0 on success or a negative errno; on failure all partially
 * acquired resources are released via the goto-cleanup chain. On
 * success ownership of the vma and ring transfers to @ce.
 */
974 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
975 {
976 	struct intel_ring *ring;
977 	struct i915_vma *vma;
978 	int err;
979 
980 	GEM_BUG_ON(ce->state);
981 
982 	vma = __lrc_alloc_state(ce, engine);
983 	if (IS_ERR(vma))
984 		return PTR_ERR(vma);
985 
986 	ring = intel_engine_create_ring(engine, ce->ring_size);
987 	if (IS_ERR(ring)) {
988 		err = PTR_ERR(ring);
989 		goto err_vma;
990 	}
991 
992 	if (!page_mask_bits(ce->timeline)) {
993 		struct intel_timeline *tl;
994 
995 		/*
996 		 * Use the static global HWSP for the kernel context, and
997 		 * a dynamically allocated cacheline for everyone else.
998 		 */
999 		if (unlikely(ce->timeline))
1000 			tl = pinned_timeline(ce, engine);
1001 		else
1002 			tl = intel_timeline_create(engine->gt);
1003 		if (IS_ERR(tl)) {
1004 			err = PTR_ERR(tl);
1005 			goto err_ring;
1006 		}
1007 
1008 		ce->timeline = tl;
1009 	}
1010 
1011 	ce->ring = ring;
1012 	ce->state = vma;
1013 
1014 	return 0;
1015 
1016 err_ring:
1017 	intel_ring_put(ring);
1018 err_vma:
1019 	i915_vma_put(vma);
1020 	return err;
1021 }
1022 
/*
 * lrc_reset() - scrub a pinned context after a GPU reset: rewind the
 * ring, rebuild the register state from scratch (inhibit=true), and
 * recompute the descriptor with a forced restore.
 */
1023 void lrc_reset(struct intel_context *ce)
1024 {
1025 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1026 
1027 	intel_ring_reset(ce->ring, ce->ring->emit);
1028 
1029 	/* Scrub away the garbage */
1030 	lrc_init_regs(ce, ce->engine, true);
1031 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1032 }
1033 
/*
 * lrc_pre_pin() - map the context state object and hand back the CPU
 * address via @vaddr. Returns 0 or the negative errno from the mapping.
 * The map type honours device coherency; I915_MAP_OVERRIDE permits
 * remapping with a different type than a previous pin.
 */
1034 int
1035 lrc_pre_pin(struct intel_context *ce,
1036 	    struct intel_engine_cs *engine,
1037 	    struct i915_gem_ww_ctx *ww,
1038 	    void **vaddr)
1039 {
1040 	GEM_BUG_ON(!ce->state);
1041 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1042 
1043 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1044 					 i915_coherent_map_type(ce->engine->i915,
1045 								ce->state->obj,
1046 								false) |
1047 					 I915_MAP_OVERRIDE);
1048 
1049 	return PTR_ERR_OR_ZERO(*vaddr);
1050 }
1051 
/*
 * lrc_pin() - finish pinning: cache the register-state pointer,
 * initialise the image exactly once per context (CONTEXT_INIT_BIT),
 * and refresh ring registers + descriptor. Always returns 0.
 */
1052 int
1053 lrc_pin(struct intel_context *ce,
1054 	struct intel_engine_cs *engine,
1055 	void *vaddr)
1056 {
1057 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1058 
1059 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1060 		lrc_init_state(ce, engine, vaddr);
1061 
1062 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1063 	return 0;
1064 }
1065 
/* On unpin, verify the debug redzone beyond the image is intact. */
1066 void lrc_unpin(struct intel_context *ce)
1067 {
1068 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1069 		      ce->engine);
1070 }
1071 
/* Drop the CPU mapping taken in lrc_pre_pin(). */
1072 void lrc_post_unpin(struct intel_context *ce)
1073 {
1074 	i915_gem_object_unpin_map(ce->state->obj);
1075 }
1076 
/*
 * lrc_fini() - release the ring and state vma owned by @ce; safe to
 * call on a context that never completed lrc_alloc() (ce->state NULL).
 */
1077 void lrc_fini(struct intel_context *ce)
1078 {
1079 	if (!ce->state)
1080 		return;
1081 
1082 	intel_ring_put(fetch_and_zero(&ce->ring));
1083 	i915_vma_put(fetch_and_zero(&ce->state));
1084 }
1085 
/*
 * lrc_destroy() - kref release callback: tear down LRC resources and
 * free the context. Must only run once the context is idle and unpinned
 * (asserted below).
 */
1086 void lrc_destroy(struct kref *kref)
1087 {
1088 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1089 
1090 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1091 	GEM_BUG_ON(intel_context_is_pinned(ce));
1092 
1093 	lrc_fini(ce);
1094 
1095 	intel_context_fini(ce);
1096 	intel_context_free(ce);
1097 }
1098 
/*
 * gen12_emit_timestamp_wa() - emit commands that reload CTX_TIMESTAMP:
 * load the saved value from the context image into CS_GPR0, then copy
 * GPR0 back into RING_CTX_TIMESTAMP.
 * NOTE(review): the LRR is emitted twice with identical registers —
 * appears deliberate as part of this workaround; confirm against the
 * relevant WA entry before "simplifying".
 */
1099 static u32 *
1100 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1101 {
1102 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1103 		MI_SRM_LRM_GLOBAL_GTT |
1104 		MI_LRI_LRM_CS_MMIO;
1105 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1106 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1107 		CTX_TIMESTAMP * sizeof(u32);
1108 	*cs++ = 0;
1109 
1110 	*cs++ = MI_LOAD_REGISTER_REG |
1111 		MI_LRR_SOURCE_CS_MMIO |
1112 		MI_LRI_LRM_CS_MMIO;
1113 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1114 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1115 
1116 	*cs++ = MI_LOAD_REGISTER_REG |
1117 		MI_LRR_SOURCE_CS_MMIO |
1118 		MI_LRI_LRM_CS_MMIO;
1119 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1120 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1121 
1122 	return cs;
1123 }
1124 
/*
 * gen12_emit_restore_scratch() - restore CS_GPR0 from its saved slot in
 * the context image (the scratch register clobbered by the WAs above).
 */
1125 static u32 *
1126 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1127 {
1128 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1129 
1130 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1131 		MI_SRM_LRM_GLOBAL_GTT |
1132 		MI_LRI_LRM_CS_MMIO;
1133 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	/* +1: the value dword follows the offset dword in the image */
1134 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1135 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1136 	*cs++ = 0;
1137 
1138 	return cs;
1139 }
1140 
/*
 * gen12_emit_cmd_buf_wa() - reload RING_CMD_BUF_CCTL from its saved
 * value in the context image, staged through CS_GPR0.
 */
1141 static u32 *
1142 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1143 {
1144 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1145 
1146 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1147 		MI_SRM_LRM_GLOBAL_GTT |
1148 		MI_LRI_LRM_CS_MMIO;
1149 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1150 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1151 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1152 	*cs++ = 0;
1153 
1154 	*cs++ = MI_LOAD_REGISTER_REG |
1155 		MI_LRR_SOURCE_CS_MMIO |
1156 		MI_LRI_LRM_CS_MMIO;
1157 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1158 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1159 
1160 	return cs;
1161 }
1162 
/*
 * gen12_emit_indirect_ctx_rcs() - build the indirect-context WA batch
 * for render engines: timestamp + CMD_BUF_CCTL workarounds, scratch
 * restore, plus a DG2-specific pipe-control flush (Wa_16013000631).
 */
1163 static u32 *
1164 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1165 {
1166 	cs = gen12_emit_timestamp_wa(ce, cs);
1167 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1168 	cs = gen12_emit_restore_scratch(ce, cs);
1169 
1170 	/* Wa_16013000631:dg2 */
1171 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1172 	    IS_DG2_G11(ce->engine->i915))
1173 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1174 
1175 	return cs;
1176 }
1177 
/*
 * gen12_emit_indirect_ctx_xcs() - non-render variant of the indirect
 * context WA batch: timestamp workaround + scratch restore only.
 */
1178 static u32 *
1179 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1180 {
1181 	cs = gen12_emit_timestamp_wa(ce, cs);
1182 	cs = gen12_emit_restore_scratch(ce, cs);
1183 
1184 	return cs;
1185 }
1186 
/* Byte offset of the per-context WA BB page within the state object. */
1187 static u32 context_wa_bb_offset(const struct intel_context *ce)
1188 {
1189 	return PAGE_SIZE * ce->wa_bb_page;
1190 }
1191 
/*
 * CPU pointer to the per-context WA BB page, derived from the mapped
 * register state pointer (which sits LRC_STATE_OFFSET into the image).
 */
1192 static u32 *context_indirect_bb(const struct intel_context *ce)
1193 {
1194 	void *ptr;
1195 
1196 	GEM_BUG_ON(!ce->wa_bb_page);
1197 
1198 	ptr = ce->lrc_reg_state;
1199 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1200 	ptr += context_wa_bb_offset(ce);
1201 
1202 	return ptr;
1203 }
1204 
/*
 * setup_indirect_ctx_bb() - write the per-context indirect WA batch via
 * the supplied @emit callback, pad it with MI_NOOPs to a cacheline
 * boundary (the size field is in cachelines), and patch the context
 * image to point at it.
 */
1205 static void
1206 setup_indirect_ctx_bb(const struct intel_context *ce,
1207 		      const struct intel_engine_cs *engine,
1208 		      u32 *(*emit)(const struct intel_context *, u32 *))
1209 {
1210 	u32 * const start = context_indirect_bb(ce);
1211 	u32 *cs;
1212 
1213 	cs = emit(ce, start);
1214 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1215 	while ((unsigned long)cs % CACHELINE_BYTES)
1216 		*cs++ = MI_NOOP;
1217 
1218 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1219 			       i915_ggtt_offset(ce->state) +
1220 			       context_wa_bb_offset(ce),
1221 			       (cs - start) * sizeof(*cs));
1222 }
1223 
1224 /*
1225  * The context descriptor encodes various attributes of a context,
1226  * including its GTT address and some flags. Because it's fairly
1227  * expensive to calculate, we'll just do it once and cache the result,
1228  * which remains valid until the context is unpinned.
1229  *
1230  * This is what a descriptor looks like, from LSB to MSB::
1231  *
1232  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1233  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1234  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1235  *      bits 53-54:    mbz, reserved for use by hardware
1236  *      bits 55-63:    group ID, currently unused and set to 0
1237  *
1238  * Starting from Gen11, the upper dword of the descriptor has a new format:
1239  *
1240  *      bits 32-36:    reserved
1241  *      bits 37-47:    SW context ID
1242  *      bits 48:53:    engine instance
1243  *      bit 54:        mbz, reserved for use by hardware
1244  *      bits 55-60:    SW counter
1245  *      bits 61-63:    engine class
1246  *
1247  * On Xe_HP, the upper dword of the descriptor has a new format:
1248  *
1249  *      bits 32-37:    virtual function number
1250  *      bit 38:        mbz, reserved for use by hardware
1251  *      bits 39-54:    SW context ID
1252  *      bits 55-57:    reserved
1253  *      bits 58-63:    SW counter
1254  *
1255  * engine info, SW context ID and SW counter need to form a unique number
1256  * (Context ID) per lrc.
1257  */
/*
 * lrc_descriptor() - build the low dword of the context descriptor:
 * addressing-mode flags, validity/privilege bits, the gen8-only L3LLC
 * coherency bit, and the GGTT address of the state object. See the
 * format comment above for the full layout.
 */
1258 static u32 lrc_descriptor(const struct intel_context *ce)
1259 {
1260 	u32 desc;
1261 
1262 	desc = INTEL_LEGACY_32B_CONTEXT;
1263 	if (i915_vm_is_4lvl(ce->vm))
1264 		desc = INTEL_LEGACY_64B_CONTEXT;
1265 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1266 
1267 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1268 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1269 		desc |= GEN8_CTX_L3LLC_COHERENT;
1270 
1271 	return i915_ggtt_offset(ce->state) | desc;
1272 }
1273 
/*
 * lrc_update_regs() - refresh the dynamic portion of the register state
 * before submission: ring start/head/tail/control, render power-clock
 * state and OA registers for RCS, and the per-context indirect WA batch
 * when present. Returns the context descriptor with FORCE_RESTORE set.
 */
1274 u32 lrc_update_regs(const struct intel_context *ce,
1275 		    const struct intel_engine_cs *engine,
1276 		    u32 head)
1277 {
1278 	struct intel_ring *ring = ce->ring;
1279 	u32 *regs = ce->lrc_reg_state;
1280 
1281 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1282 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1283 
1284 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1285 	regs[CTX_RING_HEAD] = head;
1286 	regs[CTX_RING_TAIL] = ring->tail;
1287 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1288 
1289 	/* RPCS */
1290 	if (engine->class == RENDER_CLASS) {
1291 		regs[CTX_R_PWR_CLK_STATE] =
1292 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1293 
1294 		i915_oa_init_reg_state(ce, engine);
1295 	}
1296 
1297 	if (ce->wa_bb_page) {
1298 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1299 
1300 		fn = gen12_emit_indirect_ctx_xcs;
1301 		if (ce->engine->class == RENDER_CLASS)
1302 			fn = gen12_emit_indirect_ctx_rcs;
1303 
1304 		/* Mutually exclusive wrt to global indirect bb */
1305 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1306 		setup_indirect_ctx_bb(ce, engine, fn);
1307 	}
1308 
1309 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1310 }
1311 
/*
 * Rewrite the register offsets in @ce's image for @engine — used when a
 * virtual context migrates between physical engines (mmio_base differs).
 */
1312 void lrc_update_offsets(struct intel_context *ce,
1313 			struct intel_engine_cs *engine)
1314 {
1315 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1316 }
1317 
1318 void lrc_check_regs(const struct intel_context *ce,
1319 		    const struct intel_engine_cs *engine,
1320 		    const char *when)
1321 {
1322 	const struct intel_ring *ring = ce->ring;
1323 	u32 *regs = ce->lrc_reg_state;
1324 	bool valid = true;
1325 	int x;
1326 
1327 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1328 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1329 		       engine->name,
1330 		       regs[CTX_RING_START],
1331 		       i915_ggtt_offset(ring->vma));
1332 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1333 		valid = false;
1334 	}
1335 
1336 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1337 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1338 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1339 		       engine->name,
1340 		       regs[CTX_RING_CTL],
1341 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1342 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1343 		valid = false;
1344 	}
1345 
1346 	x = lrc_ring_mi_mode(engine);
1347 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1348 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1349 		       engine->name, regs[x + 1]);
1350 		regs[x + 1] &= ~STOP_RING;
1351 		regs[x + 1] |= STOP_RING << 16;
1352 		valid = false;
1353 	}
1354 
1355 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1356 }
1357 
1358 /*
1359  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1360  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1361  * but there is a slight complication as this is applied in WA batch where the
1362  * values are only initialized once so we cannot take register value at the
1363  * beginning and reuse it further; hence we save its value to memory, upload a
1364  * constant value with bit21 set and then we restore it back with the saved value.
1365  * To simplify the WA, a constant value is formed by using the default value
1366  * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
1368  * use additional instructions that read the actual value of the register
1369  * at that time and set our bit of interest but it makes the WA complicated.
1370  *
1371  * This WA is also required for Gen9 so extracting as a function avoids
1372  * code duplication.
1373  */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	/* Step 1: save the current L3SQCREG4 value to scratch memory. */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	/*
	 * Step 2: load a constant (the register's default value) with the
	 * flush-coherent-lines bit set.
	 */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	/* Step 3: stall and flush so the write takes effect. */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	/* Step 4: restore the saved L3SQCREG4 value from scratch. */
	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}
1401 
1402 /*
1403  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1404  * initialized at the beginning and shared across all contexts but this field
1405  * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the page
1407  * and at this point we don't have multiple wa_ctx batch buffers.
1408  *
1409  * The number of WA applied are not known at the beginning; we use this field
 * to return the number of DWORDS written.
1411  *
1412  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1413  * so it adds NOOPs as padding to make it cacheline aligned.
1414  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1415  * makes a complete batch buffer.
1416  */
/*
 * Emit the gen8 (bdw/chv) indirect-context workaround batch: disable
 * arbitration, apply the coherent-L3 flush (Broadwell only), clear SLM
 * space via a pipe control, then re-enable arbitration and pad to a
 * cacheline boundary. Returns the advanced batch pointer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}
1449 
/* A single register/value pair for emit_lri() to program via MI_LRI. */
struct lri {
	i915_reg_t reg;
	u32 value;
};
1454 
1455 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1456 {
1457 	GEM_BUG_ON(!count || count > 63);
1458 
1459 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1460 	do {
1461 		*batch++ = i915_mmio_reg_offset(lri->reg);
1462 		*batch++ = lri->value;
1463 	} while (lri++, --count);
1464 	*batch++ = MI_NOOP;
1465 
1466 	return batch;
1467 }
1468 
/*
 * Emit the gen9 indirect-context workaround batch: disable arbitration,
 * flush coherent L3 lines, clear SLM space, load the chicken-register
 * workarounds via a single LRI, optionally program the media EU pool
 * (bxt/glk), then re-enable arbitration and pad to a cacheline boundary.
 * Returns the advanced batch pointer.
 */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* Chicken-register workarounds applied as one MI_LRI below. */
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}
1540 
1541 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1542 
1543 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1544 {
1545 	struct drm_i915_gem_object *obj;
1546 	struct i915_vma *vma;
1547 	int err;
1548 
1549 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1550 	if (IS_ERR(obj))
1551 		return PTR_ERR(obj);
1552 
1553 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1554 	if (IS_ERR(vma)) {
1555 		err = PTR_ERR(vma);
1556 		goto err;
1557 	}
1558 
1559 	engine->wa_ctx.vma = vma;
1560 	return 0;
1561 
1562 err:
1563 	i915_gem_object_put(obj);
1564 	return err;
1565 }
1566 
/* Unpin and release the engine's workaround context batch buffer vma. */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}
1571 
1572 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1573 
/*
 * Set up the per-engine workaround context batch buffers (indirect_ctx
 * and per_ctx). Only the render engine on gen8/gen9 uses them here;
 * allocation failure is deliberately non-fatal and merely disables the
 * workarounds (wa_ctx is cleared so nothing else uses it).
 */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (engine->class != RENDER_CLASS)
		return;

	/* Pick the emitters for this generation; gen11+ needs none. */
	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	/* Lock, pin and map the buffer; back off and retry on ww deadlock. */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}
1680 
/*
 * Selftest-only bookkeeping: count a runtime underflow and track the
 * largest negative delta seen. Compiles to a no-op without
 * CONFIG_DRM_I915_SELFTEST.
 */
static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}
1688 
1689 void lrc_update_runtime(struct intel_context *ce)
1690 {
1691 	u32 old;
1692 	s32 dt;
1693 
1694 	if (intel_context_is_barrier(ce))
1695 		return;
1696 
1697 	old = ce->runtime.last;
1698 	ce->runtime.last = lrc_get_runtime(ce);
1699 	dt = ce->runtime.last - old;
1700 
1701 	if (unlikely(dt < 0)) {
1702 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1703 			 old, ce->runtime.last, dt);
1704 		st_update_runtime_underflow(ce, dt);
1705 		return;
1706 	}
1707 
1708 	ewma_runtime_add(&ce->runtime.avg, dt);
1709 	ce->runtime.total += dt;
1710 }
1711 
1712 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1713 #include "selftest_lrc.c"
1714 #endif
1715