xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision dd4821ba)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_engine_regs.h"
13 #include "intel_gpu_commands.h"
14 #include "intel_gt.h"
15 #include "intel_gt_regs.h"
16 #include "intel_lrc.h"
17 #include "intel_lrc_reg.h"
18 #include "intel_ring.h"
19 #include "shmem_utils.h"
20 
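/*
 * set_offsets() expands a compact per-engine table (built from the NOP, LRI,
 * REG and REG16 macros below) into the register offsets of the context image.
 * A rough sketch of the encoding, as decoded by the loop below:
 *
 *  - a byte with bit 7 set skips (low 7 bits) dwords of the image (NOP);
 *  - otherwise the byte starts an MI_LOAD_REGISTER_IMM block: the low 6 bits
 *    give the register count and the top bits carry flags such as POSTED;
 *  - each register offset then follows as one or more bytes holding 7 bits
 *    each, most significant group first, with bit 7 as a continuation flag
 *    (REG() emits a single byte, REG16() emits two).
 *
 * For example, REG16(0x244) is stored as the bytes 0x81 and 0x11, which
 * decode to (0x1 << 7 | 0x11) << 2 == 0x244, written out as base + 0x244.
 */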
21 static void set_offsets(u32 *regs,
22 			const u8 *data,
23 			const struct intel_engine_cs *engine,
24 			bool close)
25 #define NOP(x) (BIT(7) | (x))
26 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
27 #define POSTED BIT(0)
28 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
29 #define REG16(x) \
30 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
31 	(((x) >> 2) & 0x7f)
32 #define END 0
33 {
34 	const u32 base = engine->mmio_base;
35 
36 	while (*data) {
37 		u8 count, flags;
38 
39 		if (*data & BIT(7)) { /* skip */
40 			count = *data++ & ~BIT(7);
41 			regs += count;
42 			continue;
43 		}
44 
45 		count = *data & 0x3f;
46 		flags = *data >> 6;
47 		data++;
48 
49 		*regs = MI_LOAD_REGISTER_IMM(count);
50 		if (flags & POSTED)
51 			*regs |= MI_LRI_FORCE_POSTED;
52 		if (GRAPHICS_VER(engine->i915) >= 11)
53 			*regs |= MI_LRI_LRM_CS_MMIO;
54 		regs++;
55 
56 		GEM_BUG_ON(!count);
57 		do {
58 			u32 offset = 0;
59 			u8 v;
60 
61 			do {
62 				v = *data++;
63 				offset <<= 7;
64 				offset |= v & ~BIT(7);
65 			} while (v & BIT(7));
66 
67 			regs[0] = base + (offset << 2);
68 			regs += 2;
69 		} while (--count);
70 	}
71 
72 	if (close) {
73 		/* Close the batch; used mainly by live_lrc_layout() */
74 		*regs = MI_BATCH_BUFFER_END;
75 		if (GRAPHICS_VER(engine->i915) >= 11)
76 			*regs |= BIT(0);
77 	}
78 }
79 
80 static const u8 gen8_xcs_offsets[] = {
81 	NOP(1),
82 	LRI(11, 0),
83 	REG16(0x244),
84 	REG(0x034),
85 	REG(0x030),
86 	REG(0x038),
87 	REG(0x03c),
88 	REG(0x168),
89 	REG(0x140),
90 	REG(0x110),
91 	REG(0x11c),
92 	REG(0x114),
93 	REG(0x118),
94 
95 	NOP(9),
96 	LRI(9, 0),
97 	REG16(0x3a8),
98 	REG16(0x28c),
99 	REG16(0x288),
100 	REG16(0x284),
101 	REG16(0x280),
102 	REG16(0x27c),
103 	REG16(0x278),
104 	REG16(0x274),
105 	REG16(0x270),
106 
107 	NOP(13),
108 	LRI(2, 0),
109 	REG16(0x200),
110 	REG(0x028),
111 
112 	END
113 };
114 
115 static const u8 gen9_xcs_offsets[] = {
116 	NOP(1),
117 	LRI(14, POSTED),
118 	REG16(0x244),
119 	REG(0x034),
120 	REG(0x030),
121 	REG(0x038),
122 	REG(0x03c),
123 	REG(0x168),
124 	REG(0x140),
125 	REG(0x110),
126 	REG(0x11c),
127 	REG(0x114),
128 	REG(0x118),
129 	REG(0x1c0),
130 	REG(0x1c4),
131 	REG(0x1c8),
132 
133 	NOP(3),
134 	LRI(9, POSTED),
135 	REG16(0x3a8),
136 	REG16(0x28c),
137 	REG16(0x288),
138 	REG16(0x284),
139 	REG16(0x280),
140 	REG16(0x27c),
141 	REG16(0x278),
142 	REG16(0x274),
143 	REG16(0x270),
144 
145 	NOP(13),
146 	LRI(1, POSTED),
147 	REG16(0x200),
148 
149 	NOP(13),
150 	LRI(44, POSTED),
151 	REG(0x028),
152 	REG(0x09c),
153 	REG(0x0c0),
154 	REG(0x178),
155 	REG(0x17c),
156 	REG16(0x358),
157 	REG(0x170),
158 	REG(0x150),
159 	REG(0x154),
160 	REG(0x158),
161 	REG16(0x41c),
162 	REG16(0x600),
163 	REG16(0x604),
164 	REG16(0x608),
165 	REG16(0x60c),
166 	REG16(0x610),
167 	REG16(0x614),
168 	REG16(0x618),
169 	REG16(0x61c),
170 	REG16(0x620),
171 	REG16(0x624),
172 	REG16(0x628),
173 	REG16(0x62c),
174 	REG16(0x630),
175 	REG16(0x634),
176 	REG16(0x638),
177 	REG16(0x63c),
178 	REG16(0x640),
179 	REG16(0x644),
180 	REG16(0x648),
181 	REG16(0x64c),
182 	REG16(0x650),
183 	REG16(0x654),
184 	REG16(0x658),
185 	REG16(0x65c),
186 	REG16(0x660),
187 	REG16(0x664),
188 	REG16(0x668),
189 	REG16(0x66c),
190 	REG16(0x670),
191 	REG16(0x674),
192 	REG16(0x678),
193 	REG16(0x67c),
194 	REG(0x068),
195 
196 	END
197 };
198 
199 static const u8 gen12_xcs_offsets[] = {
200 	NOP(1),
201 	LRI(13, POSTED),
202 	REG16(0x244),
203 	REG(0x034),
204 	REG(0x030),
205 	REG(0x038),
206 	REG(0x03c),
207 	REG(0x168),
208 	REG(0x140),
209 	REG(0x110),
210 	REG(0x1c0),
211 	REG(0x1c4),
212 	REG(0x1c8),
213 	REG(0x180),
214 	REG16(0x2b4),
215 
216 	NOP(5),
217 	LRI(9, POSTED),
218 	REG16(0x3a8),
219 	REG16(0x28c),
220 	REG16(0x288),
221 	REG16(0x284),
222 	REG16(0x280),
223 	REG16(0x27c),
224 	REG16(0x278),
225 	REG16(0x274),
226 	REG16(0x270),
227 
228 	END
229 };
230 
231 static const u8 dg2_xcs_offsets[] = {
232 	NOP(1),
233 	LRI(15, POSTED),
234 	REG16(0x244),
235 	REG(0x034),
236 	REG(0x030),
237 	REG(0x038),
238 	REG(0x03c),
239 	REG(0x168),
240 	REG(0x140),
241 	REG(0x110),
242 	REG(0x1c0),
243 	REG(0x1c4),
244 	REG(0x1c8),
245 	REG(0x180),
246 	REG16(0x2b4),
247 	REG(0x120),
248 	REG(0x124),
249 
250 	NOP(1),
251 	LRI(9, POSTED),
252 	REG16(0x3a8),
253 	REG16(0x28c),
254 	REG16(0x288),
255 	REG16(0x284),
256 	REG16(0x280),
257 	REG16(0x27c),
258 	REG16(0x278),
259 	REG16(0x274),
260 	REG16(0x270),
261 
262 	END
263 };
264 
265 static const u8 gen8_rcs_offsets[] = {
266 	NOP(1),
267 	LRI(14, POSTED),
268 	REG16(0x244),
269 	REG(0x034),
270 	REG(0x030),
271 	REG(0x038),
272 	REG(0x03c),
273 	REG(0x168),
274 	REG(0x140),
275 	REG(0x110),
276 	REG(0x11c),
277 	REG(0x114),
278 	REG(0x118),
279 	REG(0x1c0),
280 	REG(0x1c4),
281 	REG(0x1c8),
282 
283 	NOP(3),
284 	LRI(9, POSTED),
285 	REG16(0x3a8),
286 	REG16(0x28c),
287 	REG16(0x288),
288 	REG16(0x284),
289 	REG16(0x280),
290 	REG16(0x27c),
291 	REG16(0x278),
292 	REG16(0x274),
293 	REG16(0x270),
294 
295 	NOP(13),
296 	LRI(1, 0),
297 	REG(0x0c8),
298 
299 	END
300 };
301 
302 static const u8 gen9_rcs_offsets[] = {
303 	NOP(1),
304 	LRI(14, POSTED),
305 	REG16(0x244),
306 	REG(0x34),
307 	REG(0x30),
308 	REG(0x38),
309 	REG(0x3c),
310 	REG(0x168),
311 	REG(0x140),
312 	REG(0x110),
313 	REG(0x11c),
314 	REG(0x114),
315 	REG(0x118),
316 	REG(0x1c0),
317 	REG(0x1c4),
318 	REG(0x1c8),
319 
320 	NOP(3),
321 	LRI(9, POSTED),
322 	REG16(0x3a8),
323 	REG16(0x28c),
324 	REG16(0x288),
325 	REG16(0x284),
326 	REG16(0x280),
327 	REG16(0x27c),
328 	REG16(0x278),
329 	REG16(0x274),
330 	REG16(0x270),
331 
332 	NOP(13),
333 	LRI(1, 0),
334 	REG(0xc8),
335 
336 	NOP(13),
337 	LRI(44, POSTED),
338 	REG(0x28),
339 	REG(0x9c),
340 	REG(0xc0),
341 	REG(0x178),
342 	REG(0x17c),
343 	REG16(0x358),
344 	REG(0x170),
345 	REG(0x150),
346 	REG(0x154),
347 	REG(0x158),
348 	REG16(0x41c),
349 	REG16(0x600),
350 	REG16(0x604),
351 	REG16(0x608),
352 	REG16(0x60c),
353 	REG16(0x610),
354 	REG16(0x614),
355 	REG16(0x618),
356 	REG16(0x61c),
357 	REG16(0x620),
358 	REG16(0x624),
359 	REG16(0x628),
360 	REG16(0x62c),
361 	REG16(0x630),
362 	REG16(0x634),
363 	REG16(0x638),
364 	REG16(0x63c),
365 	REG16(0x640),
366 	REG16(0x644),
367 	REG16(0x648),
368 	REG16(0x64c),
369 	REG16(0x650),
370 	REG16(0x654),
371 	REG16(0x658),
372 	REG16(0x65c),
373 	REG16(0x660),
374 	REG16(0x664),
375 	REG16(0x668),
376 	REG16(0x66c),
377 	REG16(0x670),
378 	REG16(0x674),
379 	REG16(0x678),
380 	REG16(0x67c),
381 	REG(0x68),
382 
383 	END
384 };
385 
386 static const u8 gen11_rcs_offsets[] = {
387 	NOP(1),
388 	LRI(15, POSTED),
389 	REG16(0x244),
390 	REG(0x034),
391 	REG(0x030),
392 	REG(0x038),
393 	REG(0x03c),
394 	REG(0x168),
395 	REG(0x140),
396 	REG(0x110),
397 	REG(0x11c),
398 	REG(0x114),
399 	REG(0x118),
400 	REG(0x1c0),
401 	REG(0x1c4),
402 	REG(0x1c8),
403 	REG(0x180),
404 
405 	NOP(1),
406 	LRI(9, POSTED),
407 	REG16(0x3a8),
408 	REG16(0x28c),
409 	REG16(0x288),
410 	REG16(0x284),
411 	REG16(0x280),
412 	REG16(0x27c),
413 	REG16(0x278),
414 	REG16(0x274),
415 	REG16(0x270),
416 
417 	LRI(1, POSTED),
418 	REG(0x1b0),
419 
420 	NOP(10),
421 	LRI(1, 0),
422 	REG(0x0c8),
423 
424 	END
425 };
426 
427 static const u8 gen12_rcs_offsets[] = {
428 	NOP(1),
429 	LRI(13, POSTED),
430 	REG16(0x244),
431 	REG(0x034),
432 	REG(0x030),
433 	REG(0x038),
434 	REG(0x03c),
435 	REG(0x168),
436 	REG(0x140),
437 	REG(0x110),
438 	REG(0x1c0),
439 	REG(0x1c4),
440 	REG(0x1c8),
441 	REG(0x180),
442 	REG16(0x2b4),
443 
444 	NOP(5),
445 	LRI(9, POSTED),
446 	REG16(0x3a8),
447 	REG16(0x28c),
448 	REG16(0x288),
449 	REG16(0x284),
450 	REG16(0x280),
451 	REG16(0x27c),
452 	REG16(0x278),
453 	REG16(0x274),
454 	REG16(0x270),
455 
456 	LRI(3, POSTED),
457 	REG(0x1b0),
458 	REG16(0x5a8),
459 	REG16(0x5ac),
460 
461 	NOP(6),
462 	LRI(1, 0),
463 	REG(0x0c8),
464 	NOP(3 + 9 + 1),
465 
466 	LRI(51, POSTED),
467 	REG16(0x588),
468 	REG16(0x588),
469 	REG16(0x588),
470 	REG16(0x588),
471 	REG16(0x588),
472 	REG16(0x588),
473 	REG(0x028),
474 	REG(0x09c),
475 	REG(0x0c0),
476 	REG(0x178),
477 	REG(0x17c),
478 	REG16(0x358),
479 	REG(0x170),
480 	REG(0x150),
481 	REG(0x154),
482 	REG(0x158),
483 	REG16(0x41c),
484 	REG16(0x600),
485 	REG16(0x604),
486 	REG16(0x608),
487 	REG16(0x60c),
488 	REG16(0x610),
489 	REG16(0x614),
490 	REG16(0x618),
491 	REG16(0x61c),
492 	REG16(0x620),
493 	REG16(0x624),
494 	REG16(0x628),
495 	REG16(0x62c),
496 	REG16(0x630),
497 	REG16(0x634),
498 	REG16(0x638),
499 	REG16(0x63c),
500 	REG16(0x640),
501 	REG16(0x644),
502 	REG16(0x648),
503 	REG16(0x64c),
504 	REG16(0x650),
505 	REG16(0x654),
506 	REG16(0x658),
507 	REG16(0x65c),
508 	REG16(0x660),
509 	REG16(0x664),
510 	REG16(0x668),
511 	REG16(0x66c),
512 	REG16(0x670),
513 	REG16(0x674),
514 	REG16(0x678),
515 	REG16(0x67c),
516 	REG(0x068),
517 	REG(0x084),
518 	NOP(1),
519 
520 	END
521 };
522 
523 static const u8 xehp_rcs_offsets[] = {
524 	NOP(1),
525 	LRI(13, POSTED),
526 	REG16(0x244),
527 	REG(0x034),
528 	REG(0x030),
529 	REG(0x038),
530 	REG(0x03c),
531 	REG(0x168),
532 	REG(0x140),
533 	REG(0x110),
534 	REG(0x1c0),
535 	REG(0x1c4),
536 	REG(0x1c8),
537 	REG(0x180),
538 	REG16(0x2b4),
539 
540 	NOP(5),
541 	LRI(9, POSTED),
542 	REG16(0x3a8),
543 	REG16(0x28c),
544 	REG16(0x288),
545 	REG16(0x284),
546 	REG16(0x280),
547 	REG16(0x27c),
548 	REG16(0x278),
549 	REG16(0x274),
550 	REG16(0x270),
551 
552 	LRI(3, POSTED),
553 	REG(0x1b0),
554 	REG16(0x5a8),
555 	REG16(0x5ac),
556 
557 	NOP(6),
558 	LRI(1, 0),
559 	REG(0x0c8),
560 
561 	END
562 };
563 
564 static const u8 dg2_rcs_offsets[] = {
565 	NOP(1),
566 	LRI(15, POSTED),
567 	REG16(0x244),
568 	REG(0x034),
569 	REG(0x030),
570 	REG(0x038),
571 	REG(0x03c),
572 	REG(0x168),
573 	REG(0x140),
574 	REG(0x110),
575 	REG(0x1c0),
576 	REG(0x1c4),
577 	REG(0x1c8),
578 	REG(0x180),
579 	REG16(0x2b4),
580 	REG(0x120),
581 	REG(0x124),
582 
583 	NOP(1),
584 	LRI(9, POSTED),
585 	REG16(0x3a8),
586 	REG16(0x28c),
587 	REG16(0x288),
588 	REG16(0x284),
589 	REG16(0x280),
590 	REG16(0x27c),
591 	REG16(0x278),
592 	REG16(0x274),
593 	REG16(0x270),
594 
595 	LRI(3, POSTED),
596 	REG(0x1b0),
597 	REG16(0x5a8),
598 	REG16(0x5ac),
599 
600 	NOP(6),
601 	LRI(1, 0),
602 	REG(0x0c8),
603 
604 	END
605 };
606 
607 #undef END
608 #undef REG16
609 #undef REG
610 #undef LRI
611 #undef NOP
612 
613 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
614 {
615 	/*
616 	 * The gen12+ lists only have the registers we program in the basic
617 	 * default state. We rely on the context image using relative
618 	 * addressing to automatically fix up the register state between the
619 	 * physical engines for a virtual engine.
620 	 */
621 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
622 		   !intel_engine_has_relative_mmio(engine));
623 
624 	if (engine->class == RENDER_CLASS) {
625 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
626 			return dg2_rcs_offsets;
627 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
628 			return xehp_rcs_offsets;
629 		else if (GRAPHICS_VER(engine->i915) >= 12)
630 			return gen12_rcs_offsets;
631 		else if (GRAPHICS_VER(engine->i915) >= 11)
632 			return gen11_rcs_offsets;
633 		else if (GRAPHICS_VER(engine->i915) >= 9)
634 			return gen9_rcs_offsets;
635 		else
636 			return gen8_rcs_offsets;
637 	} else {
638 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
639 			return dg2_xcs_offsets;
640 		else if (GRAPHICS_VER(engine->i915) >= 12)
641 			return gen12_xcs_offsets;
642 		else if (GRAPHICS_VER(engine->i915) >= 9)
643 			return gen9_xcs_offsets;
644 		else
645 			return gen8_xcs_offsets;
646 	}
647 }
648 
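/*
 * The lrc_ring_*() helpers below return the dword index of a register's
 * (reg, value) pair within the context image, or -1 if the register is not
 * present for this engine/gen. The value itself lives at the following
 * dword, which is why callers write to regs[x + 1], as in
 * __reset_stop_ring() further down.
 */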
649 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
650 {
651 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
652 		return 0x70;
653 	else if (GRAPHICS_VER(engine->i915) >= 12)
654 		return 0x60;
655 	else if (GRAPHICS_VER(engine->i915) >= 9)
656 		return 0x54;
657 	else if (engine->class == RENDER_CLASS)
658 		return 0x58;
659 	else
660 		return -1;
661 }
662 
663 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
664 {
665 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
666 		return 0x84;
667 	else if (GRAPHICS_VER(engine->i915) >= 12)
668 		return 0x74;
669 	else if (GRAPHICS_VER(engine->i915) >= 9)
670 		return 0x68;
671 	else if (engine->class == RENDER_CLASS)
672 		return 0xd8;
673 	else
674 		return -1;
675 }
676 
677 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
678 {
679 	if (GRAPHICS_VER(engine->i915) >= 12)
680 		return 0x12;
681 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
682 		return 0x18;
683 	else
684 		return -1;
685 }
686 
687 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
688 {
689 	int x;
690 
691 	x = lrc_ring_wa_bb_per_ctx(engine);
692 	if (x < 0)
693 		return x;
694 
695 	return x + 2;
696 }
697 
698 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
699 {
700 	int x;
701 
702 	x = lrc_ring_indirect_ptr(engine);
703 	if (x < 0)
704 		return x;
705 
706 	return x + 2;
707 }
708 
709 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
710 {
711 
712 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
713 		/*
714 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
715 		 * simply to match the RCS context image layout.
716 		 */
717 		return 0xc6;
718 	else if (engine->class != RENDER_CLASS)
719 		return -1;
720 	else if (GRAPHICS_VER(engine->i915) >= 12)
721 		return 0xb6;
722 	else if (GRAPHICS_VER(engine->i915) >= 11)
723 		return 0xaa;
724 	else
725 		return -1;
726 }
727 
728 static u32
729 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
730 {
731 	switch (GRAPHICS_VER(engine->i915)) {
732 	default:
733 		MISSING_CASE(GRAPHICS_VER(engine->i915));
734 		fallthrough;
735 	case 12:
736 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
737 	case 11:
738 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
739 	case 9:
740 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
741 	case 8:
742 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
743 	}
744 }
745 
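/*
 * A sketch of how the indirect context batch is advertised to the HW: the
 * slot returned by lrc_ring_indirect_ptr() receives the (cacheline aligned)
 * GGTT address of the batch with its size in cachelines packed into the low
 * bits, and the slot returned by lrc_ring_indirect_offset() receives the
 * per-gen default from lrc_ring_indirect_offset_default() shifted left by 6.
 */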
746 static void
747 lrc_setup_indirect_ctx(u32 *regs,
748 		       const struct intel_engine_cs *engine,
749 		       u32 ctx_bb_ggtt_addr,
750 		       u32 size)
751 {
752 	GEM_BUG_ON(!size);
753 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
754 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
755 	regs[lrc_ring_indirect_ptr(engine) + 1] =
756 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
757 
758 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
759 	regs[lrc_ring_indirect_offset(engine) + 1] =
760 		lrc_ring_indirect_offset_default(engine) << 6;
761 }
762 
763 static void init_common_regs(u32 * const regs,
764 			     const struct intel_context *ce,
765 			     const struct intel_engine_cs *engine,
766 			     bool inhibit)
767 {
768 	u32 ctl;
769 
770 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
771 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
772 	if (inhibit)
773 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
774 	if (GRAPHICS_VER(engine->i915) < 11)
775 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
776 					   CTX_CTRL_RS_CTX_ENABLE);
777 	regs[CTX_CONTEXT_CONTROL] = ctl;
778 
779 	regs[CTX_TIMESTAMP] = ce->runtime.last;
780 }
781 
782 static void init_wa_bb_regs(u32 * const regs,
783 			    const struct intel_engine_cs *engine)
784 {
785 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
786 
787 	if (wa_ctx->per_ctx.size) {
788 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
789 
790 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
791 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
792 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
793 	}
794 
795 	if (wa_ctx->indirect_ctx.size) {
796 		lrc_setup_indirect_ctx(regs, engine,
797 				       i915_ggtt_offset(wa_ctx->vma) +
798 				       wa_ctx->indirect_ctx.offset,
799 				       wa_ctx->indirect_ctx.size);
800 	}
801 }
802 
803 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
804 {
805 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
806 		/* 64b PPGTT (48bit canonical):
807 		 * PDP0_DESCRIPTOR contains the base address of the PML4 and
808 		 * the other PDP descriptors are ignored.
809 		 */
810 		ASSIGN_CTX_PML4(ppgtt, regs);
811 	} else {
812 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
813 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
814 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
815 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
816 	}
817 }
818 
819 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
820 {
821 	if (i915_is_ggtt(vm))
822 		return i915_vm_to_ggtt(vm)->alias;
823 	else
824 		return i915_vm_to_ppgtt(vm);
825 }
826 
827 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
828 {
829 	int x;
830 
831 	x = lrc_ring_mi_mode(engine);
832 	if (x != -1) {
833 		regs[x + 1] &= ~STOP_RING;
834 		regs[x + 1] |= STOP_RING << 16;
835 	}
836 }
837 
838 static void __lrc_init_regs(u32 *regs,
839 			    const struct intel_context *ce,
840 			    const struct intel_engine_cs *engine,
841 			    bool inhibit)
842 {
843 	/*
844 	 * A context is actually a big batch buffer with several
845 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
846 	 * values we are setting here are only for the first context restore:
847 	 * on a subsequent save, the GPU will recreate this batch buffer with new
848 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
849 	 * we are not initializing here).
850 	 *
851 	 * Must be kept consistent with virtual_update_register_offsets().
852 	 */
853 
854 	if (inhibit)
855 		memset(regs, 0, PAGE_SIZE);
856 
857 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
858 
859 	init_common_regs(regs, ce, engine, inhibit);
860 	init_ppgtt_regs(regs, vm_alias(ce->vm));
861 
862 	init_wa_bb_regs(regs, engine);
863 
864 	__reset_stop_ring(regs, engine);
865 }
866 
867 void lrc_init_regs(const struct intel_context *ce,
868 		   const struct intel_engine_cs *engine,
869 		   bool inhibit)
870 {
871 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
872 }
873 
874 void lrc_reset_regs(const struct intel_context *ce,
875 		    const struct intel_engine_cs *engine)
876 {
877 	__reset_stop_ring(ce->lrc_reg_state, engine);
878 }
879 
880 static void
881 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
882 {
883 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
884 		return;
885 
886 	vaddr += engine->context_size;
887 
888 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
889 }
890 
891 static void
892 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
893 {
894 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
895 		return;
896 
897 	vaddr += engine->context_size;
898 
899 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
900 		drm_err_once(&engine->i915->drm,
901 			     "%s context redzone overwritten!\n",
902 			     engine->name);
903 }
904 
905 void lrc_init_state(struct intel_context *ce,
906 		    struct intel_engine_cs *engine,
907 		    void *state)
908 {
909 	bool inhibit = true;
910 
911 	set_redzone(state, engine);
912 
913 	if (engine->default_state) {
914 		shmem_read(engine->default_state, 0,
915 			   state, engine->context_size);
916 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
917 		inhibit = false;
918 	}
919 
920 	/* Clear the ppHWSP (inc. per-context counters) */
921 	memset(state, 0, PAGE_SIZE);
922 
923 	/*
924 	 * The second page of the context object contains some registers which
925 	 * must be set up prior to the first execution.
926 	 */
927 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
928 }
929 
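/*
 * Rough layout of the state object allocated below: the context image itself
 * (engine->context_size, rounded up to a GTT page), optionally followed by a
 * redzone page (CONFIG_DRM_I915_DEBUG_GEM), a wa_bb page on gen12 (recorded
 * in ce->wa_bb_page) and a parent scratch area for GuC parallel submission
 * (recorded in ce->parallel.guc.parent_page).
 */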
930 static struct i915_vma *
931 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
932 {
933 	struct drm_i915_gem_object *obj;
934 	struct i915_vma *vma;
935 	u32 context_size;
936 
937 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
938 
939 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
940 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
941 
942 	if (GRAPHICS_VER(engine->i915) == 12) {
943 		ce->wa_bb_page = context_size / PAGE_SIZE;
944 		context_size += PAGE_SIZE;
945 	}
946 
947 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
948 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
949 		context_size += PARENT_SCRATCH_SIZE;
950 	}
951 
952 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
953 					  I915_BO_ALLOC_PM_VOLATILE);
954 	if (IS_ERR(obj))
955 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
956 	if (IS_ERR(obj))
957 		return ERR_CAST(obj);
958 
959 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
960 	if (IS_ERR(vma)) {
961 		i915_gem_object_put(obj);
962 		return vma;
963 	}
964 
965 	return vma;
966 }
967 
968 static struct intel_timeline *
969 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
970 {
971 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
972 
973 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
974 }
975 
976 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
977 {
978 	struct intel_ring *ring;
979 	struct i915_vma *vma;
980 	int err;
981 
982 	GEM_BUG_ON(ce->state);
983 
984 	vma = __lrc_alloc_state(ce, engine);
985 	if (IS_ERR(vma))
986 		return PTR_ERR(vma);
987 
988 	ring = intel_engine_create_ring(engine, ce->ring_size);
989 	if (IS_ERR(ring)) {
990 		err = PTR_ERR(ring);
991 		goto err_vma;
992 	}
993 
994 	if (!page_mask_bits(ce->timeline)) {
995 		struct intel_timeline *tl;
996 
997 		/*
998 		 * Use the static global HWSP for the kernel context, and
999 		 * a dynamically allocated cacheline for everyone else.
1000 		 */
1001 		if (unlikely(ce->timeline))
1002 			tl = pinned_timeline(ce, engine);
1003 		else
1004 			tl = intel_timeline_create(engine->gt);
1005 		if (IS_ERR(tl)) {
1006 			err = PTR_ERR(tl);
1007 			goto err_ring;
1008 		}
1009 
1010 		ce->timeline = tl;
1011 	}
1012 
1013 	ce->ring = ring;
1014 	ce->state = vma;
1015 
1016 	return 0;
1017 
1018 err_ring:
1019 	intel_ring_put(ring);
1020 err_vma:
1021 	i915_vma_put(vma);
1022 	return err;
1023 }
1024 
1025 void lrc_reset(struct intel_context *ce)
1026 {
1027 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1028 
1029 	intel_ring_reset(ce->ring, ce->ring->emit);
1030 
1031 	/* Scrub away the garbage */
1032 	lrc_init_regs(ce, ce->engine, true);
1033 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1034 }
1035 
1036 int
1037 lrc_pre_pin(struct intel_context *ce,
1038 	    struct intel_engine_cs *engine,
1039 	    struct i915_gem_ww_ctx *ww,
1040 	    void **vaddr)
1041 {
1042 	GEM_BUG_ON(!ce->state);
1043 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1044 
1045 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1046 					 i915_coherent_map_type(ce->engine->i915,
1047 								ce->state->obj,
1048 								false) |
1049 					 I915_MAP_OVERRIDE);
1050 
1051 	return PTR_ERR_OR_ZERO(*vaddr);
1052 }
1053 
1054 int
1055 lrc_pin(struct intel_context *ce,
1056 	struct intel_engine_cs *engine,
1057 	void *vaddr)
1058 {
1059 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1060 
1061 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1062 		lrc_init_state(ce, engine, vaddr);
1063 
1064 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1065 	return 0;
1066 }
1067 
1068 void lrc_unpin(struct intel_context *ce)
1069 {
1070 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1071 		      ce->engine);
1072 }
1073 
1074 void lrc_post_unpin(struct intel_context *ce)
1075 {
1076 	i915_gem_object_unpin_map(ce->state->obj);
1077 }
1078 
1079 void lrc_fini(struct intel_context *ce)
1080 {
1081 	if (!ce->state)
1082 		return;
1083 
1084 	intel_ring_put(fetch_and_zero(&ce->ring));
1085 	i915_vma_put(fetch_and_zero(&ce->state));
1086 }
1087 
1088 void lrc_destroy(struct kref *kref)
1089 {
1090 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1091 
1092 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1093 	GEM_BUG_ON(intel_context_is_pinned(ce));
1094 
1095 	lrc_fini(ce);
1096 
1097 	intel_context_fini(ce);
1098 	intel_context_free(ce);
1099 }
1100 
1101 static u32 *
1102 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1103 {
1104 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1105 		MI_SRM_LRM_GLOBAL_GTT |
1106 		MI_LRI_LRM_CS_MMIO;
1107 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1108 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1109 		CTX_TIMESTAMP * sizeof(u32);
1110 	*cs++ = 0;
1111 
1112 	*cs++ = MI_LOAD_REGISTER_REG |
1113 		MI_LRR_SOURCE_CS_MMIO |
1114 		MI_LRI_LRM_CS_MMIO;
1115 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1116 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1117 
1118 	*cs++ = MI_LOAD_REGISTER_REG |
1119 		MI_LRR_SOURCE_CS_MMIO |
1120 		MI_LRI_LRM_CS_MMIO;
1121 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1122 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1123 
1124 	return cs;
1125 }
1126 
1127 static u32 *
1128 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1129 {
1130 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1131 
1132 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1133 		MI_SRM_LRM_GLOBAL_GTT |
1134 		MI_LRI_LRM_CS_MMIO;
1135 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1136 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1137 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1138 	*cs++ = 0;
1139 
1140 	return cs;
1141 }
1142 
1143 static u32 *
1144 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1145 {
1146 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1147 
1148 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1149 		MI_SRM_LRM_GLOBAL_GTT |
1150 		MI_LRI_LRM_CS_MMIO;
1151 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1152 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1153 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1154 	*cs++ = 0;
1155 
1156 	*cs++ = MI_LOAD_REGISTER_REG |
1157 		MI_LRR_SOURCE_CS_MMIO |
1158 		MI_LRI_LRM_CS_MMIO;
1159 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1160 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1161 
1162 	return cs;
1163 }
1164 
1165 static u32 *
1166 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1167 {
1168 	cs = gen12_emit_timestamp_wa(ce, cs);
1169 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1170 	cs = gen12_emit_restore_scratch(ce, cs);
1171 
1172 	/* Wa_16013000631:dg2 */
1173 	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1174 	    IS_DG2_G11(ce->engine->i915))
1175 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1176 
1177 	return cs;
1178 }
1179 
1180 static u32 *
1181 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1182 {
1183 	cs = gen12_emit_timestamp_wa(ce, cs);
1184 	cs = gen12_emit_restore_scratch(ce, cs);
1185 
1186 	return cs;
1187 }
1188 
1189 static u32 context_wa_bb_offset(const struct intel_context *ce)
1190 {
1191 	return PAGE_SIZE * ce->wa_bb_page;
1192 }
1193 
1194 static u32 *context_indirect_bb(const struct intel_context *ce)
1195 {
1196 	void *ptr;
1197 
1198 	GEM_BUG_ON(!ce->wa_bb_page);
1199 
1200 	ptr = ce->lrc_reg_state;
1201 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1202 	ptr += context_wa_bb_offset(ce);
1203 
1204 	return ptr;
1205 }
1206 
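/*
 * A sketch of how the per-context indirect WA batch is wired up: the batch
 * is emitted into the wa_bb page of the context object (context_indirect_bb()),
 * padded with MI_NOOPs up to a cacheline boundary since its size is expressed
 * in cachelines, and then hooked into the context image via
 * lrc_setup_indirect_ctx().
 */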
1207 static void
1208 setup_indirect_ctx_bb(const struct intel_context *ce,
1209 		      const struct intel_engine_cs *engine,
1210 		      u32 *(*emit)(const struct intel_context *, u32 *))
1211 {
1212 	u32 * const start = context_indirect_bb(ce);
1213 	u32 *cs;
1214 
1215 	cs = emit(ce, start);
1216 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1217 	while ((unsigned long)cs % CACHELINE_BYTES)
1218 		*cs++ = MI_NOOP;
1219 
1220 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1221 			       i915_ggtt_offset(ce->state) +
1222 			       context_wa_bb_offset(ce),
1223 			       (cs - start) * sizeof(*cs));
1224 }
1225 
1226 /*
1227  * The context descriptor encodes various attributes of a context,
1228  * including its GTT address and some flags. Because it's fairly
1229  * expensive to calculate, we'll just do it once and cache the result,
1230  * which remains valid until the context is unpinned.
1231  *
1232  * This is what a descriptor looks like, from LSB to MSB::
1233  *
1234  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1235  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1236  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1237  *      bits 53-54:    mbz, reserved for use by hardware
1238  *      bits 55-63:    group ID, currently unused and set to 0
1239  *
1240  * Starting from Gen11, the upper dword of the descriptor has a new format:
1241  *
1242  *      bits 32-36:    reserved
1243  *      bits 37-47:    SW context ID
1244  *      bits 48-53:    engine instance
1245  *      bit 54:        mbz, reserved for use by hardware
1246  *      bits 55-60:    SW counter
1247  *      bits 61-63:    engine class
1248  *
1249  * On Xe_HP, the upper dword of the descriptor has a new format:
1250  *
1251  *      bits 32-37:    virtual function number
1252  *      bit 38:        mbz, reserved for use by hardware
1253  *      bits 39-54:    SW context ID
1254  *      bits 55-57:    reserved
1255  *      bits 58-63:    SW counter
1256  *
1257  * engine info, SW context ID and SW counter need to form a unique number
1258  * (Context ID) per lrc.
1259  */
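/*
 * lrc_descriptor() below only computes the lower dword; as a worked example,
 * for a context using a 4 level ppGTT on a Gen8 part it returns
 * i915_ggtt_offset(ce->state) | GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE |
 * GEN8_CTX_L3LLC_COHERENT |
 * (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT).
 */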
1260 static u32 lrc_descriptor(const struct intel_context *ce)
1261 {
1262 	u32 desc;
1263 
1264 	desc = INTEL_LEGACY_32B_CONTEXT;
1265 	if (i915_vm_is_4lvl(ce->vm))
1266 		desc = INTEL_LEGACY_64B_CONTEXT;
1267 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1268 
1269 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1270 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1271 		desc |= GEN8_CTX_L3LLC_COHERENT;
1272 
1273 	return i915_ggtt_offset(ce->state) | desc;
1274 }
1275 
1276 u32 lrc_update_regs(const struct intel_context *ce,
1277 		    const struct intel_engine_cs *engine,
1278 		    u32 head)
1279 {
1280 	struct intel_ring *ring = ce->ring;
1281 	u32 *regs = ce->lrc_reg_state;
1282 
1283 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1284 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1285 
1286 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1287 	regs[CTX_RING_HEAD] = head;
1288 	regs[CTX_RING_TAIL] = ring->tail;
1289 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1290 
1291 	/* RPCS */
1292 	if (engine->class == RENDER_CLASS) {
1293 		regs[CTX_R_PWR_CLK_STATE] =
1294 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1295 
1296 		i915_oa_init_reg_state(ce, engine);
1297 	}
1298 
1299 	if (ce->wa_bb_page) {
1300 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1301 
1302 		fn = gen12_emit_indirect_ctx_xcs;
1303 		if (ce->engine->class == RENDER_CLASS)
1304 			fn = gen12_emit_indirect_ctx_rcs;
1305 
1306 		/* Mutually exclusive with the global indirect bb */
1307 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1308 		setup_indirect_ctx_bb(ce, engine, fn);
1309 	}
1310 
1311 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1312 }
1313 
1314 void lrc_update_offsets(struct intel_context *ce,
1315 			struct intel_engine_cs *engine)
1316 {
1317 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1318 }
1319 
1320 void lrc_check_regs(const struct intel_context *ce,
1321 		    const struct intel_engine_cs *engine,
1322 		    const char *when)
1323 {
1324 	const struct intel_ring *ring = ce->ring;
1325 	u32 *regs = ce->lrc_reg_state;
1326 	bool valid = true;
1327 	int x;
1328 
1329 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1330 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1331 		       engine->name,
1332 		       regs[CTX_RING_START],
1333 		       i915_ggtt_offset(ring->vma));
1334 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1335 		valid = false;
1336 	}
1337 
1338 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1339 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1340 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1341 		       engine->name,
1342 		       regs[CTX_RING_CTL],
1343 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1344 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1345 		valid = false;
1346 	}
1347 
1348 	x = lrc_ring_mi_mode(engine);
1349 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1350 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1351 		       engine->name, regs[x + 1]);
1352 		regs[x + 1] &= ~STOP_RING;
1353 		regs[x + 1] |= STOP_RING << 16;
1354 		valid = false;
1355 	}
1356 
1357 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1358 }
1359 
1360 /*
1361  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
1362  * PIPE_CONTROL instruction. This is required for the flush to happen
1363  * correctly, but there is a slight complication: this is applied in a WA
1364  * batch where the values are only initialized once, so we cannot read the
1365  * register value at the beginning and reuse it later; hence we save its
1366  * value to memory, upload a constant value with bit 21 set and then restore
1367  * the saved value afterwards. To simplify the WA, the constant is formed
1368  * from the default value of this register. This shouldn't be a problem
1369  * because we only modify it for a short period and this batch is
1370  * non-preemptible. We could of course read the actual register value and
1371  * set our bit of interest instead, but that would make the WA more
1372  * complicated.
1373  *
1374  * This WA is also required for Gen9, so extracting it as a function avoids
1375  */
1376 static u32 *
1377 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1378 {
1379 	/* NB no one else is allowed to scribble over scratch + 256! */
1380 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1381 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1382 	*batch++ = intel_gt_scratch_offset(engine->gt,
1383 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1384 	*batch++ = 0;
1385 
1386 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1387 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1388 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1389 
1390 	batch = gen8_emit_pipe_control(batch,
1391 				       PIPE_CONTROL_CS_STALL |
1392 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1393 				       0);
1394 
1395 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1396 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1397 	*batch++ = intel_gt_scratch_offset(engine->gt,
1398 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1399 	*batch++ = 0;
1400 
1401 	return batch;
1402 }
1403 
1404 /*
1405  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1406  * initialized at the beginning and shared across all contexts but this field
1407  * helps us to have multiple batches at different offsets and select them based
1408  * on a criteria. At the moment this batch always start at the beginning of the page
1409  * and at this point we don't have multiple wa_ctx batch buffers.
1410  *
1411  * The number of WA applied are not known at the beginning; we use this field
1412  * to return the no of DWORDS written.
1413  *
1414  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1415  * so it adds NOOPs as padding to make it cacheline aligned.
1416  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1417  * makes a complete batch buffer.
1418  */
1419 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1420 {
1421 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1422 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1423 
1424 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1425 	if (IS_BROADWELL(engine->i915))
1426 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1427 
1428 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1429 	/* Actual scratch location is at a 128 byte offset */
1430 	batch = gen8_emit_pipe_control(batch,
1431 				       PIPE_CONTROL_FLUSH_L3 |
1432 				       PIPE_CONTROL_STORE_DATA_INDEX |
1433 				       PIPE_CONTROL_CS_STALL |
1434 				       PIPE_CONTROL_QW_WRITE,
1435 				       LRC_PPHWSP_SCRATCH_ADDR);
1436 
1437 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1438 
1439 	/* Pad to end of cacheline */
1440 	while ((unsigned long)batch % CACHELINE_BYTES)
1441 		*batch++ = MI_NOOP;
1442 
1443 	/*
1444 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1445 	 * execution depends on the length specified in terms of cache lines
1446 	 * in the register CTX_RCS_INDIRECT_CTX
1447 	 */
1448 
1449 	return batch;
1450 }
1451 
1452 struct lri {
1453 	i915_reg_t reg;
1454 	u32 value;
1455 };
1456 
1457 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1458 {
1459 	GEM_BUG_ON(!count || count > 63);
1460 
1461 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1462 	do {
1463 		*batch++ = i915_mmio_reg_offset(lri->reg);
1464 		*batch++ = lri->value;
1465 	} while (lri++, --count);
1466 	*batch++ = MI_NOOP;
1467 
1468 	return batch;
1469 }
1470 
1471 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1472 {
1473 	static const struct lri lri[] = {
1474 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1475 		{
1476 			COMMON_SLICE_CHICKEN2,
1477 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1478 				       0),
1479 		},
1480 
1481 		/* BSpec: 11391 */
1482 		{
1483 			FF_SLICE_CHICKEN,
1484 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1485 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1486 		},
1487 
1488 		/* BSpec: 11299 */
1489 		{
1490 			_3D_CHICKEN3,
1491 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1492 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1493 		}
1494 	};
1495 
1496 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1497 
1498 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1499 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1500 
1501 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1502 	batch = gen8_emit_pipe_control(batch,
1503 				       PIPE_CONTROL_FLUSH_L3 |
1504 				       PIPE_CONTROL_STORE_DATA_INDEX |
1505 				       PIPE_CONTROL_CS_STALL |
1506 				       PIPE_CONTROL_QW_WRITE,
1507 				       LRC_PPHWSP_SCRATCH_ADDR);
1508 
1509 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1510 
1511 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1512 	if (HAS_POOLED_EU(engine->i915)) {
1513 		/*
1514 		 * EU pool configuration is set up along with the golden context
1515 		 * during context initialization. This value depends on the
1516 		 * device type (2x6 or 3x6) and needs to be updated based
1517 		 * on which subslice is disabled, especially for 2x6
1518 		 * devices. However, it is safe to load the default
1519 		 * configuration of a 3x6 device instead of masking off the
1520 		 * corresponding bits because HW ignores the bits of a disabled
1521 		 * subslice and drops down to the appropriate config. Please
1522 		 * see render_state_setup() in i915_gem_render_state.c for the
1523 		 * possible configurations; to avoid duplication they are
1524 		 * not shown here again.
1525 		 */
1526 		*batch++ = GEN9_MEDIA_POOL_STATE;
1527 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1528 		*batch++ = 0x00777000;
1529 		*batch++ = 0;
1530 		*batch++ = 0;
1531 		*batch++ = 0;
1532 	}
1533 
1534 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1535 
1536 	/* Pad to end of cacheline */
1537 	while ((unsigned long)batch % CACHELINE_BYTES)
1538 		*batch++ = MI_NOOP;
1539 
1540 	return batch;
1541 }
1542 
1543 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1544 
1545 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1546 {
1547 	struct drm_i915_gem_object *obj;
1548 	struct i915_vma *vma;
1549 	int err;
1550 
1551 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1552 	if (IS_ERR(obj))
1553 		return PTR_ERR(obj);
1554 
1555 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1556 	if (IS_ERR(vma)) {
1557 		err = PTR_ERR(vma);
1558 		goto err;
1559 	}
1560 
1561 	engine->wa_ctx.vma = vma;
1562 	return 0;
1563 
1564 err:
1565 	i915_gem_object_put(obj);
1566 	return err;
1567 }
1568 
1569 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1570 {
1571 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1572 }
1573 
1574 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1575 
1576 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1577 {
1578 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1579 	struct i915_wa_ctx_bb *wa_bb[] = {
1580 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1581 	};
1582 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1583 	struct i915_gem_ww_ctx ww;
1584 	void *batch, *batch_ptr;
1585 	unsigned int i;
1586 	int err;
1587 
1588 	if (engine->class != RENDER_CLASS)
1589 		return;
1590 
1591 	switch (GRAPHICS_VER(engine->i915)) {
1592 	case 12:
1593 	case 11:
1594 		return;
1595 	case 9:
1596 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1597 		wa_bb_fn[1] = NULL;
1598 		break;
1599 	case 8:
1600 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1601 		wa_bb_fn[1] = NULL;
1602 		break;
1603 	default:
1604 		MISSING_CASE(GRAPHICS_VER(engine->i915));
1605 		return;
1606 	}
1607 
1608 	err = lrc_create_wa_ctx(engine);
1609 	if (err) {
1610 		/*
1611 		 * We continue even if we fail to initialize the WA batch
1612 		 * because we only expect rare glitches, nothing critical
1613 		 * enough to prevent us from using the GPU.
1614 		 */
1615 		drm_err(&engine->i915->drm,
1616 			"Ignoring context switch w/a allocation error:%d\n",
1617 			err);
1618 		return;
1619 	}
1620 
1621 	if (!engine->wa_ctx.vma)
1622 		return;
1623 
1624 	i915_gem_ww_ctx_init(&ww, true);
1625 retry:
1626 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1627 	if (!err)
1628 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1629 	if (err)
1630 		goto err;
1631 
1632 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1633 	if (IS_ERR(batch)) {
1634 		err = PTR_ERR(batch);
1635 		goto err_unpin;
1636 	}
1637 
1638 	/*
1639 	 * Emit the two workaround batch buffers, recording the offset from the
1640 	 * start of the workaround batch buffer object for each and their
1641 	 * respective sizes.
1642 	 */
1643 	batch_ptr = batch;
1644 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1645 		wa_bb[i]->offset = batch_ptr - batch;
1646 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1647 						  CACHELINE_BYTES))) {
1648 			err = -EINVAL;
1649 			break;
1650 		}
1651 		if (wa_bb_fn[i])
1652 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1653 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1654 	}
1655 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1656 
1657 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1658 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1659 
1660 	/* Verify that we can handle failure to setup the wa_ctx */
1661 	if (!err)
1662 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1663 
1664 err_unpin:
1665 	if (err)
1666 		i915_vma_unpin(wa_ctx->vma);
1667 err:
1668 	if (err == -EDEADLK) {
1669 		err = i915_gem_ww_ctx_backoff(&ww);
1670 		if (!err)
1671 			goto retry;
1672 	}
1673 	i915_gem_ww_ctx_fini(&ww);
1674 
1675 	if (err) {
1676 		i915_vma_put(engine->wa_ctx.vma);
1677 
1678 		/* Clear all flags to prevent further use */
1679 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1680 	}
1681 }
1682 
1683 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1684 {
1685 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1686 	ce->runtime.num_underflow++;
1687 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1688 #endif
1689 }
1690 
1691 static u32 lrc_get_runtime(const struct intel_context *ce)
1692 {
1693 	/*
1694 	 * We can use either ppHWSP[16] which is recorded before the context
1695 	 * switch (and so excludes the cost of context switches) or use the
1696 	 * value from the context image itself, which is saved/restored earlier
1697 	 * and so includes the cost of the save.
1698 	 */
1699 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1700 }
1701 
1702 void lrc_update_runtime(struct intel_context *ce)
1703 {
1704 	u32 old;
1705 	s32 dt;
1706 
1707 	if (intel_context_is_barrier(ce))
1708 		return;
1709 
1710 	old = ce->runtime.last;
1711 	ce->runtime.last = lrc_get_runtime(ce);
1712 	dt = ce->runtime.last - old;
1713 
1714 	if (unlikely(dt < 0)) {
1715 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1716 			 old, ce->runtime.last, dt);
1717 		st_update_runtime_underflow(ce, dt);
1718 		return;
1719 	}
1720 
1721 	ewma_runtime_add(&ce->runtime.avg, dt);
1722 	ce->runtime.total += dt;
1723 }
1724 
1725 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1726 #include "selftest_lrc.c"
1727 #endif
1728