xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 3297481d)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
19 static void set_offsets(u32 *regs,
20 			const u8 *data,
21 			const struct intel_engine_cs *engine,
22 			bool close)
23 #define NOP(x) (BIT(7) | (x))
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
27 #define REG16(x) \
28 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29 	(((x) >> 2) & 0x7f)
30 #define END 0
31 {
32 	const u32 base = engine->mmio_base;
33 
34 	while (*data) {
35 		u8 count, flags;
36 
37 		if (*data & BIT(7)) { /* skip */
38 			count = *data++ & ~BIT(7);
39 			regs += count;
40 			continue;
41 		}
42 
43 		count = *data & 0x3f;
44 		flags = *data >> 6;
45 		data++;
46 
47 		*regs = MI_LOAD_REGISTER_IMM(count);
48 		if (flags & POSTED)
49 			*regs |= MI_LRI_FORCE_POSTED;
50 		if (GRAPHICS_VER(engine->i915) >= 11)
51 			*regs |= MI_LRI_LRM_CS_MMIO;
52 		regs++;
53 
54 		GEM_BUG_ON(!count);
55 		do {
56 			u32 offset = 0;
57 			u8 v;
58 
59 			do {
60 				v = *data++;
61 				offset <<= 7;
62 				offset |= v & ~BIT(7);
63 			} while (v & BIT(7));
64 
65 			regs[0] = base + (offset << 2);
66 			regs += 2;
67 		} while (--count);
68 	}
69 
70 	if (close) {
71 		/* Close the batch; used mainly by live_lrc_layout() */
72 		*regs = MI_BATCH_BUFFER_END;
73 		if (GRAPHICS_VER(engine->i915) >= 11)
74 			*regs |= BIT(0);
75 	}
76 }
77 
78 static const u8 gen8_xcs_offsets[] = {
79 	NOP(1),
80 	LRI(11, 0),
81 	REG16(0x244),
82 	REG(0x034),
83 	REG(0x030),
84 	REG(0x038),
85 	REG(0x03c),
86 	REG(0x168),
87 	REG(0x140),
88 	REG(0x110),
89 	REG(0x11c),
90 	REG(0x114),
91 	REG(0x118),
92 
93 	NOP(9),
94 	LRI(9, 0),
95 	REG16(0x3a8),
96 	REG16(0x28c),
97 	REG16(0x288),
98 	REG16(0x284),
99 	REG16(0x280),
100 	REG16(0x27c),
101 	REG16(0x278),
102 	REG16(0x274),
103 	REG16(0x270),
104 
105 	NOP(13),
106 	LRI(2, 0),
107 	REG16(0x200),
108 	REG(0x028),
109 
110 	END
111 };
112 
113 static const u8 gen9_xcs_offsets[] = {
114 	NOP(1),
115 	LRI(14, POSTED),
116 	REG16(0x244),
117 	REG(0x034),
118 	REG(0x030),
119 	REG(0x038),
120 	REG(0x03c),
121 	REG(0x168),
122 	REG(0x140),
123 	REG(0x110),
124 	REG(0x11c),
125 	REG(0x114),
126 	REG(0x118),
127 	REG(0x1c0),
128 	REG(0x1c4),
129 	REG(0x1c8),
130 
131 	NOP(3),
132 	LRI(9, POSTED),
133 	REG16(0x3a8),
134 	REG16(0x28c),
135 	REG16(0x288),
136 	REG16(0x284),
137 	REG16(0x280),
138 	REG16(0x27c),
139 	REG16(0x278),
140 	REG16(0x274),
141 	REG16(0x270),
142 
143 	NOP(13),
144 	LRI(1, POSTED),
145 	REG16(0x200),
146 
147 	NOP(13),
148 	LRI(44, POSTED),
149 	REG(0x028),
150 	REG(0x09c),
151 	REG(0x0c0),
152 	REG(0x178),
153 	REG(0x17c),
154 	REG16(0x358),
155 	REG(0x170),
156 	REG(0x150),
157 	REG(0x154),
158 	REG(0x158),
159 	REG16(0x41c),
160 	REG16(0x600),
161 	REG16(0x604),
162 	REG16(0x608),
163 	REG16(0x60c),
164 	REG16(0x610),
165 	REG16(0x614),
166 	REG16(0x618),
167 	REG16(0x61c),
168 	REG16(0x620),
169 	REG16(0x624),
170 	REG16(0x628),
171 	REG16(0x62c),
172 	REG16(0x630),
173 	REG16(0x634),
174 	REG16(0x638),
175 	REG16(0x63c),
176 	REG16(0x640),
177 	REG16(0x644),
178 	REG16(0x648),
179 	REG16(0x64c),
180 	REG16(0x650),
181 	REG16(0x654),
182 	REG16(0x658),
183 	REG16(0x65c),
184 	REG16(0x660),
185 	REG16(0x664),
186 	REG16(0x668),
187 	REG16(0x66c),
188 	REG16(0x670),
189 	REG16(0x674),
190 	REG16(0x678),
191 	REG16(0x67c),
192 	REG(0x068),
193 
194 	END
195 };
196 
197 static const u8 gen12_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(13, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 
214 	NOP(5),
215 	LRI(9, POSTED),
216 	REG16(0x3a8),
217 	REG16(0x28c),
218 	REG16(0x288),
219 	REG16(0x284),
220 	REG16(0x280),
221 	REG16(0x27c),
222 	REG16(0x278),
223 	REG16(0x274),
224 	REG16(0x270),
225 
226 	END
227 };
228 
229 static const u8 dg2_xcs_offsets[] = {
230 	NOP(1),
231 	LRI(15, POSTED),
232 	REG16(0x244),
233 	REG(0x034),
234 	REG(0x030),
235 	REG(0x038),
236 	REG(0x03c),
237 	REG(0x168),
238 	REG(0x140),
239 	REG(0x110),
240 	REG(0x1c0),
241 	REG(0x1c4),
242 	REG(0x1c8),
243 	REG(0x180),
244 	REG16(0x2b4),
245 	REG(0x120),
246 	REG(0x124),
247 
248 	NOP(1),
249 	LRI(9, POSTED),
250 	REG16(0x3a8),
251 	REG16(0x28c),
252 	REG16(0x288),
253 	REG16(0x284),
254 	REG16(0x280),
255 	REG16(0x27c),
256 	REG16(0x278),
257 	REG16(0x274),
258 	REG16(0x270),
259 
260 	END
261 };
262 
263 static const u8 gen8_rcs_offsets[] = {
264 	NOP(1),
265 	LRI(14, POSTED),
266 	REG16(0x244),
267 	REG(0x034),
268 	REG(0x030),
269 	REG(0x038),
270 	REG(0x03c),
271 	REG(0x168),
272 	REG(0x140),
273 	REG(0x110),
274 	REG(0x11c),
275 	REG(0x114),
276 	REG(0x118),
277 	REG(0x1c0),
278 	REG(0x1c4),
279 	REG(0x1c8),
280 
281 	NOP(3),
282 	LRI(9, POSTED),
283 	REG16(0x3a8),
284 	REG16(0x28c),
285 	REG16(0x288),
286 	REG16(0x284),
287 	REG16(0x280),
288 	REG16(0x27c),
289 	REG16(0x278),
290 	REG16(0x274),
291 	REG16(0x270),
292 
293 	NOP(13),
294 	LRI(1, 0),
295 	REG(0x0c8),
296 
297 	END
298 };
299 
300 static const u8 gen9_rcs_offsets[] = {
301 	NOP(1),
302 	LRI(14, POSTED),
303 	REG16(0x244),
304 	REG(0x34),
305 	REG(0x30),
306 	REG(0x38),
307 	REG(0x3c),
308 	REG(0x168),
309 	REG(0x140),
310 	REG(0x110),
311 	REG(0x11c),
312 	REG(0x114),
313 	REG(0x118),
314 	REG(0x1c0),
315 	REG(0x1c4),
316 	REG(0x1c8),
317 
318 	NOP(3),
319 	LRI(9, POSTED),
320 	REG16(0x3a8),
321 	REG16(0x28c),
322 	REG16(0x288),
323 	REG16(0x284),
324 	REG16(0x280),
325 	REG16(0x27c),
326 	REG16(0x278),
327 	REG16(0x274),
328 	REG16(0x270),
329 
330 	NOP(13),
331 	LRI(1, 0),
332 	REG(0xc8),
333 
334 	NOP(13),
335 	LRI(44, POSTED),
336 	REG(0x28),
337 	REG(0x9c),
338 	REG(0xc0),
339 	REG(0x178),
340 	REG(0x17c),
341 	REG16(0x358),
342 	REG(0x170),
343 	REG(0x150),
344 	REG(0x154),
345 	REG(0x158),
346 	REG16(0x41c),
347 	REG16(0x600),
348 	REG16(0x604),
349 	REG16(0x608),
350 	REG16(0x60c),
351 	REG16(0x610),
352 	REG16(0x614),
353 	REG16(0x618),
354 	REG16(0x61c),
355 	REG16(0x620),
356 	REG16(0x624),
357 	REG16(0x628),
358 	REG16(0x62c),
359 	REG16(0x630),
360 	REG16(0x634),
361 	REG16(0x638),
362 	REG16(0x63c),
363 	REG16(0x640),
364 	REG16(0x644),
365 	REG16(0x648),
366 	REG16(0x64c),
367 	REG16(0x650),
368 	REG16(0x654),
369 	REG16(0x658),
370 	REG16(0x65c),
371 	REG16(0x660),
372 	REG16(0x664),
373 	REG16(0x668),
374 	REG16(0x66c),
375 	REG16(0x670),
376 	REG16(0x674),
377 	REG16(0x678),
378 	REG16(0x67c),
379 	REG(0x68),
380 
381 	END
382 };
383 
384 static const u8 gen11_rcs_offsets[] = {
385 	NOP(1),
386 	LRI(15, POSTED),
387 	REG16(0x244),
388 	REG(0x034),
389 	REG(0x030),
390 	REG(0x038),
391 	REG(0x03c),
392 	REG(0x168),
393 	REG(0x140),
394 	REG(0x110),
395 	REG(0x11c),
396 	REG(0x114),
397 	REG(0x118),
398 	REG(0x1c0),
399 	REG(0x1c4),
400 	REG(0x1c8),
401 	REG(0x180),
402 
403 	NOP(1),
404 	LRI(9, POSTED),
405 	REG16(0x3a8),
406 	REG16(0x28c),
407 	REG16(0x288),
408 	REG16(0x284),
409 	REG16(0x280),
410 	REG16(0x27c),
411 	REG16(0x278),
412 	REG16(0x274),
413 	REG16(0x270),
414 
415 	LRI(1, POSTED),
416 	REG(0x1b0),
417 
418 	NOP(10),
419 	LRI(1, 0),
420 	REG(0x0c8),
421 
422 	END
423 };
424 
425 static const u8 gen12_rcs_offsets[] = {
426 	NOP(1),
427 	LRI(13, POSTED),
428 	REG16(0x244),
429 	REG(0x034),
430 	REG(0x030),
431 	REG(0x038),
432 	REG(0x03c),
433 	REG(0x168),
434 	REG(0x140),
435 	REG(0x110),
436 	REG(0x1c0),
437 	REG(0x1c4),
438 	REG(0x1c8),
439 	REG(0x180),
440 	REG16(0x2b4),
441 
442 	NOP(5),
443 	LRI(9, POSTED),
444 	REG16(0x3a8),
445 	REG16(0x28c),
446 	REG16(0x288),
447 	REG16(0x284),
448 	REG16(0x280),
449 	REG16(0x27c),
450 	REG16(0x278),
451 	REG16(0x274),
452 	REG16(0x270),
453 
454 	LRI(3, POSTED),
455 	REG(0x1b0),
456 	REG16(0x5a8),
457 	REG16(0x5ac),
458 
459 	NOP(6),
460 	LRI(1, 0),
461 	REG(0x0c8),
462 	NOP(3 + 9 + 1),
463 
464 	LRI(51, POSTED),
465 	REG16(0x588),
466 	REG16(0x588),
467 	REG16(0x588),
468 	REG16(0x588),
469 	REG16(0x588),
470 	REG16(0x588),
471 	REG(0x028),
472 	REG(0x09c),
473 	REG(0x0c0),
474 	REG(0x178),
475 	REG(0x17c),
476 	REG16(0x358),
477 	REG(0x170),
478 	REG(0x150),
479 	REG(0x154),
480 	REG(0x158),
481 	REG16(0x41c),
482 	REG16(0x600),
483 	REG16(0x604),
484 	REG16(0x608),
485 	REG16(0x60c),
486 	REG16(0x610),
487 	REG16(0x614),
488 	REG16(0x618),
489 	REG16(0x61c),
490 	REG16(0x620),
491 	REG16(0x624),
492 	REG16(0x628),
493 	REG16(0x62c),
494 	REG16(0x630),
495 	REG16(0x634),
496 	REG16(0x638),
497 	REG16(0x63c),
498 	REG16(0x640),
499 	REG16(0x644),
500 	REG16(0x648),
501 	REG16(0x64c),
502 	REG16(0x650),
503 	REG16(0x654),
504 	REG16(0x658),
505 	REG16(0x65c),
506 	REG16(0x660),
507 	REG16(0x664),
508 	REG16(0x668),
509 	REG16(0x66c),
510 	REG16(0x670),
511 	REG16(0x674),
512 	REG16(0x678),
513 	REG16(0x67c),
514 	REG(0x068),
515 	REG(0x084),
516 	NOP(1),
517 
518 	END
519 };
520 
521 static const u8 xehp_rcs_offsets[] = {
522 	NOP(1),
523 	LRI(13, POSTED),
524 	REG16(0x244),
525 	REG(0x034),
526 	REG(0x030),
527 	REG(0x038),
528 	REG(0x03c),
529 	REG(0x168),
530 	REG(0x140),
531 	REG(0x110),
532 	REG(0x1c0),
533 	REG(0x1c4),
534 	REG(0x1c8),
535 	REG(0x180),
536 	REG16(0x2b4),
537 
538 	NOP(5),
539 	LRI(9, POSTED),
540 	REG16(0x3a8),
541 	REG16(0x28c),
542 	REG16(0x288),
543 	REG16(0x284),
544 	REG16(0x280),
545 	REG16(0x27c),
546 	REG16(0x278),
547 	REG16(0x274),
548 	REG16(0x270),
549 
550 	LRI(3, POSTED),
551 	REG(0x1b0),
552 	REG16(0x5a8),
553 	REG16(0x5ac),
554 
555 	NOP(6),
556 	LRI(1, 0),
557 	REG(0x0c8),
558 
559 	END
560 };
561 
562 static const u8 dg2_rcs_offsets[] = {
563 	NOP(1),
564 	LRI(15, POSTED),
565 	REG16(0x244),
566 	REG(0x034),
567 	REG(0x030),
568 	REG(0x038),
569 	REG(0x03c),
570 	REG(0x168),
571 	REG(0x140),
572 	REG(0x110),
573 	REG(0x1c0),
574 	REG(0x1c4),
575 	REG(0x1c8),
576 	REG(0x180),
577 	REG16(0x2b4),
578 	REG(0x120),
579 	REG(0x124),
580 
581 	NOP(1),
582 	LRI(9, POSTED),
583 	REG16(0x3a8),
584 	REG16(0x28c),
585 	REG16(0x288),
586 	REG16(0x284),
587 	REG16(0x280),
588 	REG16(0x27c),
589 	REG16(0x278),
590 	REG16(0x274),
591 	REG16(0x270),
592 
593 	LRI(3, POSTED),
594 	REG(0x1b0),
595 	REG16(0x5a8),
596 	REG16(0x5ac),
597 
598 	NOP(6),
599 	LRI(1, 0),
600 	REG(0x0c8),
601 
602 	END
603 };
604 
605 #undef END
606 #undef REG16
607 #undef REG
608 #undef LRI
609 #undef NOP
610 
611 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
612 {
613 	/*
614 	 * The gen12+ lists only have the registers we program in the basic
615 	 * default state. We rely on the context image using relative
616 	 * addressing to automatic fixup the register state between the
617 	 * physical engines for virtual engine.
618 	 */
619 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
620 		   !intel_engine_has_relative_mmio(engine));
621 
622 	if (engine->class == RENDER_CLASS) {
623 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
624 			return dg2_rcs_offsets;
625 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
626 			return xehp_rcs_offsets;
627 		else if (GRAPHICS_VER(engine->i915) >= 12)
628 			return gen12_rcs_offsets;
629 		else if (GRAPHICS_VER(engine->i915) >= 11)
630 			return gen11_rcs_offsets;
631 		else if (GRAPHICS_VER(engine->i915) >= 9)
632 			return gen9_rcs_offsets;
633 		else
634 			return gen8_rcs_offsets;
635 	} else {
636 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
637 			return dg2_xcs_offsets;
638 		else if (GRAPHICS_VER(engine->i915) >= 12)
639 			return gen12_xcs_offsets;
640 		else if (GRAPHICS_VER(engine->i915) >= 9)
641 			return gen9_xcs_offsets;
642 		else
643 			return gen8_xcs_offsets;
644 	}
645 }
646 
647 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
648 {
649 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
650 		return 0x70;
651 	else if (GRAPHICS_VER(engine->i915) >= 12)
652 		return 0x60;
653 	else if (GRAPHICS_VER(engine->i915) >= 9)
654 		return 0x54;
655 	else if (engine->class == RENDER_CLASS)
656 		return 0x58;
657 	else
658 		return -1;
659 }
660 
661 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
662 {
663 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
664 		return 0x84;
665 	else if (GRAPHICS_VER(engine->i915) >= 12)
666 		return 0x74;
667 	else if (GRAPHICS_VER(engine->i915) >= 9)
668 		return 0x68;
669 	else if (engine->class == RENDER_CLASS)
670 		return 0xd8;
671 	else
672 		return -1;
673 }
674 
675 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
676 {
677 	if (GRAPHICS_VER(engine->i915) >= 12)
678 		return 0x12;
679 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
680 		return 0x18;
681 	else
682 		return -1;
683 }
684 
/*
 * Index of the indirect-context pointer entry: two dwords after the
 * per-context workaround batch entry. A negative result from
 * lrc_ring_wa_bb_per_ctx() is passed through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	const int wa_bb = lrc_ring_wa_bb_per_ctx(engine);

	return wa_bb < 0 ? wa_bb : wa_bb + 2;
}
695 
/*
 * Index of the indirect-context offset entry: two dwords after the
 * indirect-context pointer entry. A negative result from
 * lrc_ring_indirect_ptr() is passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	const int ptr = lrc_ring_indirect_ptr(engine);

	return ptr < 0 ? ptr : ptr + 2;
}
706 
707 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
708 {
709 
710 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
711 		/*
712 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
713 		 * simply to match the RCS context image layout.
714 		 */
715 		return 0xc6;
716 	else if (engine->class != RENDER_CLASS)
717 		return -1;
718 	else if (GRAPHICS_VER(engine->i915) >= 12)
719 		return 0xb6;
720 	else if (GRAPHICS_VER(engine->i915) >= 11)
721 		return 0xaa;
722 	else
723 		return -1;
724 }
725 
726 static u32
727 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
728 {
729 	switch (GRAPHICS_VER(engine->i915)) {
730 	default:
731 		MISSING_CASE(GRAPHICS_VER(engine->i915));
732 		fallthrough;
733 	case 12:
734 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
735 	case 11:
736 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
737 	case 9:
738 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
739 	case 8:
740 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
741 	}
742 }
743 
744 static void
745 lrc_setup_indirect_ctx(u32 *regs,
746 		       const struct intel_engine_cs *engine,
747 		       u32 ctx_bb_ggtt_addr,
748 		       u32 size)
749 {
750 	GEM_BUG_ON(!size);
751 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
752 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
753 	regs[lrc_ring_indirect_ptr(engine) + 1] =
754 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
755 
756 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
757 	regs[lrc_ring_indirect_offset(engine) + 1] =
758 		lrc_ring_indirect_offset_default(engine) << 6;
759 }
760 
761 static void init_common_regs(u32 * const regs,
762 			     const struct intel_context *ce,
763 			     const struct intel_engine_cs *engine,
764 			     bool inhibit)
765 {
766 	u32 ctl;
767 
768 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
769 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
770 	if (inhibit)
771 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
772 	if (GRAPHICS_VER(engine->i915) < 11)
773 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
774 					   CTX_CTRL_RS_CTX_ENABLE);
775 	regs[CTX_CONTEXT_CONTROL] = ctl;
776 
777 	regs[CTX_TIMESTAMP] = ce->runtime.last;
778 }
779 
780 static void init_wa_bb_regs(u32 * const regs,
781 			    const struct intel_engine_cs *engine)
782 {
783 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
784 
785 	if (wa_ctx->per_ctx.size) {
786 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
787 
788 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
789 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
790 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
791 	}
792 
793 	if (wa_ctx->indirect_ctx.size) {
794 		lrc_setup_indirect_ctx(regs, engine,
795 				       i915_ggtt_offset(wa_ctx->vma) +
796 				       wa_ctx->indirect_ctx.offset,
797 				       wa_ctx->indirect_ctx.size);
798 	}
799 }
800 
801 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
802 {
803 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
804 		/* 64b PPGTT (48bit canonical)
805 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
806 		 * other PDP Descriptors are ignored.
807 		 */
808 		ASSIGN_CTX_PML4(ppgtt, regs);
809 	} else {
810 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
811 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
812 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
813 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
814 	}
815 }
816 
817 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
818 {
819 	if (i915_is_ggtt(vm))
820 		return i915_vm_to_ggtt(vm)->alias;
821 	else
822 		return i915_vm_to_ppgtt(vm);
823 }
824 
825 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
826 {
827 	int x;
828 
829 	x = lrc_ring_mi_mode(engine);
830 	if (x != -1) {
831 		regs[x + 1] &= ~STOP_RING;
832 		regs[x + 1] |= STOP_RING << 16;
833 	}
834 }
835 
836 static void __lrc_init_regs(u32 *regs,
837 			    const struct intel_context *ce,
838 			    const struct intel_engine_cs *engine,
839 			    bool inhibit)
840 {
841 	/*
842 	 * A context is actually a big batch buffer with several
843 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
844 	 * values we are setting here are only for the first context restore:
845 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
846 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
847 	 * we are not initializing here).
848 	 *
849 	 * Must keep consistent with virtual_update_register_offsets().
850 	 */
851 
852 	if (inhibit)
853 		memset(regs, 0, PAGE_SIZE);
854 
855 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
856 
857 	init_common_regs(regs, ce, engine, inhibit);
858 	init_ppgtt_regs(regs, vm_alias(ce->vm));
859 
860 	init_wa_bb_regs(regs, engine);
861 
862 	__reset_stop_ring(regs, engine);
863 }
864 
865 void lrc_init_regs(const struct intel_context *ce,
866 		   const struct intel_engine_cs *engine,
867 		   bool inhibit)
868 {
869 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
870 }
871 
872 void lrc_reset_regs(const struct intel_context *ce,
873 		    const struct intel_engine_cs *engine)
874 {
875 	__reset_stop_ring(ce->lrc_reg_state, engine);
876 }
877 
878 static void
879 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
880 {
881 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
882 		return;
883 
884 	vaddr += engine->context_size;
885 
886 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
887 }
888 
889 static void
890 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
891 {
892 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
893 		return;
894 
895 	vaddr += engine->context_size;
896 
897 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
898 		drm_err_once(&engine->i915->drm,
899 			     "%s context redzone overwritten!\n",
900 			     engine->name);
901 }
902 
903 void lrc_init_state(struct intel_context *ce,
904 		    struct intel_engine_cs *engine,
905 		    void *state)
906 {
907 	bool inhibit = true;
908 
909 	set_redzone(state, engine);
910 
911 	if (engine->default_state) {
912 		shmem_read(engine->default_state, 0,
913 			   state, engine->context_size);
914 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
915 		inhibit = false;
916 	}
917 
918 	/* Clear the ppHWSP (inc. per-context counters) */
919 	memset(state, 0, PAGE_SIZE);
920 
921 	/*
922 	 * The second page of the context object contains some registers which
923 	 * must be set up prior to the first execution.
924 	 */
925 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
926 }
927 
928 static struct i915_vma *
929 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
930 {
931 	struct drm_i915_gem_object *obj;
932 	struct i915_vma *vma;
933 	u32 context_size;
934 
935 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
936 
937 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
938 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
939 
940 	if (GRAPHICS_VER(engine->i915) == 12) {
941 		ce->wa_bb_page = context_size / PAGE_SIZE;
942 		context_size += PAGE_SIZE;
943 	}
944 
945 	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
946 		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
947 		context_size += PARENT_SCRATCH_SIZE;
948 	}
949 
950 	obj = i915_gem_object_create_lmem(engine->i915, context_size,
951 					  I915_BO_ALLOC_PM_VOLATILE);
952 	if (IS_ERR(obj))
953 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
954 	if (IS_ERR(obj))
955 		return ERR_CAST(obj);
956 
957 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
958 	if (IS_ERR(vma)) {
959 		i915_gem_object_put(obj);
960 		return vma;
961 	}
962 
963 	return vma;
964 }
965 
966 static struct intel_timeline *
967 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
968 {
969 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
970 
971 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
972 }
973 
974 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
975 {
976 	struct intel_ring *ring;
977 	struct i915_vma *vma;
978 	int err;
979 
980 	GEM_BUG_ON(ce->state);
981 
982 	vma = __lrc_alloc_state(ce, engine);
983 	if (IS_ERR(vma))
984 		return PTR_ERR(vma);
985 
986 	ring = intel_engine_create_ring(engine, ce->ring_size);
987 	if (IS_ERR(ring)) {
988 		err = PTR_ERR(ring);
989 		goto err_vma;
990 	}
991 
992 	if (!page_mask_bits(ce->timeline)) {
993 		struct intel_timeline *tl;
994 
995 		/*
996 		 * Use the static global HWSP for the kernel context, and
997 		 * a dynamically allocated cacheline for everyone else.
998 		 */
999 		if (unlikely(ce->timeline))
1000 			tl = pinned_timeline(ce, engine);
1001 		else
1002 			tl = intel_timeline_create(engine->gt);
1003 		if (IS_ERR(tl)) {
1004 			err = PTR_ERR(tl);
1005 			goto err_ring;
1006 		}
1007 
1008 		ce->timeline = tl;
1009 	}
1010 
1011 	ce->ring = ring;
1012 	ce->state = vma;
1013 
1014 	return 0;
1015 
1016 err_ring:
1017 	intel_ring_put(ring);
1018 err_vma:
1019 	i915_vma_put(vma);
1020 	return err;
1021 }
1022 
1023 void lrc_reset(struct intel_context *ce)
1024 {
1025 	GEM_BUG_ON(!intel_context_is_pinned(ce));
1026 
1027 	intel_ring_reset(ce->ring, ce->ring->emit);
1028 
1029 	/* Scrub away the garbage */
1030 	lrc_init_regs(ce, ce->engine, true);
1031 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1032 }
1033 
1034 int
1035 lrc_pre_pin(struct intel_context *ce,
1036 	    struct intel_engine_cs *engine,
1037 	    struct i915_gem_ww_ctx *ww,
1038 	    void **vaddr)
1039 {
1040 	GEM_BUG_ON(!ce->state);
1041 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1042 
1043 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1044 					 i915_coherent_map_type(ce->engine->i915,
1045 								ce->state->obj,
1046 								false) |
1047 					 I915_MAP_OVERRIDE);
1048 
1049 	return PTR_ERR_OR_ZERO(*vaddr);
1050 }
1051 
1052 int
1053 lrc_pin(struct intel_context *ce,
1054 	struct intel_engine_cs *engine,
1055 	void *vaddr)
1056 {
1057 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1058 
1059 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1060 		lrc_init_state(ce, engine, vaddr);
1061 
1062 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1063 	return 0;
1064 }
1065 
1066 void lrc_unpin(struct intel_context *ce)
1067 {
1068 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1069 		      ce->engine);
1070 }
1071 
1072 void lrc_post_unpin(struct intel_context *ce)
1073 {
1074 	i915_gem_object_unpin_map(ce->state->obj);
1075 }
1076 
1077 void lrc_fini(struct intel_context *ce)
1078 {
1079 	if (!ce->state)
1080 		return;
1081 
1082 	intel_ring_put(fetch_and_zero(&ce->ring));
1083 	i915_vma_put(fetch_and_zero(&ce->state));
1084 }
1085 
1086 void lrc_destroy(struct kref *kref)
1087 {
1088 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1089 
1090 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1091 	GEM_BUG_ON(intel_context_is_pinned(ce));
1092 
1093 	lrc_fini(ce);
1094 
1095 	intel_context_fini(ce);
1096 	intel_context_free(ce);
1097 }
1098 
1099 static u32 *
1100 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1101 {
1102 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1103 		MI_SRM_LRM_GLOBAL_GTT |
1104 		MI_LRI_LRM_CS_MMIO;
1105 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1106 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1107 		CTX_TIMESTAMP * sizeof(u32);
1108 	*cs++ = 0;
1109 
1110 	*cs++ = MI_LOAD_REGISTER_REG |
1111 		MI_LRR_SOURCE_CS_MMIO |
1112 		MI_LRI_LRM_CS_MMIO;
1113 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1114 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1115 
1116 	*cs++ = MI_LOAD_REGISTER_REG |
1117 		MI_LRR_SOURCE_CS_MMIO |
1118 		MI_LRI_LRM_CS_MMIO;
1119 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1120 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1121 
1122 	return cs;
1123 }
1124 
1125 static u32 *
1126 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1127 {
1128 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1129 
1130 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1131 		MI_SRM_LRM_GLOBAL_GTT |
1132 		MI_LRI_LRM_CS_MMIO;
1133 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1134 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1135 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1136 	*cs++ = 0;
1137 
1138 	return cs;
1139 }
1140 
1141 static u32 *
1142 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1143 {
1144 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1145 
1146 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1147 		MI_SRM_LRM_GLOBAL_GTT |
1148 		MI_LRI_LRM_CS_MMIO;
1149 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1150 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1151 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1152 	*cs++ = 0;
1153 
1154 	*cs++ = MI_LOAD_REGISTER_REG |
1155 		MI_LRR_SOURCE_CS_MMIO |
1156 		MI_LRI_LRM_CS_MMIO;
1157 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1158 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1159 
1160 	return cs;
1161 }
1162 
1163 static u32 *
1164 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1165 {
1166 	cs = gen12_emit_timestamp_wa(ce, cs);
1167 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1168 	cs = gen12_emit_restore_scratch(ce, cs);
1169 
1170 	return cs;
1171 }
1172 
1173 static u32 *
1174 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1175 {
1176 	cs = gen12_emit_timestamp_wa(ce, cs);
1177 	cs = gen12_emit_restore_scratch(ce, cs);
1178 
1179 	return cs;
1180 }
1181 
1182 static u32 context_wa_bb_offset(const struct intel_context *ce)
1183 {
1184 	return PAGE_SIZE * ce->wa_bb_page;
1185 }
1186 
1187 static u32 *context_indirect_bb(const struct intel_context *ce)
1188 {
1189 	void *ptr;
1190 
1191 	GEM_BUG_ON(!ce->wa_bb_page);
1192 
1193 	ptr = ce->lrc_reg_state;
1194 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1195 	ptr += context_wa_bb_offset(ce);
1196 
1197 	return ptr;
1198 }
1199 
1200 static void
1201 setup_indirect_ctx_bb(const struct intel_context *ce,
1202 		      const struct intel_engine_cs *engine,
1203 		      u32 *(*emit)(const struct intel_context *, u32 *))
1204 {
1205 	u32 * const start = context_indirect_bb(ce);
1206 	u32 *cs;
1207 
1208 	cs = emit(ce, start);
1209 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1210 	while ((unsigned long)cs % CACHELINE_BYTES)
1211 		*cs++ = MI_NOOP;
1212 
1213 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1214 			       i915_ggtt_offset(ce->state) +
1215 			       context_wa_bb_offset(ce),
1216 			       (cs - start) * sizeof(*cs));
1217 }
1218 
1219 /*
1220  * The context descriptor encodes various attributes of a context,
1221  * including its GTT address and some flags. Because it's fairly
1222  * expensive to calculate, we'll just do it once and cache the result,
1223  * which remains valid until the context is unpinned.
1224  *
1225  * This is what a descriptor looks like, from LSB to MSB::
1226  *
1227  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1228  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1229  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1230  *      bits 53-54:    mbz, reserved for use by hardware
1231  *      bits 55-63:    group ID, currently unused and set to 0
1232  *
1233  * Starting from Gen11, the upper dword of the descriptor has a new format:
1234  *
1235  *      bits 32-36:    reserved
1236  *      bits 37-47:    SW context ID
1237  *      bits 48-53:    engine instance
1238  *      bit 54:        mbz, reserved for use by hardware
1239  *      bits 55-60:    SW counter
1240  *      bits 61-63:    engine class
1241  *
1242  * On Xe_HP, the upper dword of the descriptor has a new format:
1243  *
1244  *      bits 32-37:    virtual function number
1245  *      bit 38:        mbz, reserved for use by hardware
1246  *      bits 39-54:    SW context ID
1247  *      bits 55-57:    reserved
1248  *      bits 58-63:    SW counter
1249  *
1250  * engine info, SW context ID and SW counter need to form a unique number
1251  * (Context ID) per lrc.
1252  */
1253 static u32 lrc_descriptor(const struct intel_context *ce)
1254 {
1255 	u32 desc;
1256 
1257 	desc = INTEL_LEGACY_32B_CONTEXT;
1258 	if (i915_vm_is_4lvl(ce->vm))
1259 		desc = INTEL_LEGACY_64B_CONTEXT;
1260 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1261 
1262 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1263 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1264 		desc |= GEN8_CTX_L3LLC_COHERENT;
1265 
1266 	return i915_ggtt_offset(ce->state) | desc;
1267 }
1268 
1269 u32 lrc_update_regs(const struct intel_context *ce,
1270 		    const struct intel_engine_cs *engine,
1271 		    u32 head)
1272 {
1273 	struct intel_ring *ring = ce->ring;
1274 	u32 *regs = ce->lrc_reg_state;
1275 
1276 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1277 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1278 
1279 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1280 	regs[CTX_RING_HEAD] = head;
1281 	regs[CTX_RING_TAIL] = ring->tail;
1282 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1283 
1284 	/* RPCS */
1285 	if (engine->class == RENDER_CLASS) {
1286 		regs[CTX_R_PWR_CLK_STATE] =
1287 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1288 
1289 		i915_oa_init_reg_state(ce, engine);
1290 	}
1291 
1292 	if (ce->wa_bb_page) {
1293 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1294 
1295 		fn = gen12_emit_indirect_ctx_xcs;
1296 		if (ce->engine->class == RENDER_CLASS)
1297 			fn = gen12_emit_indirect_ctx_rcs;
1298 
1299 		/* Mutually exclusive wrt to global indirect bb */
1300 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1301 		setup_indirect_ctx_bb(ce, engine, fn);
1302 	}
1303 
1304 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1305 }
1306 
1307 void lrc_update_offsets(struct intel_context *ce,
1308 			struct intel_engine_cs *engine)
1309 {
1310 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1311 }
1312 
1313 void lrc_check_regs(const struct intel_context *ce,
1314 		    const struct intel_engine_cs *engine,
1315 		    const char *when)
1316 {
1317 	const struct intel_ring *ring = ce->ring;
1318 	u32 *regs = ce->lrc_reg_state;
1319 	bool valid = true;
1320 	int x;
1321 
1322 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1323 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1324 		       engine->name,
1325 		       regs[CTX_RING_START],
1326 		       i915_ggtt_offset(ring->vma));
1327 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1328 		valid = false;
1329 	}
1330 
1331 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1332 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1333 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1334 		       engine->name,
1335 		       regs[CTX_RING_CTL],
1336 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1337 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1338 		valid = false;
1339 	}
1340 
1341 	x = lrc_ring_mi_mode(engine);
1342 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1343 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1344 		       engine->name, regs[x + 1]);
1345 		regs[x + 1] &= ~STOP_RING;
1346 		regs[x + 1] |= STOP_RING << 16;
1347 		valid = false;
1348 	}
1349 
1350 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1351 }
1352 
1353 /*
1354  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1355  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1356  * but there is a slight complication as this is applied in WA batch where the
1357  * values are only initialized once so we cannot take register value at the
1358  * beginning and reuse it further; hence we save its value to memory, upload a
1359  * constant value with bit21 set and then we restore it back with the saved value.
1360  * To simplify the WA, a constant value is formed by using the default value
1361  * of this register. This shouldn't be a problem because we are only modifying
1362  * it for a short period and this batch in non-premptible. We can ofcourse
1363  * use additional instructions that read the actual value of the register
1364  * at that time and set our bit of interest but it makes the WA complicated.
1365  *
1366  * This WA is also required for Gen9 so extracting as a function avoids
1367  * code duplication.
1368  */
1369 static u32 *
1370 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1371 {
1372 	/* NB no one else is allowed to scribble over scratch + 256! */
1373 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1374 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1375 	*batch++ = intel_gt_scratch_offset(engine->gt,
1376 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1377 	*batch++ = 0;
1378 
1379 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1380 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1381 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1382 
1383 	batch = gen8_emit_pipe_control(batch,
1384 				       PIPE_CONTROL_CS_STALL |
1385 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1386 				       0);
1387 
1388 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1389 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1390 	*batch++ = intel_gt_scratch_offset(engine->gt,
1391 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1392 	*batch++ = 0;
1393 
1394 	return batch;
1395 }
1396 
1397 /*
1398  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1399  * initialized at the beginning and shared across all contexts but this field
1400  * helps us to have multiple batches at different offsets and select them based
1401  * on a criteria. At the moment this batch always start at the beginning of the page
1402  * and at this point we don't have multiple wa_ctx batch buffers.
1403  *
1404  * The number of WA applied are not known at the beginning; we use this field
1405  * to return the no of DWORDS written.
1406  *
1407  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1408  * so it adds NOOPs as padding to make it cacheline aligned.
1409  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1410  * makes a complete batch buffer.
1411  */
1412 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1413 {
1414 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1415 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1416 
1417 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1418 	if (IS_BROADWELL(engine->i915))
1419 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1420 
1421 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1422 	/* Actual scratch location is at 128 bytes offset */
1423 	batch = gen8_emit_pipe_control(batch,
1424 				       PIPE_CONTROL_FLUSH_L3 |
1425 				       PIPE_CONTROL_STORE_DATA_INDEX |
1426 				       PIPE_CONTROL_CS_STALL |
1427 				       PIPE_CONTROL_QW_WRITE,
1428 				       LRC_PPHWSP_SCRATCH_ADDR);
1429 
1430 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1431 
1432 	/* Pad to end of cacheline */
1433 	while ((unsigned long)batch % CACHELINE_BYTES)
1434 		*batch++ = MI_NOOP;
1435 
1436 	/*
1437 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1438 	 * execution depends on the length specified in terms of cache lines
1439 	 * in the register CTX_RCS_INDIRECT_CTX
1440 	 */
1441 
1442 	return batch;
1443 }
1444 
1445 struct lri {
1446 	i915_reg_t reg;
1447 	u32 value;
1448 };
1449 
1450 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1451 {
1452 	GEM_BUG_ON(!count || count > 63);
1453 
1454 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1455 	do {
1456 		*batch++ = i915_mmio_reg_offset(lri->reg);
1457 		*batch++ = lri->value;
1458 	} while (lri++, --count);
1459 	*batch++ = MI_NOOP;
1460 
1461 	return batch;
1462 }
1463 
1464 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1465 {
1466 	static const struct lri lri[] = {
1467 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1468 		{
1469 			COMMON_SLICE_CHICKEN2,
1470 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1471 				       0),
1472 		},
1473 
1474 		/* BSpec: 11391 */
1475 		{
1476 			FF_SLICE_CHICKEN,
1477 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1478 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1479 		},
1480 
1481 		/* BSpec: 11299 */
1482 		{
1483 			_3D_CHICKEN3,
1484 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1485 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1486 		}
1487 	};
1488 
1489 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1490 
1491 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1492 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1493 
1494 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1495 	batch = gen8_emit_pipe_control(batch,
1496 				       PIPE_CONTROL_FLUSH_L3 |
1497 				       PIPE_CONTROL_STORE_DATA_INDEX |
1498 				       PIPE_CONTROL_CS_STALL |
1499 				       PIPE_CONTROL_QW_WRITE,
1500 				       LRC_PPHWSP_SCRATCH_ADDR);
1501 
1502 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1503 
1504 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1505 	if (HAS_POOLED_EU(engine->i915)) {
1506 		/*
1507 		 * EU pool configuration is setup along with golden context
1508 		 * during context initialization. This value depends on
1509 		 * device type (2x6 or 3x6) and needs to be updated based
1510 		 * on which subslice is disabled especially for 2x6
1511 		 * devices, however it is safe to load default
1512 		 * configuration of 3x6 device instead of masking off
1513 		 * corresponding bits because HW ignores bits of a disabled
1514 		 * subslice and drops down to appropriate config. Please
1515 		 * see render_state_setup() in i915_gem_render_state.c for
1516 		 * possible configurations, to avoid duplication they are
1517 		 * not shown here again.
1518 		 */
1519 		*batch++ = GEN9_MEDIA_POOL_STATE;
1520 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1521 		*batch++ = 0x00777000;
1522 		*batch++ = 0;
1523 		*batch++ = 0;
1524 		*batch++ = 0;
1525 	}
1526 
1527 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1528 
1529 	/* Pad to end of cacheline */
1530 	while ((unsigned long)batch % CACHELINE_BYTES)
1531 		*batch++ = MI_NOOP;
1532 
1533 	return batch;
1534 }
1535 
1536 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1537 
1538 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1539 {
1540 	struct drm_i915_gem_object *obj;
1541 	struct i915_vma *vma;
1542 	int err;
1543 
1544 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1545 	if (IS_ERR(obj))
1546 		return PTR_ERR(obj);
1547 
1548 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1549 	if (IS_ERR(vma)) {
1550 		err = PTR_ERR(vma);
1551 		goto err;
1552 	}
1553 
1554 	engine->wa_ctx.vma = vma;
1555 	return 0;
1556 
1557 err:
1558 	i915_gem_object_put(obj);
1559 	return err;
1560 }
1561 
1562 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1563 {
1564 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1565 }
1566 
1567 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1568 
1569 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1570 {
1571 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1572 	struct i915_wa_ctx_bb *wa_bb[] = {
1573 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1574 	};
1575 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1576 	struct i915_gem_ww_ctx ww;
1577 	void *batch, *batch_ptr;
1578 	unsigned int i;
1579 	int err;
1580 
1581 	if (engine->class != RENDER_CLASS)
1582 		return;
1583 
1584 	switch (GRAPHICS_VER(engine->i915)) {
1585 	case 12:
1586 	case 11:
1587 		return;
1588 	case 9:
1589 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1590 		wa_bb_fn[1] = NULL;
1591 		break;
1592 	case 8:
1593 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1594 		wa_bb_fn[1] = NULL;
1595 		break;
1596 	default:
1597 		MISSING_CASE(GRAPHICS_VER(engine->i915));
1598 		return;
1599 	}
1600 
1601 	err = lrc_create_wa_ctx(engine);
1602 	if (err) {
1603 		/*
1604 		 * We continue even if we fail to initialize WA batch
1605 		 * because we only expect rare glitches but nothing
1606 		 * critical to prevent us from using GPU
1607 		 */
1608 		drm_err(&engine->i915->drm,
1609 			"Ignoring context switch w/a allocation error:%d\n",
1610 			err);
1611 		return;
1612 	}
1613 
1614 	if (!engine->wa_ctx.vma)
1615 		return;
1616 
1617 	i915_gem_ww_ctx_init(&ww, true);
1618 retry:
1619 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1620 	if (!err)
1621 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1622 	if (err)
1623 		goto err;
1624 
1625 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1626 	if (IS_ERR(batch)) {
1627 		err = PTR_ERR(batch);
1628 		goto err_unpin;
1629 	}
1630 
1631 	/*
1632 	 * Emit the two workaround batch buffers, recording the offset from the
1633 	 * start of the workaround batch buffer object for each and their
1634 	 * respective sizes.
1635 	 */
1636 	batch_ptr = batch;
1637 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1638 		wa_bb[i]->offset = batch_ptr - batch;
1639 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1640 						  CACHELINE_BYTES))) {
1641 			err = -EINVAL;
1642 			break;
1643 		}
1644 		if (wa_bb_fn[i])
1645 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1646 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1647 	}
1648 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1649 
1650 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1651 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1652 
1653 	/* Verify that we can handle failure to setup the wa_ctx */
1654 	if (!err)
1655 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1656 
1657 err_unpin:
1658 	if (err)
1659 		i915_vma_unpin(wa_ctx->vma);
1660 err:
1661 	if (err == -EDEADLK) {
1662 		err = i915_gem_ww_ctx_backoff(&ww);
1663 		if (!err)
1664 			goto retry;
1665 	}
1666 	i915_gem_ww_ctx_fini(&ww);
1667 
1668 	if (err) {
1669 		i915_vma_put(engine->wa_ctx.vma);
1670 
1671 		/* Clear all flags to prevent further use */
1672 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1673 	}
1674 }
1675 
1676 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1677 {
1678 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1679 	ce->runtime.num_underflow++;
1680 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1681 #endif
1682 }
1683 
1684 void lrc_update_runtime(struct intel_context *ce)
1685 {
1686 	u32 old;
1687 	s32 dt;
1688 
1689 	if (intel_context_is_barrier(ce))
1690 		return;
1691 
1692 	old = ce->runtime.last;
1693 	ce->runtime.last = lrc_get_runtime(ce);
1694 	dt = ce->runtime.last - old;
1695 
1696 	if (unlikely(dt < 0)) {
1697 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1698 			 old, ce->runtime.last, dt);
1699 		st_update_runtime_underflow(ce, dt);
1700 		return;
1701 	}
1702 
1703 	ewma_runtime_add(&ce->runtime.avg, dt);
1704 	ce->runtime.total += dt;
1705 }
1706 
1707 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1708 #include "selftest_lrc.c"
1709 #endif
1710