xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision ae4b0eac)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
/*
 * Expand a compact, byte-encoded table of context-image register offsets
 * (gen8_xcs_offsets[] and friends below) into MI_LOAD_REGISTER_IMM headers
 * plus register addresses in @regs.
 *
 * Stream encoding, one byte at a time:
 *  - NOP(x): bit7 set, low bits = number of dwords to skip in @regs
 *  - LRI(count, flags): header for a block of @count registers
 *  - each register offset then follows as a 7-bits-per-byte varint
 *    (bit7 = continuation), in dword units relative to @engine->mmio_base
 *  - END (0) terminates the stream
 *
 * Only the register address slots are written; the value slots are left
 * untouched (regs is advanced by 2 per register).  If @close is set, an
 * MI_BATCH_BUFFER_END is appended after the stream.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI header: low 6 bits = register count, top 2 = flags */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/* decode the 7-bits-per-byte varint offset */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2; /* skip over the value slot */
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}
77 
/*
 * Per-platform context-image register layouts, encoded for set_offsets()
 * with the NOP()/LRI()/REG()/REG16() macros above.  The entries must
 * match the HW context image exactly; do not reorder or "clean up".
 *
 * gen8, non-render (xcs) engines.
 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};
112 
/* gen9/gen11, non-render (xcs) engines; LRIs are posted writes. */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};
196 
/* gen12, non-render (xcs) engines. */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
228 
/* DG2 (Xe_HPG and later, IP_VER >= 12.55), non-render (xcs) engines. */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
262 
/* gen8, render (rcs) engine. */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};
299 
/* gen9, render (rcs) engine. */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};
383 
/* gen11, render (rcs) engine. */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};
424 
/* gen12, render (rcs) engine. */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	/*
	 * NOTE(review): 0x588 is listed six times in a row — presumably
	 * dummy/repeated slots in the HW image; confirm against bspec
	 * before touching.
	 */
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};
520 
/* Xe_HP (IP_VER 12.50+), render (rcs) engine. */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
561 
/* DG2 (IP_VER 12.55+), render (rcs) engine. */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* The encoding macros are local to the tables above. */
#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP
610 
611 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
612 {
613 	/*
614 	 * The gen12+ lists only have the registers we program in the basic
615 	 * default state. We rely on the context image using relative
616 	 * addressing to automatic fixup the register state between the
617 	 * physical engines for virtual engine.
618 	 */
619 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
620 		   !intel_engine_has_relative_mmio(engine));
621 
622 	if (engine->class == RENDER_CLASS) {
623 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
624 			return dg2_rcs_offsets;
625 		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
626 			return xehp_rcs_offsets;
627 		else if (GRAPHICS_VER(engine->i915) >= 12)
628 			return gen12_rcs_offsets;
629 		else if (GRAPHICS_VER(engine->i915) >= 11)
630 			return gen11_rcs_offsets;
631 		else if (GRAPHICS_VER(engine->i915) >= 9)
632 			return gen9_rcs_offsets;
633 		else
634 			return gen8_rcs_offsets;
635 	} else {
636 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
637 			return dg2_xcs_offsets;
638 		else if (GRAPHICS_VER(engine->i915) >= 12)
639 			return gen12_xcs_offsets;
640 		else if (GRAPHICS_VER(engine->i915) >= 9)
641 			return gen9_xcs_offsets;
642 		else
643 			return gen8_xcs_offsets;
644 	}
645 }
646 
647 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
648 {
649 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
650 		return 0x70;
651 	else if (GRAPHICS_VER(engine->i915) >= 12)
652 		return 0x60;
653 	else if (GRAPHICS_VER(engine->i915) >= 9)
654 		return 0x54;
655 	else if (engine->class == RENDER_CLASS)
656 		return 0x58;
657 	else
658 		return -1;
659 }
660 
661 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
662 {
663 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
664 		return 0x84;
665 	else if (GRAPHICS_VER(engine->i915) >= 12)
666 		return 0x74;
667 	else if (GRAPHICS_VER(engine->i915) >= 9)
668 		return 0x68;
669 	else if (engine->class == RENDER_CLASS)
670 		return 0xd8;
671 	else
672 		return -1;
673 }
674 
675 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
676 {
677 	if (GRAPHICS_VER(engine->i915) >= 12)
678 		return 0x12;
679 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
680 		return 0x18;
681 	else
682 		return -1;
683 }
684 
/*
 * Dword index of the INDIRECT_CTX pointer: two dwords past the
 * per-context wa_bb slot, or -1 when that slot does not exist.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int base = lrc_ring_wa_bb_per_ctx(engine);

	return base < 0 ? base : base + 2;
}
695 
/*
 * Dword index of the INDIRECT_CTX_OFFSET field: two dwords past the
 * INDIRECT_CTX pointer, or -1 when unavailable.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int base = lrc_ring_indirect_ptr(engine);

	return base < 0 ? base : base + 2;
}
706 
707 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
708 {
709 
710 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
711 		/*
712 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
713 		 * simply to match the RCS context image layout.
714 		 */
715 		return 0xc6;
716 	else if (engine->class != RENDER_CLASS)
717 		return -1;
718 	else if (GRAPHICS_VER(engine->i915) >= 12)
719 		return 0xb6;
720 	else if (GRAPHICS_VER(engine->i915) >= 11)
721 		return 0xaa;
722 	else
723 		return -1;
724 }
725 
726 static u32
727 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
728 {
729 	switch (GRAPHICS_VER(engine->i915)) {
730 	default:
731 		MISSING_CASE(GRAPHICS_VER(engine->i915));
732 		fallthrough;
733 	case 12:
734 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
735 	case 11:
736 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
737 	case 9:
738 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
739 	case 8:
740 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
741 	}
742 }
743 
744 static void
745 lrc_setup_indirect_ctx(u32 *regs,
746 		       const struct intel_engine_cs *engine,
747 		       u32 ctx_bb_ggtt_addr,
748 		       u32 size)
749 {
750 	GEM_BUG_ON(!size);
751 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
752 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
753 	regs[lrc_ring_indirect_ptr(engine) + 1] =
754 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
755 
756 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
757 	regs[lrc_ring_indirect_offset(engine) + 1] =
758 		lrc_ring_indirect_offset_default(engine) << 6;
759 }
760 
761 static void init_common_regs(u32 * const regs,
762 			     const struct intel_context *ce,
763 			     const struct intel_engine_cs *engine,
764 			     bool inhibit)
765 {
766 	u32 ctl;
767 
768 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
769 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
770 	if (inhibit)
771 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
772 	if (GRAPHICS_VER(engine->i915) < 11)
773 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
774 					   CTX_CTRL_RS_CTX_ENABLE);
775 	regs[CTX_CONTEXT_CONTROL] = ctl;
776 
777 	regs[CTX_TIMESTAMP] = ce->runtime.last;
778 }
779 
780 static void init_wa_bb_regs(u32 * const regs,
781 			    const struct intel_engine_cs *engine)
782 {
783 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
784 
785 	if (wa_ctx->per_ctx.size) {
786 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
787 
788 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
789 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
790 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
791 	}
792 
793 	if (wa_ctx->indirect_ctx.size) {
794 		lrc_setup_indirect_ctx(regs, engine,
795 				       i915_ggtt_offset(wa_ctx->vma) +
796 				       wa_ctx->indirect_ctx.offset,
797 				       wa_ctx->indirect_ctx.size);
798 	}
799 }
800 
/*
 * Write the PPGTT page-directory pointers into the register state.
 * With a 4-level PPGTT only PDP0 is meaningful (it holds the PML4
 * base); otherwise all four PDP descriptors are programmed.
 * Note: ASSIGN_CTX_PDP takes a literal index (macro token pasting),
 * hence the unrolled calls.
 */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}
816 
817 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
818 {
819 	if (i915_is_ggtt(vm))
820 		return i915_vm_to_ggtt(vm)->alias;
821 	else
822 		return i915_vm_to_ppgtt(vm);
823 }
824 
825 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
826 {
827 	int x;
828 
829 	x = lrc_ring_mi_mode(engine);
830 	if (x != -1) {
831 		regs[x + 1] &= ~STOP_RING;
832 		regs[x + 1] |= STOP_RING << 16;
833 	}
834 }
835 
/*
 * Fully (re)build the register state page of a context image.
 * Helpers must run in this order: offsets first, then the per-register
 * values that assume the LRI skeleton is already in place.
 */
static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	/* No valid prior state to restore: start from a clean page. */
	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}
864 
/* Initialise the register state page of @ce's context image. */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}
871 
/* Clear any pending STOP_RING request left in @ce's context image. */
void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}
877 
878 static void
879 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
880 {
881 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
882 		return;
883 
884 	vaddr += engine->context_size;
885 
886 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
887 }
888 
889 static void
890 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
891 {
892 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
893 		return;
894 
895 	vaddr += engine->context_size;
896 
897 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
898 		drm_err_once(&engine->i915->drm,
899 			     "%s context redzone overwritten!\n",
900 			     engine->name);
901 }
902 
/*
 * Populate a freshly-mapped context image: poison the redzone, copy in
 * the engine's recorded default ("golden") state if available, scrub the
 * per-process HWSP page and initialise the register state page.
 * Ordering matters: the ppHWSP clear must come after the default-state
 * copy so the counters start from zero.
 */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		/* default state provides a valid image to restore from */
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}
927 
928 static struct i915_vma *
929 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
930 {
931 	struct drm_i915_gem_object *obj;
932 	struct i915_vma *vma;
933 	u32 context_size;
934 
935 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
936 
937 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
938 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
939 
940 	if (GRAPHICS_VER(engine->i915) == 12) {
941 		ce->wa_bb_page = context_size / PAGE_SIZE;
942 		context_size += PAGE_SIZE;
943 	}
944 
945 	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
946 	if (IS_ERR(obj))
947 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
948 	if (IS_ERR(obj))
949 		return ERR_CAST(obj);
950 
951 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
952 	if (IS_ERR(vma)) {
953 		i915_gem_object_put(obj);
954 		return vma;
955 	}
956 
957 	return vma;
958 }
959 
/*
 * Consume the timeline hint stashed in ce->timeline and create a fresh
 * timeline on @engine.  NOTE(review): the stashed pointer appears to
 * carry flag bits in its low bits (hence page_unmask_bits()) — confirm
 * against intel_timeline_create_from_engine()'s expectations.
 */
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}
967 
/*
 * Allocate the state vma, ring and (if not already provided) timeline
 * backing @ce.  Must only be called once per context (ce->state == NULL).
 * Returns 0 on success or a negative error code; partially acquired
 * resources are released on the unwind path.
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	/* Publish only once everything has been successfully created. */
	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}
1016 
/*
 * Scrub the image of a pinned context back to defaults and refresh the
 * ring registers, forcing a full restore on the next submission.
 */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}
1027 
/*
 * Map the context state object into the kernel address space ahead of
 * pinning, using the coherency-appropriate mapping type for the object.
 * Returns 0 on success or a negative error from the map.
 */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}
1045 
/*
 * Complete pinning: record the register-state pointer, initialise the
 * image exactly once per context (guarded by CONTEXT_INIT_BIT) and
 * refresh the descriptor/ring registers.  Always returns 0.
 */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}
1059 
/* On unpin, verify the debug redzone after the context image is intact. */
void lrc_unpin(struct intel_context *ce)
{
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}
1065 
/* Drop the kernel mapping taken in lrc_pre_pin(). */
void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}
1070 
/*
 * Release the ring and state backing @ce.  Safe to call on a context
 * that never completed lrc_alloc() (ce->state == NULL).
 */
void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}
1079 
/*
 * Final kref release callback for an LRC-backed intel_context; the
 * context must already be idle and unpinned.
 */
void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}
1092 
/*
 * Workaround: load the saved CTX_TIMESTAMP from the context image into
 * GPR0 (via the GGTT), then copy GPR0 back into RING_CTX_TIMESTAMP.
 * The register-to-register move is emitted twice — NOTE(review):
 * presumably required by the workaround; confirm before simplifying.
 */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	/* GPR0 <- context image's CTX_TIMESTAMP slot */
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	/* RING_CTX_TIMESTAMP <- GPR0 */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}
1118 
/*
 * Restore GPR0 (used as scratch by the WA batch) from its saved slot in
 * the context image.
 */
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}
1134 
/*
 * Workaround: reload RING_CMD_BUF_CCTL from its value stored in the
 * context image, using GPR0 as scratch.
 */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	/* GPR0 <- context image's CMD_BUF_CCTL value slot */
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	/* RING_CMD_BUF_CCTL <- GPR0 */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}
1156 
/* Indirect-context workaround batch for gen12 render engines. */
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}
1166 
/* Indirect-context workaround batch for gen12 non-render engines. */
static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}
1175 
1176 static u32 context_wa_bb_offset(const struct intel_context *ce)
1177 {
1178 	return PAGE_SIZE * ce->wa_bb_page;
1179 }
1180 
1181 static u32 *context_indirect_bb(const struct intel_context *ce)
1182 {
1183 	void *ptr;
1184 
1185 	GEM_BUG_ON(!ce->wa_bb_page);
1186 
1187 	ptr = ce->lrc_reg_state;
1188 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1189 	ptr += context_wa_bb_offset(ce);
1190 
1191 	return ptr;
1192 }
1193 
/*
 * Emit the per-context indirect batch via @emit into the wa_bb page,
 * pad it to a cacheline boundary with MI_NOOPs (the size programmed
 * into the context image is in cacheline units) and point the context
 * image at it.
 */
static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}
1212 
1213 /*
1214  * The context descriptor encodes various attributes of a context,
1215  * including its GTT address and some flags. Because it's fairly
1216  * expensive to calculate, we'll just do it once and cache the result,
1217  * which remains valid until the context is unpinned.
1218  *
1219  * This is what a descriptor looks like, from LSB to MSB::
1220  *
1221  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1222  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1223  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1224  *      bits 53-54:    mbz, reserved for use by hardware
1225  *      bits 55-63:    group ID, currently unused and set to 0
1226  *
1227  * Starting from Gen11, the upper dword of the descriptor has a new format:
1228  *
1229  *      bits 32-36:    reserved
1230  *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
1232  *      bit 54:        mbz, reserved for use by hardware
1233  *      bits 55-60:    SW counter
1234  *      bits 61-63:    engine class
1235  *
1236  * On Xe_HP, the upper dword of the descriptor has a new format:
1237  *
1238  *      bits 32-37:    virtual function number
1239  *      bit 38:        mbz, reserved for use by hardware
1240  *      bits 39-54:    SW context ID
1241  *      bits 55-57:    reserved
1242  *      bits 58-63:    SW counter
1243  *
1244  * engine info, SW context ID and SW counter need to form a unique number
1245  * (Context ID) per lrc.
1246  */
1247 static u32 lrc_descriptor(const struct intel_context *ce)
1248 {
1249 	u32 desc;
1250 
1251 	desc = INTEL_LEGACY_32B_CONTEXT;
1252 	if (i915_vm_is_4lvl(ce->vm))
1253 		desc = INTEL_LEGACY_64B_CONTEXT;
1254 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1255 
1256 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1257 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1258 		desc |= GEN8_CTX_L3LLC_COHERENT;
1259 
1260 	return i915_ggtt_offset(ce->state) | desc;
1261 }
1262 
/*
 * Refresh the volatile parts of the register state (ring registers,
 * RPCS/OA state for render, the per-context indirect wa_bb if present)
 * and return the context descriptor with CTX_DESC_FORCE_RESTORE set so
 * the HW reloads the image on next submission.
 */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}
1300 
/* Rewrite the register address entries in @ce's image for @engine. */
void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}
1306 
/*
 * Sanity-check the software-maintained fields of the context image
 * around submission, repairing any mismatch in place; @when is used
 * only for the (once-only) warning message.
 */
void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	/* RING_CTL may legitimately carry the HW wait bits; mask them off. */
	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}
1346 
1347 /*
1348  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1349  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1350  * but there is a slight complication as this is applied in WA batch where the
1351  * values are only initialized once so we cannot take register value at the
1352  * beginning and reuse it further; hence we save its value to memory, upload a
1353  * constant value with bit21 set and then we restore it back with the saved value.
1354  * To simplify the WA, a constant value is formed by using the default value
1355  * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
1357  * use additional instructions that read the actual value of the register
1358  * at that time and set our bit of interest but it makes the WA complicated.
1359  *
1360  * This WA is also required for Gen9 so extracting as a function avoids
1361  * code duplication.
1362  */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	/* Save the live GEN8_L3SQCREG4 value to per-gt scratch memory (SRM) */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	/* Load the register's default value with the flush bit set (LRI) */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	/* Stall and perform the flush while the bit is set */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	/* Restore the saved value from scratch memory (LRM) */
	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}
1390 
1391 /*
1392  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1393  * initialized at the beginning and shared across all contexts but this field
1394  * helps us to have multiple batches at different offsets and select them based
 * on a criterion. At the moment this batch always starts at the beginning of the page
1396  * and at this point we don't have multiple wa_ctx batch buffers.
1397  *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDS written.
1400  *
1401  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1402  * so it adds NOOPs as padding to make it cacheline aligned.
1403  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1404  * makes a complete batch buffer.
1405  */
/* Emit the gen8 (bdw/chv) indirect-ctx workaround batch; see comment above. */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	/* Keep the whole WA sequence non-preemptible */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	/* Re-enable preemption arbitration now the WAs are applied */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}
1438 
/* One register/value pair for emission via MI_LOAD_REGISTER_IMM */
struct lri {
	i915_reg_t reg;
	u32 value;
};
1443 
1444 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1445 {
1446 	GEM_BUG_ON(!count || count > 63);
1447 
1448 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1449 	do {
1450 		*batch++ = i915_mmio_reg_offset(lri->reg);
1451 		*batch++ = lri->value;
1452 	} while (lri++, --count);
1453 	*batch++ = MI_NOOP;
1454 
1455 	return batch;
1456 }
1457 
/* Emit the gen9 (skl/bxt/kbl/glk/cfl) indirect-ctx workaround batch. */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* Register writes applied in one LRI burst below */
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	/* Keep the whole WA sequence non-preemptible */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	/* Re-enable preemption arbitration now the WAs are applied */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}
1529 
1530 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1531 
1532 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1533 {
1534 	struct drm_i915_gem_object *obj;
1535 	struct i915_vma *vma;
1536 	int err;
1537 
1538 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1539 	if (IS_ERR(obj))
1540 		return PTR_ERR(obj);
1541 
1542 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1543 	if (IS_ERR(vma)) {
1544 		err = PTR_ERR(vma);
1545 		goto err;
1546 	}
1547 
1548 	engine->wa_ctx.vma = vma;
1549 	return 0;
1550 
1551 err:
1552 	i915_gem_object_put(obj);
1553 	return err;
1554 }
1555 
/* Unpin and release the workaround-context vma set up by lrc_init_wa_ctx() */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}
1560 
/* Emits a workaround batch into @batch, returning the advanced pointer */
typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1562 
/*
 * lrc_init_wa_ctx - allocate and populate the per-engine workaround batches
 *
 * For gen8/gen9 render engines, allocate the shared wa_ctx buffer and emit
 * the indirect-ctx workaround batch into it (the per_ctx slot is currently
 * unused), recording each batch's offset and size in engine->wa_ctx.
 * Failure is deliberately non-fatal: the wa_ctx state is cleared and the
 * engine runs without the workarounds.
 */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	/* Only the render engine carries these context workarounds */
	if (engine->class != RENDER_CLASS)
		return;

	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		/* No wa_ctx batches are emitted for gen11/gen12 */
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	/* NOTE(review): defensive; lrc_create_wa_ctx() sets the vma on success */
	if (!engine->wa_ctx.vma)
		return;

	/* ww transaction: lock the object and pin it high in the GGTT */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		/* Each batch must start on a cacheline boundary */
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	/* A ww deadlock is resolved by backing off and restarting */
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}
1669 
/*
 * Record a runtime-underflow event for the selftests; @dt is the (negative)
 * delta, so -dt is its magnitude.  Compiles to a no-op without
 * CONFIG_DRM_I915_SELFTEST.
 */
static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}
1677 
1678 void lrc_update_runtime(struct intel_context *ce)
1679 {
1680 	u32 old;
1681 	s32 dt;
1682 
1683 	if (intel_context_is_barrier(ce))
1684 		return;
1685 
1686 	old = ce->runtime.last;
1687 	ce->runtime.last = lrc_get_runtime(ce);
1688 	dt = ce->runtime.last - old;
1689 
1690 	if (unlikely(dt < 0)) {
1691 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1692 			 old, ce->runtime.last, dt);
1693 		st_update_runtime_underflow(ce, dt);
1694 		return;
1695 	}
1696 
1697 	ewma_runtime_add(&ce->runtime.avg, dt);
1698 	ce->runtime.total += dt;
1699 }
1700 
1701 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1702 #include "selftest_lrc.c"
1703 #endif
1704