xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision fa85bfd1)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
19 static void set_offsets(u32 *regs,
20 			const u8 *data,
21 			const struct intel_engine_cs *engine,
22 			bool close)
23 #define NOP(x) (BIT(7) | (x))
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
27 #define REG16(x) \
28 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29 	(((x) >> 2) & 0x7f)
30 #define END 0
31 {
32 	const u32 base = engine->mmio_base;
33 
34 	while (*data) {
35 		u8 count, flags;
36 
37 		if (*data & BIT(7)) { /* skip */
38 			count = *data++ & ~BIT(7);
39 			regs += count;
40 			continue;
41 		}
42 
43 		count = *data & 0x3f;
44 		flags = *data >> 6;
45 		data++;
46 
47 		*regs = MI_LOAD_REGISTER_IMM(count);
48 		if (flags & POSTED)
49 			*regs |= MI_LRI_FORCE_POSTED;
50 		if (INTEL_GEN(engine->i915) >= 11)
51 			*regs |= MI_LRI_LRM_CS_MMIO;
52 		regs++;
53 
54 		GEM_BUG_ON(!count);
55 		do {
56 			u32 offset = 0;
57 			u8 v;
58 
59 			do {
60 				v = *data++;
61 				offset <<= 7;
62 				offset |= v & ~BIT(7);
63 			} while (v & BIT(7));
64 
65 			regs[0] = base + (offset << 2);
66 			regs += 2;
67 		} while (--count);
68 	}
69 
70 	if (close) {
71 		/* Close the batch; used mainly by live_lrc_layout() */
72 		*regs = MI_BATCH_BUFFER_END;
73 		if (INTEL_GEN(engine->i915) >= 10)
74 			*regs |= BIT(0);
75 	}
76 }
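
/*
 * Worked example of the encoding above (illustrative only): REG16(0x244)
 * expands to the two bytes 0x81, 0x11. The inner decode loop rebuilds the
 * offset 7 bits at a time, using BIT(7) as a continuation flag:
 *
 *   v = 0x81: offset = 0x01, continue (BIT(7) set)
 *   v = 0x11: offset = (0x01 << 7) | 0x11 = 0x91, stop
 *
 *   regs[0] = mmio_base + (0x91 << 2) = mmio_base + 0x244
 *
 * A single-byte REG(x) covers register offsets below 0x200, REG16(x)
 * covers offsets below 0x10000, NOP(x) skips x dwords of the image, and
 * each LRI(count, flags) entry emits one MI_LOAD_REGISTER_IMM(count)
 * header before its register offsets. Only the offsets are written here;
 * the value dwords are left to the default context image (or to zero when
 * the restore is inhibited).
 */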
77 
78 static const u8 gen8_xcs_offsets[] = {
79 	NOP(1),
80 	LRI(11, 0),
81 	REG16(0x244),
82 	REG(0x034),
83 	REG(0x030),
84 	REG(0x038),
85 	REG(0x03c),
86 	REG(0x168),
87 	REG(0x140),
88 	REG(0x110),
89 	REG(0x11c),
90 	REG(0x114),
91 	REG(0x118),
92 
93 	NOP(9),
94 	LRI(9, 0),
95 	REG16(0x3a8),
96 	REG16(0x28c),
97 	REG16(0x288),
98 	REG16(0x284),
99 	REG16(0x280),
100 	REG16(0x27c),
101 	REG16(0x278),
102 	REG16(0x274),
103 	REG16(0x270),
104 
105 	NOP(13),
106 	LRI(2, 0),
107 	REG16(0x200),
108 	REG(0x028),
109 
110 	END
111 };
112 
113 static const u8 gen9_xcs_offsets[] = {
114 	NOP(1),
115 	LRI(14, POSTED),
116 	REG16(0x244),
117 	REG(0x034),
118 	REG(0x030),
119 	REG(0x038),
120 	REG(0x03c),
121 	REG(0x168),
122 	REG(0x140),
123 	REG(0x110),
124 	REG(0x11c),
125 	REG(0x114),
126 	REG(0x118),
127 	REG(0x1c0),
128 	REG(0x1c4),
129 	REG(0x1c8),
130 
131 	NOP(3),
132 	LRI(9, POSTED),
133 	REG16(0x3a8),
134 	REG16(0x28c),
135 	REG16(0x288),
136 	REG16(0x284),
137 	REG16(0x280),
138 	REG16(0x27c),
139 	REG16(0x278),
140 	REG16(0x274),
141 	REG16(0x270),
142 
143 	NOP(13),
144 	LRI(1, POSTED),
145 	REG16(0x200),
146 
147 	NOP(13),
148 	LRI(44, POSTED),
149 	REG(0x028),
150 	REG(0x09c),
151 	REG(0x0c0),
152 	REG(0x178),
153 	REG(0x17c),
154 	REG16(0x358),
155 	REG(0x170),
156 	REG(0x150),
157 	REG(0x154),
158 	REG(0x158),
159 	REG16(0x41c),
160 	REG16(0x600),
161 	REG16(0x604),
162 	REG16(0x608),
163 	REG16(0x60c),
164 	REG16(0x610),
165 	REG16(0x614),
166 	REG16(0x618),
167 	REG16(0x61c),
168 	REG16(0x620),
169 	REG16(0x624),
170 	REG16(0x628),
171 	REG16(0x62c),
172 	REG16(0x630),
173 	REG16(0x634),
174 	REG16(0x638),
175 	REG16(0x63c),
176 	REG16(0x640),
177 	REG16(0x644),
178 	REG16(0x648),
179 	REG16(0x64c),
180 	REG16(0x650),
181 	REG16(0x654),
182 	REG16(0x658),
183 	REG16(0x65c),
184 	REG16(0x660),
185 	REG16(0x664),
186 	REG16(0x668),
187 	REG16(0x66c),
188 	REG16(0x670),
189 	REG16(0x674),
190 	REG16(0x678),
191 	REG16(0x67c),
192 	REG(0x068),
193 
194 	END
195 };
196 
197 static const u8 gen12_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(13, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 
214 	NOP(5),
215 	LRI(9, POSTED),
216 	REG16(0x3a8),
217 	REG16(0x28c),
218 	REG16(0x288),
219 	REG16(0x284),
220 	REG16(0x280),
221 	REG16(0x27c),
222 	REG16(0x278),
223 	REG16(0x274),
224 	REG16(0x270),
225 
226 	END
227 };
228 
229 static const u8 gen8_rcs_offsets[] = {
230 	NOP(1),
231 	LRI(14, POSTED),
232 	REG16(0x244),
233 	REG(0x034),
234 	REG(0x030),
235 	REG(0x038),
236 	REG(0x03c),
237 	REG(0x168),
238 	REG(0x140),
239 	REG(0x110),
240 	REG(0x11c),
241 	REG(0x114),
242 	REG(0x118),
243 	REG(0x1c0),
244 	REG(0x1c4),
245 	REG(0x1c8),
246 
247 	NOP(3),
248 	LRI(9, POSTED),
249 	REG16(0x3a8),
250 	REG16(0x28c),
251 	REG16(0x288),
252 	REG16(0x284),
253 	REG16(0x280),
254 	REG16(0x27c),
255 	REG16(0x278),
256 	REG16(0x274),
257 	REG16(0x270),
258 
259 	NOP(13),
260 	LRI(1, 0),
261 	REG(0x0c8),
262 
263 	END
264 };
265 
266 static const u8 gen9_rcs_offsets[] = {
267 	NOP(1),
268 	LRI(14, POSTED),
269 	REG16(0x244),
270 	REG(0x34),
271 	REG(0x30),
272 	REG(0x38),
273 	REG(0x3c),
274 	REG(0x168),
275 	REG(0x140),
276 	REG(0x110),
277 	REG(0x11c),
278 	REG(0x114),
279 	REG(0x118),
280 	REG(0x1c0),
281 	REG(0x1c4),
282 	REG(0x1c8),
283 
284 	NOP(3),
285 	LRI(9, POSTED),
286 	REG16(0x3a8),
287 	REG16(0x28c),
288 	REG16(0x288),
289 	REG16(0x284),
290 	REG16(0x280),
291 	REG16(0x27c),
292 	REG16(0x278),
293 	REG16(0x274),
294 	REG16(0x270),
295 
296 	NOP(13),
297 	LRI(1, 0),
298 	REG(0xc8),
299 
300 	NOP(13),
301 	LRI(44, POSTED),
302 	REG(0x28),
303 	REG(0x9c),
304 	REG(0xc0),
305 	REG(0x178),
306 	REG(0x17c),
307 	REG16(0x358),
308 	REG(0x170),
309 	REG(0x150),
310 	REG(0x154),
311 	REG(0x158),
312 	REG16(0x41c),
313 	REG16(0x600),
314 	REG16(0x604),
315 	REG16(0x608),
316 	REG16(0x60c),
317 	REG16(0x610),
318 	REG16(0x614),
319 	REG16(0x618),
320 	REG16(0x61c),
321 	REG16(0x620),
322 	REG16(0x624),
323 	REG16(0x628),
324 	REG16(0x62c),
325 	REG16(0x630),
326 	REG16(0x634),
327 	REG16(0x638),
328 	REG16(0x63c),
329 	REG16(0x640),
330 	REG16(0x644),
331 	REG16(0x648),
332 	REG16(0x64c),
333 	REG16(0x650),
334 	REG16(0x654),
335 	REG16(0x658),
336 	REG16(0x65c),
337 	REG16(0x660),
338 	REG16(0x664),
339 	REG16(0x668),
340 	REG16(0x66c),
341 	REG16(0x670),
342 	REG16(0x674),
343 	REG16(0x678),
344 	REG16(0x67c),
345 	REG(0x68),
346 
347 	END
348 };
349 
350 static const u8 gen11_rcs_offsets[] = {
351 	NOP(1),
352 	LRI(15, POSTED),
353 	REG16(0x244),
354 	REG(0x034),
355 	REG(0x030),
356 	REG(0x038),
357 	REG(0x03c),
358 	REG(0x168),
359 	REG(0x140),
360 	REG(0x110),
361 	REG(0x11c),
362 	REG(0x114),
363 	REG(0x118),
364 	REG(0x1c0),
365 	REG(0x1c4),
366 	REG(0x1c8),
367 	REG(0x180),
368 
369 	NOP(1),
370 	LRI(9, POSTED),
371 	REG16(0x3a8),
372 	REG16(0x28c),
373 	REG16(0x288),
374 	REG16(0x284),
375 	REG16(0x280),
376 	REG16(0x27c),
377 	REG16(0x278),
378 	REG16(0x274),
379 	REG16(0x270),
380 
381 	LRI(1, POSTED),
382 	REG(0x1b0),
383 
384 	NOP(10),
385 	LRI(1, 0),
386 	REG(0x0c8),
387 
388 	END
389 };
390 
391 static const u8 gen12_rcs_offsets[] = {
392 	NOP(1),
393 	LRI(13, POSTED),
394 	REG16(0x244),
395 	REG(0x034),
396 	REG(0x030),
397 	REG(0x038),
398 	REG(0x03c),
399 	REG(0x168),
400 	REG(0x140),
401 	REG(0x110),
402 	REG(0x1c0),
403 	REG(0x1c4),
404 	REG(0x1c8),
405 	REG(0x180),
406 	REG16(0x2b4),
407 
408 	NOP(5),
409 	LRI(9, POSTED),
410 	REG16(0x3a8),
411 	REG16(0x28c),
412 	REG16(0x288),
413 	REG16(0x284),
414 	REG16(0x280),
415 	REG16(0x27c),
416 	REG16(0x278),
417 	REG16(0x274),
418 	REG16(0x270),
419 
420 	LRI(3, POSTED),
421 	REG(0x1b0),
422 	REG16(0x5a8),
423 	REG16(0x5ac),
424 
425 	NOP(6),
426 	LRI(1, 0),
427 	REG(0x0c8),
428 	NOP(3 + 9 + 1),
429 
430 	LRI(51, POSTED),
431 	REG16(0x588),
432 	REG16(0x588),
433 	REG16(0x588),
434 	REG16(0x588),
435 	REG16(0x588),
436 	REG16(0x588),
437 	REG(0x028),
438 	REG(0x09c),
439 	REG(0x0c0),
440 	REG(0x178),
441 	REG(0x17c),
442 	REG16(0x358),
443 	REG(0x170),
444 	REG(0x150),
445 	REG(0x154),
446 	REG(0x158),
447 	REG16(0x41c),
448 	REG16(0x600),
449 	REG16(0x604),
450 	REG16(0x608),
451 	REG16(0x60c),
452 	REG16(0x610),
453 	REG16(0x614),
454 	REG16(0x618),
455 	REG16(0x61c),
456 	REG16(0x620),
457 	REG16(0x624),
458 	REG16(0x628),
459 	REG16(0x62c),
460 	REG16(0x630),
461 	REG16(0x634),
462 	REG16(0x638),
463 	REG16(0x63c),
464 	REG16(0x640),
465 	REG16(0x644),
466 	REG16(0x648),
467 	REG16(0x64c),
468 	REG16(0x650),
469 	REG16(0x654),
470 	REG16(0x658),
471 	REG16(0x65c),
472 	REG16(0x660),
473 	REG16(0x664),
474 	REG16(0x668),
475 	REG16(0x66c),
476 	REG16(0x670),
477 	REG16(0x674),
478 	REG16(0x678),
479 	REG16(0x67c),
480 	REG(0x068),
481 	REG(0x084),
482 	NOP(1),
483 
484 	END
485 };
486 
487 #undef END
488 #undef REG16
489 #undef REG
490 #undef LRI
491 #undef NOP
492 
493 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
494 {
495 	/*
496 	 * The gen12+ lists only have the registers we program in the basic
497 	 * default state. We rely on the context image using relative
498 	 * addressing to automatically fix up the register state between the
499 	 * physical engines for virtual engines.
500 	 */
501 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
502 		   !intel_engine_has_relative_mmio(engine));
503 
504 	if (engine->class == RENDER_CLASS) {
505 		if (INTEL_GEN(engine->i915) >= 12)
506 			return gen12_rcs_offsets;
507 		else if (INTEL_GEN(engine->i915) >= 11)
508 			return gen11_rcs_offsets;
509 		else if (INTEL_GEN(engine->i915) >= 9)
510 			return gen9_rcs_offsets;
511 		else
512 			return gen8_rcs_offsets;
513 	} else {
514 		if (INTEL_GEN(engine->i915) >= 12)
515 			return gen12_xcs_offsets;
516 		else if (INTEL_GEN(engine->i915) >= 9)
517 			return gen9_xcs_offsets;
518 		else
519 			return gen8_xcs_offsets;
520 	}
521 }
522 
523 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
524 {
525 	if (INTEL_GEN(engine->i915) >= 12)
526 		return 0x60;
527 	else if (INTEL_GEN(engine->i915) >= 9)
528 		return 0x54;
529 	else if (engine->class == RENDER_CLASS)
530 		return 0x58;
531 	else
532 		return -1;
533 }
534 
535 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
536 {
537 	if (INTEL_GEN(engine->i915) >= 12)
538 		return 0x74;
539 	else if (INTEL_GEN(engine->i915) >= 9)
540 		return 0x68;
541 	else if (engine->class == RENDER_CLASS)
542 		return 0xd8;
543 	else
544 		return -1;
545 }
546 
547 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
548 {
549 	if (INTEL_GEN(engine->i915) >= 12)
550 		return 0x12;
551 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
552 		return 0x18;
553 	else
554 		return -1;
555 }
556 
557 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
558 {
559 	int x;
560 
561 	x = lrc_ring_wa_bb_per_ctx(engine);
562 	if (x < 0)
563 		return x;
564 
565 	return x + 2;
566 }
567 
568 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
569 {
570 	int x;
571 
572 	x = lrc_ring_indirect_ptr(engine);
573 	if (x < 0)
574 		return x;
575 
576 	return x + 2;
577 }
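
/*
 * Layout note: each lrc_ring_*() helper above returns the dword index of
 * a register offset in the context image; the corresponding value lives
 * at index + 1. The per-context WA batch buffer pointer, the indirect
 * context pointer/size and the indirect context offset are consecutive
 * (reg, value) pairs, hence the "+ 2" steps.
 */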
578 
579 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
580 {
581 	if (engine->class != RENDER_CLASS)
582 		return -1;
583 
584 	if (INTEL_GEN(engine->i915) >= 12)
585 		return 0xb6;
586 	else if (INTEL_GEN(engine->i915) >= 11)
587 		return 0xaa;
588 	else
589 		return -1;
590 }
591 
592 static u32
593 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
594 {
595 	switch (INTEL_GEN(engine->i915)) {
596 	default:
597 		MISSING_CASE(INTEL_GEN(engine->i915));
598 		fallthrough;
599 	case 12:
600 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
601 	case 11:
602 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
603 	case 10:
604 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
605 	case 9:
606 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
607 	case 8:
608 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
609 	}
610 }
611 
612 static void
613 lrc_setup_indirect_ctx(u32 *regs,
614 		       const struct intel_engine_cs *engine,
615 		       u32 ctx_bb_ggtt_addr,
616 		       u32 size)
617 {
618 	GEM_BUG_ON(!size);
619 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
620 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
621 	regs[lrc_ring_indirect_ptr(engine) + 1] =
622 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
623 
624 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
625 	regs[lrc_ring_indirect_offset(engine) + 1] =
626 		lrc_ring_indirect_offset_default(engine) << 6;
627 }
628 
629 static void init_common_regs(u32 * const regs,
630 			     const struct intel_context *ce,
631 			     const struct intel_engine_cs *engine,
632 			     bool inhibit)
633 {
634 	u32 ctl;
635 
636 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
637 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
638 	if (inhibit)
639 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
640 	if (INTEL_GEN(engine->i915) < 11)
641 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
642 					   CTX_CTRL_RS_CTX_ENABLE);
643 	regs[CTX_CONTEXT_CONTROL] = ctl;
644 
645 	regs[CTX_TIMESTAMP] = ce->runtime.last;
646 }
647 
648 static void init_wa_bb_regs(u32 * const regs,
649 			    const struct intel_engine_cs *engine)
650 {
651 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
652 
653 	if (wa_ctx->per_ctx.size) {
654 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
655 
656 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
657 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
658 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
659 	}
660 
661 	if (wa_ctx->indirect_ctx.size) {
662 		lrc_setup_indirect_ctx(regs, engine,
663 				       i915_ggtt_offset(wa_ctx->vma) +
664 				       wa_ctx->indirect_ctx.offset,
665 				       wa_ctx->indirect_ctx.size);
666 	}
667 }
668 
669 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
670 {
671 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
672 		/* 64b PPGTT (48bit canonical)
673 	 * PDP0_DESCRIPTOR contains the base address of the PML4 and
674 		 * other PDP Descriptors are ignored.
675 		 */
676 		ASSIGN_CTX_PML4(ppgtt, regs);
677 	} else {
678 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
679 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
680 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
681 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
682 	}
683 }
684 
685 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
686 {
687 	if (i915_is_ggtt(vm))
688 		return i915_vm_to_ggtt(vm)->alias;
689 	else
690 		return i915_vm_to_ppgtt(vm);
691 }
692 
693 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
694 {
695 	int x;
696 
697 	x = lrc_ring_mi_mode(engine);
698 	if (x != -1) {
699 		regs[x + 1] &= ~STOP_RING;
700 		regs[x + 1] |= STOP_RING << 16;
701 	}
702 }
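
/*
 * RING_MI_MODE is a masked register: the upper 16 bits of the value
 * select which of the lower 16 bits take effect. Clearing STOP_RING in
 * the low half while setting STOP_RING << 16 in the mask asks the next
 * context restore to clear the ring-stop bit without touching any other
 * MI_MODE bits.
 */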
703 
704 static void __lrc_init_regs(u32 *regs,
705 			    const struct intel_context *ce,
706 			    const struct intel_engine_cs *engine,
707 			    bool inhibit)
708 {
709 	/*
710 	 * A context is actually a big batch buffer with several
711 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
712 	 * values we are setting here are only for the first context restore:
713 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
714 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
715 	 * we are not initializing here).
716 	 *
717 	 * Must keep consistent with virtual_update_register_offsets().
718 	 */
719 
720 	if (inhibit)
721 		memset(regs, 0, PAGE_SIZE);
722 
723 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
724 
725 	init_common_regs(regs, ce, engine, inhibit);
726 	init_ppgtt_regs(regs, vm_alias(ce->vm));
727 
728 	init_wa_bb_regs(regs, engine);
729 
730 	__reset_stop_ring(regs, engine);
731 }
732 
733 void lrc_init_regs(const struct intel_context *ce,
734 		   const struct intel_engine_cs *engine,
735 		   bool inhibit)
736 {
737 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
738 }
739 
740 void lrc_reset_regs(const struct intel_context *ce,
741 		    const struct intel_engine_cs *engine)
742 {
743 	__reset_stop_ring(ce->lrc_reg_state, engine);
744 }
745 
746 static void
747 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
748 {
749 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
750 		return;
751 
752 	vaddr += engine->context_size;
753 
754 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
755 }
756 
757 static void
758 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
759 {
760 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
761 		return;
762 
763 	vaddr += engine->context_size;
764 
765 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
766 		drm_err_once(&engine->i915->drm,
767 			     "%s context redzone overwritten!\n",
768 			     engine->name);
769 }
770 
771 void lrc_init_state(struct intel_context *ce,
772 		    struct intel_engine_cs *engine,
773 		    void *state)
774 {
775 	bool inhibit = true;
776 
777 	set_redzone(state, engine);
778 
779 	if (engine->default_state) {
780 		shmem_read(engine->default_state, 0,
781 			   state, engine->context_size);
782 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
783 		inhibit = false;
784 	}
785 
786 	/* Clear the ppHWSP (inc. per-context counters) */
787 	memset(state, 0, PAGE_SIZE);
788 
789 	/*
790 	 * The second page of the context object contains some registers which
791 	 * must be set up prior to the first execution.
792 	 */
793 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
794 }
795 
796 static struct i915_vma *
797 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
798 {
799 	struct drm_i915_gem_object *obj;
800 	struct i915_vma *vma;
801 	u32 context_size;
802 
803 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
804 
805 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
806 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
807 
808 	if (INTEL_GEN(engine->i915) == 12) {
809 		ce->wa_bb_page = context_size / PAGE_SIZE;
810 		context_size += PAGE_SIZE;
811 	}
812 
813 	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
814 	if (IS_ERR(obj))
815 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
816 	if (IS_ERR(obj))
817 		return ERR_CAST(obj);
818 
819 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
820 	if (IS_ERR(vma)) {
821 		i915_gem_object_put(obj);
822 		return vma;
823 	}
824 
825 	return vma;
826 }
827 
828 static struct intel_timeline *
829 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
830 {
831 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
832 
833 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
834 }
835 
836 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
837 {
838 	struct intel_ring *ring;
839 	struct i915_vma *vma;
840 	int err;
841 
842 	GEM_BUG_ON(ce->state);
843 
844 	vma = __lrc_alloc_state(ce, engine);
845 	if (IS_ERR(vma))
846 		return PTR_ERR(vma);
847 
848 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
849 	if (IS_ERR(ring)) {
850 		err = PTR_ERR(ring);
851 		goto err_vma;
852 	}
853 
854 	if (!page_mask_bits(ce->timeline)) {
855 		struct intel_timeline *tl;
856 
857 		/*
858 		 * Use the static global HWSP for the kernel context, and
859 		 * a dynamically allocated cacheline for everyone else.
860 		 */
861 		if (unlikely(ce->timeline))
862 			tl = pinned_timeline(ce, engine);
863 		else
864 			tl = intel_timeline_create(engine->gt);
865 		if (IS_ERR(tl)) {
866 			err = PTR_ERR(tl);
867 			goto err_ring;
868 		}
869 
870 		ce->timeline = tl;
871 	}
872 
873 	ce->ring = ring;
874 	ce->state = vma;
875 
876 	return 0;
877 
878 err_ring:
879 	intel_ring_put(ring);
880 err_vma:
881 	i915_vma_put(vma);
882 	return err;
883 }
884 
885 void lrc_reset(struct intel_context *ce)
886 {
887 	GEM_BUG_ON(!intel_context_is_pinned(ce));
888 
889 	intel_ring_reset(ce->ring, ce->ring->emit);
890 
891 	/* Scrub away the garbage */
892 	lrc_init_regs(ce, ce->engine, true);
893 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
894 }
895 
896 int
897 lrc_pre_pin(struct intel_context *ce,
898 	    struct intel_engine_cs *engine,
899 	    struct i915_gem_ww_ctx *ww,
900 	    void **vaddr)
901 {
902 	GEM_BUG_ON(!ce->state);
903 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
904 
905 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
906 					 i915_coherent_map_type(ce->engine->i915,
907 								ce->state->obj,
908 								false) |
909 					 I915_MAP_OVERRIDE);
910 
911 	return PTR_ERR_OR_ZERO(*vaddr);
912 }
913 
914 int
915 lrc_pin(struct intel_context *ce,
916 	struct intel_engine_cs *engine,
917 	void *vaddr)
918 {
919 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
920 
921 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
922 		lrc_init_state(ce, engine, vaddr);
923 
924 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
925 	return 0;
926 }
927 
928 void lrc_unpin(struct intel_context *ce)
929 {
930 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
931 		      ce->engine);
932 }
933 
934 void lrc_post_unpin(struct intel_context *ce)
935 {
936 	i915_gem_object_unpin_map(ce->state->obj);
937 }
938 
939 void lrc_fini(struct intel_context *ce)
940 {
941 	if (!ce->state)
942 		return;
943 
944 	intel_ring_put(fetch_and_zero(&ce->ring));
945 	i915_vma_put(fetch_and_zero(&ce->state));
946 }
947 
948 void lrc_destroy(struct kref *kref)
949 {
950 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
951 
952 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
953 	GEM_BUG_ON(intel_context_is_pinned(ce));
954 
955 	lrc_fini(ce);
956 
957 	intel_context_fini(ce);
958 	intel_context_free(ce);
959 }
960 
961 static u32 *
962 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
963 {
964 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
965 		MI_SRM_LRM_GLOBAL_GTT |
966 		MI_LRI_LRM_CS_MMIO;
967 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
968 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
969 		CTX_TIMESTAMP * sizeof(u32);
970 	*cs++ = 0;
971 
972 	*cs++ = MI_LOAD_REGISTER_REG |
973 		MI_LRR_SOURCE_CS_MMIO |
974 		MI_LRI_LRM_CS_MMIO;
975 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
976 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
977 
978 	*cs++ = MI_LOAD_REGISTER_REG |
979 		MI_LRR_SOURCE_CS_MMIO |
980 		MI_LRI_LRM_CS_MMIO;
981 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
982 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
983 
984 	return cs;
985 }
986 
987 static u32 *
988 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
989 {
990 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
991 
992 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
993 		MI_SRM_LRM_GLOBAL_GTT |
994 		MI_LRI_LRM_CS_MMIO;
995 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
996 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
997 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
998 	*cs++ = 0;
999 
1000 	return cs;
1001 }
1002 
1003 static u32 *
1004 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1005 {
1006 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1007 
1008 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1009 		MI_SRM_LRM_GLOBAL_GTT |
1010 		MI_LRI_LRM_CS_MMIO;
1011 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1012 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1013 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1014 	*cs++ = 0;
1015 
1016 	*cs++ = MI_LOAD_REGISTER_REG |
1017 		MI_LRR_SOURCE_CS_MMIO |
1018 		MI_LRI_LRM_CS_MMIO;
1019 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1020 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1021 
1022 	return cs;
1023 }
1024 
1025 static u32 *
1026 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1027 {
1028 	cs = gen12_emit_timestamp_wa(ce, cs);
1029 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1030 	cs = gen12_emit_restore_scratch(ce, cs);
1031 
1032 	return cs;
1033 }
1034 
1035 static u32 *
1036 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1037 {
1038 	cs = gen12_emit_timestamp_wa(ce, cs);
1039 	cs = gen12_emit_restore_scratch(ce, cs);
1040 
1041 	return cs;
1042 }
1043 
1044 static u32 context_wa_bb_offset(const struct intel_context *ce)
1045 {
1046 	return PAGE_SIZE * ce->wa_bb_page;
1047 }
1048 
1049 static u32 *context_indirect_bb(const struct intel_context *ce)
1050 {
1051 	void *ptr;
1052 
1053 	GEM_BUG_ON(!ce->wa_bb_page);
1054 
1055 	ptr = ce->lrc_reg_state;
1056 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1057 	ptr += context_wa_bb_offset(ce);
1058 
1059 	return ptr;
1060 }
1061 
1062 static void
1063 setup_indirect_ctx_bb(const struct intel_context *ce,
1064 		      const struct intel_engine_cs *engine,
1065 		      u32 *(*emit)(const struct intel_context *, u32 *))
1066 {
1067 	u32 * const start = context_indirect_bb(ce);
1068 	u32 *cs;
1069 
1070 	cs = emit(ce, start);
1071 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1072 	while ((unsigned long)cs % CACHELINE_BYTES)
1073 		*cs++ = MI_NOOP;
1074 
1075 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1076 			       i915_ggtt_offset(ce->state) +
1077 			       context_wa_bb_offset(ce),
1078 			       (cs - start) * sizeof(*cs));
1079 }
1080 
1081 /*
1082  * The context descriptor encodes various attributes of a context,
1083  * including its GTT address and some flags. Because it's fairly
1084  * expensive to calculate, we'll just do it once and cache the result,
1085  * which remains valid until the context is unpinned.
1086  *
1087  * This is what a descriptor looks like, from LSB to MSB::
1088  *
1089  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1090  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1091  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1092  *      bits 53-54:    mbz, reserved for use by hardware
1093  *      bits 55-63:    group ID, currently unused and set to 0
1094  *
1095  * Starting from Gen11, the upper dword of the descriptor has a new format:
1096  *
1097  *      bits 32-36:    reserved
1098  *      bits 37-47:    SW context ID
1099  *      bits 48-53:    engine instance
1100  *      bit 54:        mbz, reserved for use by hardware
1101  *      bits 55-60:    SW counter
1102  *      bits 61-63:    engine class
1103  *
1104  * engine info, SW context ID and SW counter need to form a unique number
1105  * (Context ID) per lrc.
1106  * (Context ID) per lrc.
 */
1107 static u32 lrc_descriptor(const struct intel_context *ce)
1108 {
1109 	u32 desc;
1110 
1111 	desc = INTEL_LEGACY_32B_CONTEXT;
1112 	if (i915_vm_is_4lvl(ce->vm))
1113 		desc = INTEL_LEGACY_64B_CONTEXT;
1114 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1115 
1116 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1117 	if (IS_GEN(ce->vm->i915, 8))
1118 		desc |= GEN8_CTX_L3LLC_COHERENT;
1119 
1120 	return i915_ggtt_offset(ce->state) | desc;
1121 }
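
/*
 * The state object is page aligned in the GGTT, so its offset has bits
 * 0-11 clear and OR-ing in the flag bits above is lossless, matching the
 * descriptor layout described above. CTX_DESC_FORCE_RESTORE is OR-ed in
 * separately by lrc_update_regs().
 */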
1122 
1123 u32 lrc_update_regs(const struct intel_context *ce,
1124 		    const struct intel_engine_cs *engine,
1125 		    u32 head)
1126 {
1127 	struct intel_ring *ring = ce->ring;
1128 	u32 *regs = ce->lrc_reg_state;
1129 
1130 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1131 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1132 
1133 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1134 	regs[CTX_RING_HEAD] = head;
1135 	regs[CTX_RING_TAIL] = ring->tail;
1136 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1137 
1138 	/* RPCS */
1139 	if (engine->class == RENDER_CLASS) {
1140 		regs[CTX_R_PWR_CLK_STATE] =
1141 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1142 
1143 		i915_oa_init_reg_state(ce, engine);
1144 	}
1145 
1146 	if (ce->wa_bb_page) {
1147 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1148 
1149 		fn = gen12_emit_indirect_ctx_xcs;
1150 		if (ce->engine->class == RENDER_CLASS)
1151 			fn = gen12_emit_indirect_ctx_rcs;
1152 
1153 		/* Mutually exclusive wrt the global indirect bb */
1154 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1155 		setup_indirect_ctx_bb(ce, engine, fn);
1156 	}
1157 
1158 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1159 }
1160 
1161 void lrc_update_offsets(struct intel_context *ce,
1162 			struct intel_engine_cs *engine)
1163 {
1164 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1165 }
1166 
1167 void lrc_check_regs(const struct intel_context *ce,
1168 		    const struct intel_engine_cs *engine,
1169 		    const char *when)
1170 {
1171 	const struct intel_ring *ring = ce->ring;
1172 	u32 *regs = ce->lrc_reg_state;
1173 	bool valid = true;
1174 	int x;
1175 
1176 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1177 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1178 		       engine->name,
1179 		       regs[CTX_RING_START],
1180 		       i915_ggtt_offset(ring->vma));
1181 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1182 		valid = false;
1183 	}
1184 
1185 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1186 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1187 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1188 		       engine->name,
1189 		       regs[CTX_RING_CTL],
1190 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1191 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1192 		valid = false;
1193 	}
1194 
1195 	x = lrc_ring_mi_mode(engine);
1196 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1197 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1198 		       engine->name, regs[x + 1]);
1199 		regs[x + 1] &= ~STOP_RING;
1200 		regs[x + 1] |= STOP_RING << 16;
1201 		valid = false;
1202 	}
1203 
1204 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1205 }
1206 
1207 /*
1208  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1209  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1210  * but there is a slight complication as this is applied in a WA batch where the
1211  * values are only initialized once, so we cannot read the register value at the
1212  * beginning and reuse it further; hence we save its value to memory, upload a
1213  * constant value with bit21 set and then we restore it back with the saved value.
1214  * To simplify the WA, a constant value is formed by using the default value
1215  * of this register. This shouldn't be a problem because we are only modifying
1216  * it for a short period and this batch is non-preemptible. We can of course
1217  * use additional instructions that read the actual value of the register
1218  * at that time and set our bit of interest but it makes the WA complicated.
1219  *
1220  * This WA is also required for Gen9 so extracting as a function avoids
1221  * code duplication.
1222  */
1223 static u32 *
1224 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1225 {
1226 	/* NB no one else is allowed to scribble over scratch + 256! */
1227 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1228 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1229 	*batch++ = intel_gt_scratch_offset(engine->gt,
1230 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1231 	*batch++ = 0;
1232 
1233 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1234 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1235 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1236 
1237 	batch = gen8_emit_pipe_control(batch,
1238 				       PIPE_CONTROL_CS_STALL |
1239 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1240 				       0);
1241 
1242 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1243 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1244 	*batch++ = intel_gt_scratch_offset(engine->gt,
1245 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1246 	*batch++ = 0;
1247 
1248 	return batch;
1249 }
1250 
1251 /*
1252  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1253  * initialized at the beginning and shared across all contexts but this field
1254  * helps us to have multiple batches at different offsets and select them based
1255  * on some criteria. At the moment this batch always starts at the beginning of the page
1256  * and at this point we don't have multiple wa_ctx batch buffers.
1257  *
1258  * The number of WAs applied is not known at the beginning; we use this field
1259  * to return the number of DWORDs written.
1260  *
1261  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1262  * so it adds NOOPs as padding to make it cacheline aligned.
1263  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
1264  * make a complete batch buffer.
1265  */
1266 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1267 {
1268 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1269 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1270 
1271 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1272 	if (IS_BROADWELL(engine->i915))
1273 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1274 
1275 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1276 	/* Actual scratch location is at 128 bytes offset */
1277 	batch = gen8_emit_pipe_control(batch,
1278 				       PIPE_CONTROL_FLUSH_L3 |
1279 				       PIPE_CONTROL_STORE_DATA_INDEX |
1280 				       PIPE_CONTROL_CS_STALL |
1281 				       PIPE_CONTROL_QW_WRITE,
1282 				       LRC_PPHWSP_SCRATCH_ADDR);
1283 
1284 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1285 
1286 	/* Pad to end of cacheline */
1287 	while ((unsigned long)batch % CACHELINE_BYTES)
1288 		*batch++ = MI_NOOP;
1289 
1290 	/*
1291 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1292 	 * execution depends on the length specified in terms of cache lines
1293 	 * in the register CTX_RCS_INDIRECT_CTX
1294 	 */
1295 
1296 	return batch;
1297 }
1298 
1299 struct lri {
1300 	i915_reg_t reg;
1301 	u32 value;
1302 };
1303 
1304 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1305 {
1306 	GEM_BUG_ON(!count || count > 63);
1307 
1308 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1309 	do {
1310 		*batch++ = i915_mmio_reg_offset(lri->reg);
1311 		*batch++ = lri->value;
1312 	} while (lri++, --count);
1313 	*batch++ = MI_NOOP;
1314 
1315 	return batch;
1316 }
1317 
1318 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1319 {
1320 	static const struct lri lri[] = {
1321 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1322 		{
1323 			COMMON_SLICE_CHICKEN2,
1324 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1325 				       0),
1326 		},
1327 
1328 		/* BSpec: 11391 */
1329 		{
1330 			FF_SLICE_CHICKEN,
1331 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1332 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1333 		},
1334 
1335 		/* BSpec: 11299 */
1336 		{
1337 			_3D_CHICKEN3,
1338 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1339 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1340 		}
1341 	};
1342 
1343 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1344 
1345 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1346 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1347 
1348 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1349 	batch = gen8_emit_pipe_control(batch,
1350 				       PIPE_CONTROL_FLUSH_L3 |
1351 				       PIPE_CONTROL_STORE_DATA_INDEX |
1352 				       PIPE_CONTROL_CS_STALL |
1353 				       PIPE_CONTROL_QW_WRITE,
1354 				       LRC_PPHWSP_SCRATCH_ADDR);
1355 
1356 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1357 
1358 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1359 	if (HAS_POOLED_EU(engine->i915)) {
1360 		/*
1361 		 * EU pool configuration is set up along with the golden context
1362 		 * during context initialization. This value depends on
1363 		 * device type (2x6 or 3x6) and needs to be updated based
1364 		 * on which subslice is disabled, especially for 2x6
1365 		 * devices; however, it is safe to load the default
1366 		 * configuration of a 3x6 device instead of masking off
1367 		 * the corresponding bits because HW ignores bits of a disabled
1368 		 * subslice and drops down to appropriate config. Please
1369 		 * see render_state_setup() in i915_gem_render_state.c for
1370 		 * possible configurations, to avoid duplication they are
1371 		 * not shown here again.
1372 		 */
1373 		*batch++ = GEN9_MEDIA_POOL_STATE;
1374 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1375 		*batch++ = 0x00777000;
1376 		*batch++ = 0;
1377 		*batch++ = 0;
1378 		*batch++ = 0;
1379 	}
1380 
1381 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1382 
1383 	/* Pad to end of cacheline */
1384 	while ((unsigned long)batch % CACHELINE_BYTES)
1385 		*batch++ = MI_NOOP;
1386 
1387 	return batch;
1388 }
1389 
1390 static u32 *
1391 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1392 {
1393 	int i;
1394 
1395 	/*
1396 	 * WaPipeControlBefore3DStateSamplePattern: cnl
1397 	 *
1398 	 * Ensure the engine is idle prior to programming a
1399 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1400 	 */
1401 	batch = gen8_emit_pipe_control(batch,
1402 				       PIPE_CONTROL_CS_STALL,
1403 				       0);
1404 	/*
1405 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1406 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1407 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1408 	 * confusing. Since gen8_emit_pipe_control() already advances the
1409 	 * batch by 6 dwords, we advance the other 10 here, completing a
1410 	 * cacheline. It's not clear if the workaround requires this padding
1411 	 * before other commands, or if it's just the regular padding we would
1412 	 * already have for the workaround bb, so leave it here for now.
1413 	 */
1414 	for (i = 0; i < 10; i++)
1415 		*batch++ = MI_NOOP;
1416 
1417 	/* Pad to end of cacheline */
1418 	while ((unsigned long)batch % CACHELINE_BYTES)
1419 		*batch++ = MI_NOOP;
1420 
1421 	return batch;
1422 }
1423 
1424 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1425 
1426 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1427 {
1428 	struct drm_i915_gem_object *obj;
1429 	struct i915_vma *vma;
1430 	int err;
1431 
1432 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1433 	if (IS_ERR(obj))
1434 		return PTR_ERR(obj);
1435 
1436 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1437 	if (IS_ERR(vma)) {
1438 		err = PTR_ERR(vma);
1439 		goto err;
1440 	}
1441 
1442 	engine->wa_ctx.vma = vma;
1443 	return 0;
1444 
1445 err:
1446 	i915_gem_object_put(obj);
1447 	return err;
1448 }
1449 
1450 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1451 {
1452 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1453 }
1454 
1455 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1456 
1457 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1458 {
1459 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1460 	struct i915_wa_ctx_bb *wa_bb[] = {
1461 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1462 	};
1463 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1464 	struct i915_gem_ww_ctx ww;
1465 	void *batch, *batch_ptr;
1466 	unsigned int i;
1467 	int err;
1468 
1469 	if (engine->class != RENDER_CLASS)
1470 		return;
1471 
1472 	switch (INTEL_GEN(engine->i915)) {
1473 	case 12:
1474 	case 11:
1475 		return;
1476 	case 10:
1477 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
1478 		wa_bb_fn[1] = NULL;
1479 		break;
1480 	case 9:
1481 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1482 		wa_bb_fn[1] = NULL;
1483 		break;
1484 	case 8:
1485 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1486 		wa_bb_fn[1] = NULL;
1487 		break;
1488 	default:
1489 		MISSING_CASE(INTEL_GEN(engine->i915));
1490 		return;
1491 	}
1492 
1493 	err = lrc_create_wa_ctx(engine);
1494 	if (err) {
1495 		/*
1496 		 * We continue even if we fail to initialize the WA batch
1497 		 * because we only expect rare glitches and nothing
1498 		 * critical enough to prevent us from using the GPU
1499 		 */
1500 		drm_err(&engine->i915->drm,
1501 			"Ignoring context switch w/a allocation error:%d\n",
1502 			err);
1503 		return;
1504 	}
1505 
1506 	if (!engine->wa_ctx.vma)
1507 		return;
1508 
1509 	i915_gem_ww_ctx_init(&ww, true);
1510 retry:
1511 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1512 	if (!err)
1513 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1514 	if (err)
1515 		goto err;
1516 
1517 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1518 	if (IS_ERR(batch)) {
1519 		err = PTR_ERR(batch);
1520 		goto err_unpin;
1521 	}
1522 
1523 	/*
1524 	 * Emit the two workaround batch buffers, recording the offset from the
1525 	 * start of the workaround batch buffer object for each and their
1526 	 * respective sizes.
1527 	 */
1528 	batch_ptr = batch;
1529 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1530 		wa_bb[i]->offset = batch_ptr - batch;
1531 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1532 						  CACHELINE_BYTES))) {
1533 			err = -EINVAL;
1534 			break;
1535 		}
1536 		if (wa_bb_fn[i])
1537 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1538 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1539 	}
1540 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1541 
1542 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1543 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1544 
1545 	/* Verify that we can handle failure to setup the wa_ctx */
1546 	if (!err)
1547 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1548 
1549 err_unpin:
1550 	if (err)
1551 		i915_vma_unpin(wa_ctx->vma);
1552 err:
1553 	if (err == -EDEADLK) {
1554 		err = i915_gem_ww_ctx_backoff(&ww);
1555 		if (!err)
1556 			goto retry;
1557 	}
1558 	i915_gem_ww_ctx_fini(&ww);
1559 
1560 	if (err) {
1561 		i915_vma_put(engine->wa_ctx.vma);
1562 
1563 		/* Clear all flags to prevent further use */
1564 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1565 	}
1566 }
1567 
1568 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1569 {
1570 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1571 	ce->runtime.num_underflow++;
1572 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1573 #endif
1574 }
1575 
1576 void lrc_update_runtime(struct intel_context *ce)
1577 {
1578 	u32 old;
1579 	s32 dt;
1580 
1581 	if (intel_context_is_barrier(ce))
1582 		return;
1583 
1584 	old = ce->runtime.last;
1585 	ce->runtime.last = lrc_get_runtime(ce);
1586 	dt = ce->runtime.last - old;
1587 
1588 	if (unlikely(dt < 0)) {
1589 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1590 			 old, ce->runtime.last, dt);
1591 		st_update_runtime_underflow(ce, dt);
1592 		return;
1593 	}
1594 
1595 	ewma_runtime_add(&ce->runtime.avg, dt);
1596 	ce->runtime.total += dt;
1597 }
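
/*
 * CTX_TIMESTAMP is a 32-bit counter, so the unsigned subtraction above
 * remains correct across wraparound; a negative signed delta indicates a
 * bogus sample and is traced (and counted by the selftest hook) rather
 * than folded into the average and total.
 */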
1598 
1599 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1600 #include "selftest_lrc.c"
1601 #endif
1602