xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision a0d3fdb6)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gen8_engine_cs.h"
7 #include "i915_drv.h"
8 #include "i915_perf.h"
9 #include "intel_engine.h"
10 #include "intel_gpu_commands.h"
11 #include "intel_gt.h"
12 #include "intel_lrc.h"
13 #include "intel_lrc_reg.h"
14 #include "intel_ring.h"
15 #include "shmem_utils.h"
16 
17 static inline unsigned int dword_in_page(void *addr)
18 {
19 	return offset_in_page(addr) / sizeof(u32);
20 }
21 
22 static void set_offsets(u32 *regs,
23 			const u8 *data,
24 			const struct intel_engine_cs *engine,
25 			bool close)
26 #define NOP(x) (BIT(7) | (x))
27 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
28 #define POSTED BIT(0)
29 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
30 #define REG16(x) \
31 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
32 	(((x) >> 2) & 0x7f)
33 #define END 0
34 {
35 	const u32 base = engine->mmio_base;
36 
37 	while (*data) {
38 		u8 count, flags;
39 
40 		if (*data & BIT(7)) { /* skip */
41 			count = *data++ & ~BIT(7);
42 			regs += count;
43 			continue;
44 		}
45 
46 		count = *data & 0x3f;
47 		flags = *data >> 6;
48 		data++;
49 
50 		*regs = MI_LOAD_REGISTER_IMM(count);
51 		if (flags & POSTED)
52 			*regs |= MI_LRI_FORCE_POSTED;
53 		if (INTEL_GEN(engine->i915) >= 11)
54 			*regs |= MI_LRI_LRM_CS_MMIO;
55 		regs++;
56 
57 		GEM_BUG_ON(!count);
58 		do {
59 			u32 offset = 0;
60 			u8 v;
61 
62 			do {
63 				v = *data++;
64 				offset <<= 7;
65 				offset |= v & ~BIT(7);
66 			} while (v & BIT(7));
67 
68 			regs[0] = base + (offset << 2);
69 			regs += 2;
70 		} while (--count);
71 	}
72 
73 	if (close) {
74 		/* Close the batch; used mainly by live_lrc_layout() */
75 		*regs = MI_BATCH_BUFFER_END;
76 		if (INTEL_GEN(engine->i915) >= 10)
77 			*regs |= BIT(0);
78 	}
79 }
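/*
 * Illustrative decode of the offset tables below (added note, not part of
 * the original file): each table is a byte stream consumed by set_offsets().
 * Taking the first entries of gen8_xcs_offsets with an assumed
 * mmio_base of 0x1c0000:
 *
 *   NOP(1)       -> 0x81          skip one dword in the register state
 *   LRI(11, 0)   -> 0x0b          regs[1] = MI_LOAD_REGISTER_IMM(11)
 *   REG16(0x244) -> 0x81, 0x11    regs[2] = 0x1c0000 + 0x244
 *   REG(0x034)   -> 0x0d          regs[4] = 0x1c0000 + 0x034
 *   ...
 *
 * Only the register-offset dwords are written; the value dwords in between
 * (regs[3], regs[5], ...) are left for the default state or the GPU to fill.
 */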
80 
81 static const u8 gen8_xcs_offsets[] = {
82 	NOP(1),
83 	LRI(11, 0),
84 	REG16(0x244),
85 	REG(0x034),
86 	REG(0x030),
87 	REG(0x038),
88 	REG(0x03c),
89 	REG(0x168),
90 	REG(0x140),
91 	REG(0x110),
92 	REG(0x11c),
93 	REG(0x114),
94 	REG(0x118),
95 
96 	NOP(9),
97 	LRI(9, 0),
98 	REG16(0x3a8),
99 	REG16(0x28c),
100 	REG16(0x288),
101 	REG16(0x284),
102 	REG16(0x280),
103 	REG16(0x27c),
104 	REG16(0x278),
105 	REG16(0x274),
106 	REG16(0x270),
107 
108 	NOP(13),
109 	LRI(2, 0),
110 	REG16(0x200),
111 	REG(0x028),
112 
113 	END
114 };
115 
116 static const u8 gen9_xcs_offsets[] = {
117 	NOP(1),
118 	LRI(14, POSTED),
119 	REG16(0x244),
120 	REG(0x034),
121 	REG(0x030),
122 	REG(0x038),
123 	REG(0x03c),
124 	REG(0x168),
125 	REG(0x140),
126 	REG(0x110),
127 	REG(0x11c),
128 	REG(0x114),
129 	REG(0x118),
130 	REG(0x1c0),
131 	REG(0x1c4),
132 	REG(0x1c8),
133 
134 	NOP(3),
135 	LRI(9, POSTED),
136 	REG16(0x3a8),
137 	REG16(0x28c),
138 	REG16(0x288),
139 	REG16(0x284),
140 	REG16(0x280),
141 	REG16(0x27c),
142 	REG16(0x278),
143 	REG16(0x274),
144 	REG16(0x270),
145 
146 	NOP(13),
147 	LRI(1, POSTED),
148 	REG16(0x200),
149 
150 	NOP(13),
151 	LRI(44, POSTED),
152 	REG(0x028),
153 	REG(0x09c),
154 	REG(0x0c0),
155 	REG(0x178),
156 	REG(0x17c),
157 	REG16(0x358),
158 	REG(0x170),
159 	REG(0x150),
160 	REG(0x154),
161 	REG(0x158),
162 	REG16(0x41c),
163 	REG16(0x600),
164 	REG16(0x604),
165 	REG16(0x608),
166 	REG16(0x60c),
167 	REG16(0x610),
168 	REG16(0x614),
169 	REG16(0x618),
170 	REG16(0x61c),
171 	REG16(0x620),
172 	REG16(0x624),
173 	REG16(0x628),
174 	REG16(0x62c),
175 	REG16(0x630),
176 	REG16(0x634),
177 	REG16(0x638),
178 	REG16(0x63c),
179 	REG16(0x640),
180 	REG16(0x644),
181 	REG16(0x648),
182 	REG16(0x64c),
183 	REG16(0x650),
184 	REG16(0x654),
185 	REG16(0x658),
186 	REG16(0x65c),
187 	REG16(0x660),
188 	REG16(0x664),
189 	REG16(0x668),
190 	REG16(0x66c),
191 	REG16(0x670),
192 	REG16(0x674),
193 	REG16(0x678),
194 	REG16(0x67c),
195 	REG(0x068),
196 
197 	END
198 };
199 
200 static const u8 gen12_xcs_offsets[] = {
201 	NOP(1),
202 	LRI(13, POSTED),
203 	REG16(0x244),
204 	REG(0x034),
205 	REG(0x030),
206 	REG(0x038),
207 	REG(0x03c),
208 	REG(0x168),
209 	REG(0x140),
210 	REG(0x110),
211 	REG(0x1c0),
212 	REG(0x1c4),
213 	REG(0x1c8),
214 	REG(0x180),
215 	REG16(0x2b4),
216 
217 	NOP(5),
218 	LRI(9, POSTED),
219 	REG16(0x3a8),
220 	REG16(0x28c),
221 	REG16(0x288),
222 	REG16(0x284),
223 	REG16(0x280),
224 	REG16(0x27c),
225 	REG16(0x278),
226 	REG16(0x274),
227 	REG16(0x270),
228 
229 	END
230 };
231 
232 static const u8 gen8_rcs_offsets[] = {
233 	NOP(1),
234 	LRI(14, POSTED),
235 	REG16(0x244),
236 	REG(0x034),
237 	REG(0x030),
238 	REG(0x038),
239 	REG(0x03c),
240 	REG(0x168),
241 	REG(0x140),
242 	REG(0x110),
243 	REG(0x11c),
244 	REG(0x114),
245 	REG(0x118),
246 	REG(0x1c0),
247 	REG(0x1c4),
248 	REG(0x1c8),
249 
250 	NOP(3),
251 	LRI(9, POSTED),
252 	REG16(0x3a8),
253 	REG16(0x28c),
254 	REG16(0x288),
255 	REG16(0x284),
256 	REG16(0x280),
257 	REG16(0x27c),
258 	REG16(0x278),
259 	REG16(0x274),
260 	REG16(0x270),
261 
262 	NOP(13),
263 	LRI(1, 0),
264 	REG(0x0c8),
265 
266 	END
267 };
268 
269 static const u8 gen9_rcs_offsets[] = {
270 	NOP(1),
271 	LRI(14, POSTED),
272 	REG16(0x244),
273 	REG(0x34),
274 	REG(0x30),
275 	REG(0x38),
276 	REG(0x3c),
277 	REG(0x168),
278 	REG(0x140),
279 	REG(0x110),
280 	REG(0x11c),
281 	REG(0x114),
282 	REG(0x118),
283 	REG(0x1c0),
284 	REG(0x1c4),
285 	REG(0x1c8),
286 
287 	NOP(3),
288 	LRI(9, POSTED),
289 	REG16(0x3a8),
290 	REG16(0x28c),
291 	REG16(0x288),
292 	REG16(0x284),
293 	REG16(0x280),
294 	REG16(0x27c),
295 	REG16(0x278),
296 	REG16(0x274),
297 	REG16(0x270),
298 
299 	NOP(13),
300 	LRI(1, 0),
301 	REG(0xc8),
302 
303 	NOP(13),
304 	LRI(44, POSTED),
305 	REG(0x28),
306 	REG(0x9c),
307 	REG(0xc0),
308 	REG(0x178),
309 	REG(0x17c),
310 	REG16(0x358),
311 	REG(0x170),
312 	REG(0x150),
313 	REG(0x154),
314 	REG(0x158),
315 	REG16(0x41c),
316 	REG16(0x600),
317 	REG16(0x604),
318 	REG16(0x608),
319 	REG16(0x60c),
320 	REG16(0x610),
321 	REG16(0x614),
322 	REG16(0x618),
323 	REG16(0x61c),
324 	REG16(0x620),
325 	REG16(0x624),
326 	REG16(0x628),
327 	REG16(0x62c),
328 	REG16(0x630),
329 	REG16(0x634),
330 	REG16(0x638),
331 	REG16(0x63c),
332 	REG16(0x640),
333 	REG16(0x644),
334 	REG16(0x648),
335 	REG16(0x64c),
336 	REG16(0x650),
337 	REG16(0x654),
338 	REG16(0x658),
339 	REG16(0x65c),
340 	REG16(0x660),
341 	REG16(0x664),
342 	REG16(0x668),
343 	REG16(0x66c),
344 	REG16(0x670),
345 	REG16(0x674),
346 	REG16(0x678),
347 	REG16(0x67c),
348 	REG(0x68),
349 
350 	END
351 };
352 
353 static const u8 gen11_rcs_offsets[] = {
354 	NOP(1),
355 	LRI(15, POSTED),
356 	REG16(0x244),
357 	REG(0x034),
358 	REG(0x030),
359 	REG(0x038),
360 	REG(0x03c),
361 	REG(0x168),
362 	REG(0x140),
363 	REG(0x110),
364 	REG(0x11c),
365 	REG(0x114),
366 	REG(0x118),
367 	REG(0x1c0),
368 	REG(0x1c4),
369 	REG(0x1c8),
370 	REG(0x180),
371 
372 	NOP(1),
373 	LRI(9, POSTED),
374 	REG16(0x3a8),
375 	REG16(0x28c),
376 	REG16(0x288),
377 	REG16(0x284),
378 	REG16(0x280),
379 	REG16(0x27c),
380 	REG16(0x278),
381 	REG16(0x274),
382 	REG16(0x270),
383 
384 	LRI(1, POSTED),
385 	REG(0x1b0),
386 
387 	NOP(10),
388 	LRI(1, 0),
389 	REG(0x0c8),
390 
391 	END
392 };
393 
394 static const u8 gen12_rcs_offsets[] = {
395 	NOP(1),
396 	LRI(13, POSTED),
397 	REG16(0x244),
398 	REG(0x034),
399 	REG(0x030),
400 	REG(0x038),
401 	REG(0x03c),
402 	REG(0x168),
403 	REG(0x140),
404 	REG(0x110),
405 	REG(0x1c0),
406 	REG(0x1c4),
407 	REG(0x1c8),
408 	REG(0x180),
409 	REG16(0x2b4),
410 
411 	NOP(5),
412 	LRI(9, POSTED),
413 	REG16(0x3a8),
414 	REG16(0x28c),
415 	REG16(0x288),
416 	REG16(0x284),
417 	REG16(0x280),
418 	REG16(0x27c),
419 	REG16(0x278),
420 	REG16(0x274),
421 	REG16(0x270),
422 
423 	LRI(3, POSTED),
424 	REG(0x1b0),
425 	REG16(0x5a8),
426 	REG16(0x5ac),
427 
428 	NOP(6),
429 	LRI(1, 0),
430 	REG(0x0c8),
431 	NOP(3 + 9 + 1),
432 
433 	LRI(51, POSTED),
434 	REG16(0x588),
435 	REG16(0x588),
436 	REG16(0x588),
437 	REG16(0x588),
438 	REG16(0x588),
439 	REG16(0x588),
440 	REG(0x028),
441 	REG(0x09c),
442 	REG(0x0c0),
443 	REG(0x178),
444 	REG(0x17c),
445 	REG16(0x358),
446 	REG(0x170),
447 	REG(0x150),
448 	REG(0x154),
449 	REG(0x158),
450 	REG16(0x41c),
451 	REG16(0x600),
452 	REG16(0x604),
453 	REG16(0x608),
454 	REG16(0x60c),
455 	REG16(0x610),
456 	REG16(0x614),
457 	REG16(0x618),
458 	REG16(0x61c),
459 	REG16(0x620),
460 	REG16(0x624),
461 	REG16(0x628),
462 	REG16(0x62c),
463 	REG16(0x630),
464 	REG16(0x634),
465 	REG16(0x638),
466 	REG16(0x63c),
467 	REG16(0x640),
468 	REG16(0x644),
469 	REG16(0x648),
470 	REG16(0x64c),
471 	REG16(0x650),
472 	REG16(0x654),
473 	REG16(0x658),
474 	REG16(0x65c),
475 	REG16(0x660),
476 	REG16(0x664),
477 	REG16(0x668),
478 	REG16(0x66c),
479 	REG16(0x670),
480 	REG16(0x674),
481 	REG16(0x678),
482 	REG16(0x67c),
483 	REG(0x068),
484 	REG(0x084),
485 	NOP(1),
486 
487 	END
488 };
489 
490 #undef END
491 #undef REG16
492 #undef REG
493 #undef LRI
494 #undef NOP
495 
496 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
497 {
498 	/*
499 	 * The gen12+ lists only have the registers we program in the basic
500 	 * default state. We rely on the context image using relative
501 	 * addressing to automatically fix up the register state between the
502 	 * physical engines for a virtual engine.
503 	 */
504 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
505 		   !intel_engine_has_relative_mmio(engine));
506 
507 	if (engine->class == RENDER_CLASS) {
508 		if (INTEL_GEN(engine->i915) >= 12)
509 			return gen12_rcs_offsets;
510 		else if (INTEL_GEN(engine->i915) >= 11)
511 			return gen11_rcs_offsets;
512 		else if (INTEL_GEN(engine->i915) >= 9)
513 			return gen9_rcs_offsets;
514 		else
515 			return gen8_rcs_offsets;
516 	} else {
517 		if (INTEL_GEN(engine->i915) >= 12)
518 			return gen12_xcs_offsets;
519 		else if (INTEL_GEN(engine->i915) >= 9)
520 			return gen9_xcs_offsets;
521 		else
522 			return gen8_xcs_offsets;
523 	}
524 }
525 
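/*
 * Note (added): the following lrc_ring_*() helpers return the dword index of
 * a given (register offset, value) pair inside the register state page laid
 * out by set_offsets() above: regs[x] holds the MMIO offset and regs[x + 1]
 * the value to be loaded, or -1 when the pair is not present for this
 * engine/gen combination. lrc_ring_indirect_offset_default() instead returns
 * the per-gen default INDIRECT_CTX_OFFSET value.
 */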
526 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
527 {
528 	if (INTEL_GEN(engine->i915) >= 12)
529 		return 0x60;
530 	else if (INTEL_GEN(engine->i915) >= 9)
531 		return 0x54;
532 	else if (engine->class == RENDER_CLASS)
533 		return 0x58;
534 	else
535 		return -1;
536 }
537 
538 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
539 {
540 	if (INTEL_GEN(engine->i915) >= 12)
541 		return 0x74;
542 	else if (INTEL_GEN(engine->i915) >= 9)
543 		return 0x68;
544 	else if (engine->class == RENDER_CLASS)
545 		return 0xd8;
546 	else
547 		return -1;
548 }
549 
550 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
551 {
552 	if (INTEL_GEN(engine->i915) >= 12)
553 		return 0x12;
554 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
555 		return 0x18;
556 	else
557 		return -1;
558 }
559 
560 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
561 {
562 	int x;
563 
564 	x = lrc_ring_wa_bb_per_ctx(engine);
565 	if (x < 0)
566 		return x;
567 
568 	return x + 2;
569 }
570 
571 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
572 {
573 	int x;
574 
575 	x = lrc_ring_indirect_ptr(engine);
576 	if (x < 0)
577 		return x;
578 
579 	return x + 2;
580 }
581 
582 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
583 {
584 	if (engine->class != RENDER_CLASS)
585 		return -1;
586 
587 	if (INTEL_GEN(engine->i915) >= 12)
588 		return 0xb6;
589 	else if (INTEL_GEN(engine->i915) >= 11)
590 		return 0xaa;
591 	else
592 		return -1;
593 }
594 
595 static u32
596 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
597 {
598 	switch (INTEL_GEN(engine->i915)) {
599 	default:
600 		MISSING_CASE(INTEL_GEN(engine->i915));
601 		fallthrough;
602 	case 12:
603 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
604 	case 11:
605 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
606 	case 10:
607 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
608 	case 9:
609 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
610 	case 8:
611 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
612 	}
613 }
614 
615 static void
616 lrc_setup_indirect_ctx(u32 *regs,
617 		       const struct intel_engine_cs *engine,
618 		       u32 ctx_bb_ggtt_addr,
619 		       u32 size)
620 {
621 	GEM_BUG_ON(!size);
622 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
623 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
624 	regs[lrc_ring_indirect_ptr(engine) + 1] =
625 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
626 
627 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
628 	regs[lrc_ring_indirect_offset(engine) + 1] =
629 		lrc_ring_indirect_offset_default(engine) << 6;
630 }
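/*
 * Example (illustrative values, added note): for an indirect context batch
 * at GGTT address 0x2000 that is 128 bytes (two cachelines) long, the
 * INDIRECT_CTX dword above becomes 0x2000 | (128 / 64) = 0x2002, i.e. the
 * cacheline-aligned address with the length in cachelines packed into the
 * low bits, while the companion offset register gets the per-gen
 * default << 6.
 */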
631 
632 static void init_common_regs(u32 * const regs,
633 			     const struct intel_context *ce,
634 			     const struct intel_engine_cs *engine,
635 			     bool inhibit)
636 {
637 	u32 ctl;
638 
639 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
640 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
641 	if (inhibit)
642 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
643 	if (INTEL_GEN(engine->i915) < 11)
644 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
645 					   CTX_CTRL_RS_CTX_ENABLE);
646 	regs[CTX_CONTEXT_CONTROL] = ctl;
647 
648 	regs[CTX_TIMESTAMP] = ce->runtime.last;
649 }
650 
651 static void init_wa_bb_regs(u32 * const regs,
652 			    const struct intel_engine_cs *engine)
653 {
654 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
655 
656 	if (wa_ctx->per_ctx.size) {
657 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
658 
659 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
660 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
661 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
662 	}
663 
664 	if (wa_ctx->indirect_ctx.size) {
665 		lrc_setup_indirect_ctx(regs, engine,
666 				       i915_ggtt_offset(wa_ctx->vma) +
667 				       wa_ctx->indirect_ctx.offset,
668 				       wa_ctx->indirect_ctx.size);
669 	}
670 }
671 
672 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
673 {
674 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
675 		/* 64b PPGTT (48bit canonical)
676 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
677 		 * other PDP Descriptors are ignored.
678 		 */
679 		ASSIGN_CTX_PML4(ppgtt, regs);
680 	} else {
681 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
682 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
683 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
684 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
685 	}
686 }
687 
688 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
689 {
690 	if (i915_is_ggtt(vm))
691 		return i915_vm_to_ggtt(vm)->alias;
692 	else
693 		return i915_vm_to_ppgtt(vm);
694 }
695 
696 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
697 {
698 	int x;
699 
700 	x = lrc_ring_mi_mode(engine);
701 	if (x != -1) {
702 		regs[x + 1] &= ~STOP_RING;
703 		regs[x + 1] |= STOP_RING << 16;
704 	}
705 }
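/*
 * Note (added): RING_MI_MODE is a masked register, so regs[x + 1] carries a
 * write mask in its upper 16 bits. Clearing STOP_RING in the low half while
 * setting STOP_RING << 16 as the mask makes the next context restore
 * explicitly unstop the ring rather than leave the bit untouched.
 */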
706 
707 static void __lrc_init_regs(u32 *regs,
708 			    const struct intel_context *ce,
709 			    const struct intel_engine_cs *engine,
710 			    bool inhibit)
711 {
712 	/*
713 	 * A context is actually a big batch buffer with several
714 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
715 	 * values we are setting here are only for the first context restore:
716 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
717 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
718 	 * we are not initializing here).
719 	 *
720 	 * Must keep consistent with virtual_update_register_offsets().
721 	 */
722 
723 	if (inhibit)
724 		memset(regs, 0, PAGE_SIZE);
725 
726 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
727 
728 	init_common_regs(regs, ce, engine, inhibit);
729 	init_ppgtt_regs(regs, vm_alias(ce->vm));
730 
731 	init_wa_bb_regs(regs, engine);
732 
733 	__reset_stop_ring(regs, engine);
734 }
735 
736 void lrc_init_regs(const struct intel_context *ce,
737 		   const struct intel_engine_cs *engine,
738 		   bool inhibit)
739 {
740 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
741 }
742 
743 void lrc_reset_regs(const struct intel_context *ce,
744 		    const struct intel_engine_cs *engine)
745 {
746 	__reset_stop_ring(ce->lrc_reg_state, engine);
747 }
748 
749 static void
750 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
751 {
752 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
753 		return;
754 
755 	vaddr += engine->context_size;
756 
757 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
758 }
759 
760 static void
761 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
762 {
763 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
764 		return;
765 
766 	vaddr += engine->context_size;
767 
768 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
769 		drm_err_once(&engine->i915->drm,
770 			     "%s context redzone overwritten!\n",
771 			     engine->name);
772 }
773 
774 void lrc_init_state(struct intel_context *ce,
775 		    struct intel_engine_cs *engine,
776 		    void *state)
777 {
778 	bool inhibit = true;
779 
780 	set_redzone(state, engine);
781 
782 	if (engine->default_state) {
783 		shmem_read(engine->default_state, 0,
784 			   state, engine->context_size);
785 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
786 		inhibit = false;
787 	}
788 
789 	/* Clear the ppHWSP (inc. per-context counters) */
790 	memset(state, 0, PAGE_SIZE);
791 
792 	/*
793 	 * The second page of the context object contains some registers which
794 	 * must be set up prior to the first execution.
795 	 */
796 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
797 }
798 
799 static struct i915_vma *
800 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
801 {
802 	struct drm_i915_gem_object *obj;
803 	struct i915_vma *vma;
804 	u32 context_size;
805 
806 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
807 
808 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
809 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
810 
811 	if (INTEL_GEN(engine->i915) == 12) {
812 		ce->wa_bb_page = context_size / PAGE_SIZE;
813 		context_size += PAGE_SIZE;
814 	}
815 
816 	obj = i915_gem_object_create_shmem(engine->i915, context_size);
817 	if (IS_ERR(obj))
818 		return ERR_CAST(obj);
819 
820 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
821 	if (IS_ERR(vma)) {
822 		i915_gem_object_put(obj);
823 		return vma;
824 	}
825 
826 	return vma;
827 }
828 
829 static struct intel_timeline *
830 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
831 {
832 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
833 
834 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
835 }
836 
837 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
838 {
839 	struct intel_ring *ring;
840 	struct i915_vma *vma;
841 	int err;
842 
843 	GEM_BUG_ON(ce->state);
844 
845 	vma = __lrc_alloc_state(ce, engine);
846 	if (IS_ERR(vma))
847 		return PTR_ERR(vma);
848 
849 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
850 	if (IS_ERR(ring)) {
851 		err = PTR_ERR(ring);
852 		goto err_vma;
853 	}
854 
855 	if (!page_mask_bits(ce->timeline)) {
856 		struct intel_timeline *tl;
857 
858 		/*
859 		 * Use the static global HWSP for the kernel context, and
860 		 * a dynamically allocated cacheline for everyone else.
861 		 */
862 		if (unlikely(ce->timeline))
863 			tl = pinned_timeline(ce, engine);
864 		else
865 			tl = intel_timeline_create(engine->gt);
866 		if (IS_ERR(tl)) {
867 			err = PTR_ERR(tl);
868 			goto err_ring;
869 		}
870 
871 		ce->timeline = tl;
872 	}
873 
874 	ce->ring = ring;
875 	ce->state = vma;
876 
877 	return 0;
878 
879 err_ring:
880 	intel_ring_put(ring);
881 err_vma:
882 	i915_vma_put(vma);
883 	return err;
884 }
885 
886 void lrc_reset(struct intel_context *ce)
887 {
888 	CE_TRACE(ce, "reset\n");
889 	GEM_BUG_ON(!intel_context_is_pinned(ce));
890 
891 	intel_ring_reset(ce->ring, ce->ring->emit);
892 
893 	/* Scrub away the garbage */
894 	lrc_init_regs(ce, ce->engine, true);
895 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
896 }
897 
898 int
899 lrc_pre_pin(struct intel_context *ce,
900 	    struct intel_engine_cs *engine,
901 	    struct i915_gem_ww_ctx *ww,
902 	    void **vaddr)
903 {
904 	GEM_BUG_ON(!ce->state);
905 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
906 
907 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
908 					 i915_coherent_map_type(ce->engine->i915) |
909 					 I915_MAP_OVERRIDE);
910 
911 	return PTR_ERR_OR_ZERO(*vaddr);
912 }
913 
914 int
915 lrc_pin(struct intel_context *ce,
916 	struct intel_engine_cs *engine,
917 	void *vaddr)
918 {
919 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
920 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
921 	return 0;
922 }
923 
924 void lrc_unpin(struct intel_context *ce)
925 {
926 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
927 		      ce->engine);
928 }
929 
930 void lrc_post_unpin(struct intel_context *ce)
931 {
932 	i915_gem_object_unpin_map(ce->state->obj);
933 }
934 
935 void lrc_fini(struct intel_context *ce)
936 {
937 	if (!ce->state)
938 		return;
939 
940 	intel_ring_put(fetch_and_zero(&ce->ring));
941 	i915_vma_put(fetch_and_zero(&ce->state));
942 }
943 
944 void lrc_destroy(struct kref *kref)
945 {
946 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
947 
948 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
949 	GEM_BUG_ON(intel_context_is_pinned(ce));
950 
951 	lrc_fini(ce);
952 
953 	intel_context_fini(ce);
954 	intel_context_free(ce);
955 }
956 
957 static u32 *
958 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
959 {
960 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
961 		MI_SRM_LRM_GLOBAL_GTT |
962 		MI_LRI_LRM_CS_MMIO;
963 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
964 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
965 		CTX_TIMESTAMP * sizeof(u32);
966 	*cs++ = 0;
967 
968 	*cs++ = MI_LOAD_REGISTER_REG |
969 		MI_LRR_SOURCE_CS_MMIO |
970 		MI_LRI_LRM_CS_MMIO;
971 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
972 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
973 
974 	*cs++ = MI_LOAD_REGISTER_REG |
975 		MI_LRR_SOURCE_CS_MMIO |
976 		MI_LRI_LRM_CS_MMIO;
977 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
978 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
979 
980 	return cs;
981 }
982 
983 static u32 *
984 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
985 {
986 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
987 
988 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
989 		MI_SRM_LRM_GLOBAL_GTT |
990 		MI_LRI_LRM_CS_MMIO;
991 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
992 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
993 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
994 	*cs++ = 0;
995 
996 	return cs;
997 }
998 
999 static u32 *
1000 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1001 {
1002 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1003 
1004 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1005 		MI_SRM_LRM_GLOBAL_GTT |
1006 		MI_LRI_LRM_CS_MMIO;
1007 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1008 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1009 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1010 	*cs++ = 0;
1011 
1012 	*cs++ = MI_LOAD_REGISTER_REG |
1013 		MI_LRR_SOURCE_CS_MMIO |
1014 		MI_LRI_LRM_CS_MMIO;
1015 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1016 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1017 
1018 	return cs;
1019 }
1020 
1021 static u32 *
1022 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1023 {
1024 	cs = gen12_emit_timestamp_wa(ce, cs);
1025 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1026 	cs = gen12_emit_restore_scratch(ce, cs);
1027 
1028 	return cs;
1029 }
1030 
1031 static u32 *
1032 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1033 {
1034 	cs = gen12_emit_timestamp_wa(ce, cs);
1035 	cs = gen12_emit_restore_scratch(ce, cs);
1036 
1037 	return cs;
1038 }
1039 
1040 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
1041 {
1042 	return PAGE_SIZE * ce->wa_bb_page;
1043 }
1044 
1045 static u32 *context_indirect_bb(const struct intel_context *ce)
1046 {
1047 	void *ptr;
1048 
1049 	GEM_BUG_ON(!ce->wa_bb_page);
1050 
1051 	ptr = ce->lrc_reg_state;
1052 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1053 	ptr += context_wa_bb_offset(ce);
1054 
1055 	return ptr;
1056 }
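/*
 * Rough layout of the context object as used here (descriptive sketch,
 * added for clarity):
 *
 *   page 0                        per-process HWSP (scratch, counters)
 *   LRC_STATE_OFFSET onwards      register state image built by set_offsets()
 *   engine->context_size          redzone page (CONFIG_DRM_I915_DEBUG_GEM only)
 *   ce->wa_bb_page * PAGE_SIZE    per-context indirect wa batch (gen12 only)
 */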
1057 
1058 static void
1059 setup_indirect_ctx_bb(const struct intel_context *ce,
1060 		      const struct intel_engine_cs *engine,
1061 		      u32 *(*emit)(const struct intel_context *, u32 *))
1062 {
1063 	u32 * const start = context_indirect_bb(ce);
1064 	u32 *cs;
1065 
1066 	cs = emit(ce, start);
1067 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1068 	while ((unsigned long)cs % CACHELINE_BYTES)
1069 		*cs++ = MI_NOOP;
1070 
1071 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1072 			       i915_ggtt_offset(ce->state) +
1073 			       context_wa_bb_offset(ce),
1074 			       (cs - start) * sizeof(*cs));
1075 }
1076 
1077 /*
1078  * The context descriptor encodes various attributes of a context,
1079  * including its GTT address and some flags. Because it's fairly
1080  * expensive to calculate, we'll just do it once and cache the result,
1081  * which remains valid until the context is unpinned.
1082  *
1083  * This is what a descriptor looks like, from LSB to MSB::
1084  *
1085  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1086  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1087  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1088  *      bits 53-54:    mbz, reserved for use by hardware
1089  *      bits 55-63:    group ID, currently unused and set to 0
1090  *
1091  * Starting from Gen11, the upper dword of the descriptor has a new format:
1092  *
1093  *      bits 32-36:    reserved
1094  *      bits 37-47:    SW context ID
1095  *      bits 48-53:    engine instance
1096  *      bit 54:        mbz, reserved for use by hardware
1097  *      bits 55-60:    SW counter
1098  *      bits 61-63:    engine class
1099  *
1100  * engine info, SW context ID and SW counter need to form a unique number
1101  * (Context ID) per lrc.
1102  */
1103 static inline u32 lrc_descriptor(const struct intel_context *ce)
1104 {
1105 	u32 desc;
1106 
1107 	desc = INTEL_LEGACY_32B_CONTEXT;
1108 	if (i915_vm_is_4lvl(ce->vm))
1109 		desc = INTEL_LEGACY_64B_CONTEXT;
1110 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1111 
1112 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1113 	if (IS_GEN(ce->vm->i915, 8))
1114 		desc |= GEN8_CTX_L3LLC_COHERENT;
1115 
1116 	return i915_ggtt_offset(ce->state) | desc;
1117 }
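/*
 * Example (symbolic, added note): for a context whose state object sits at
 * GGTT offset 0x00100000 (an assumed value) in a 4-level VM, the descriptor
 * above is
 *
 *   0x00100000 |
 *   (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT) |
 *   GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE
 *
 * with GEN8_CTX_L3LLC_COHERENT OR'ed in additionally on gen8.
 */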
1118 
1119 u32 lrc_update_regs(const struct intel_context *ce,
1120 		    const struct intel_engine_cs *engine,
1121 		    u32 head)
1122 {
1123 	struct intel_ring *ring = ce->ring;
1124 	u32 *regs = ce->lrc_reg_state;
1125 
1126 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1127 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1128 
1129 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1130 	regs[CTX_RING_HEAD] = head;
1131 	regs[CTX_RING_TAIL] = ring->tail;
1132 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1133 
1134 	/* RPCS */
1135 	if (engine->class == RENDER_CLASS) {
1136 		regs[CTX_R_PWR_CLK_STATE] =
1137 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1138 
1139 		i915_oa_init_reg_state(ce, engine);
1140 	}
1141 
1142 	if (ce->wa_bb_page) {
1143 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1144 
1145 		fn = gen12_emit_indirect_ctx_xcs;
1146 		if (ce->engine->class == RENDER_CLASS)
1147 			fn = gen12_emit_indirect_ctx_rcs;
1148 
1149 		/* Mutually exclusive with the global indirect bb */
1150 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1151 		setup_indirect_ctx_bb(ce, engine, fn);
1152 	}
1153 
1154 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1155 }
1156 
1157 void lrc_update_offsets(struct intel_context *ce,
1158 			struct intel_engine_cs *engine)
1159 {
1160 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1161 }
1162 
1163 void lrc_check_regs(const struct intel_context *ce,
1164 		    const struct intel_engine_cs *engine,
1165 		    const char *when)
1166 {
1167 	const struct intel_ring *ring = ce->ring;
1168 	u32 *regs = ce->lrc_reg_state;
1169 	bool valid = true;
1170 	int x;
1171 
1172 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1173 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1174 		       engine->name,
1175 		       regs[CTX_RING_START],
1176 		       i915_ggtt_offset(ring->vma));
1177 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1178 		valid = false;
1179 	}
1180 
1181 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1182 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1183 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1184 		       engine->name,
1185 		       regs[CTX_RING_CTL],
1186 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1187 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1188 		valid = false;
1189 	}
1190 
1191 	x = lrc_ring_mi_mode(engine);
1192 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1193 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1194 		       engine->name, regs[x + 1]);
1195 		regs[x + 1] &= ~STOP_RING;
1196 		regs[x + 1] |= STOP_RING << 16;
1197 		valid = false;
1198 	}
1199 
1200 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1201 }
1202 
1203 /*
1204  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1205  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1206  * but there is a slight complication as this is applied in a WA batch where the
1207  * values are only initialized once, so we cannot take the register value at the
1208  * beginning and reuse it further; hence we save its value to memory, upload a
1209  * constant value with bit 21 set and then restore it from the saved value.
1210  * To simplify the WA, a constant value is formed by using the default value
1211  * of this register. This shouldn't be a problem because we are only modifying
1212  * it for a short period and this batch is non-preemptible. We can of course
1213  * use additional instructions that read the actual value of the register
1214  * at that time and set our bit of interest but it makes the WA complicated.
1215  *
1216  * This WA is also required for Gen9 so extracting as a function avoids
1217  * code duplication.
1218  */
1219 static u32 *
1220 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1221 {
1222 	/* NB no one else is allowed to scribble over scratch + 256! */
1223 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1224 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1225 	*batch++ = intel_gt_scratch_offset(engine->gt,
1226 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1227 	*batch++ = 0;
1228 
1229 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1230 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1231 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1232 
1233 	batch = gen8_emit_pipe_control(batch,
1234 				       PIPE_CONTROL_CS_STALL |
1235 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1236 				       0);
1237 
1238 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1239 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1240 	*batch++ = intel_gt_scratch_offset(engine->gt,
1241 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1242 	*batch++ = 0;
1243 
1244 	return batch;
1245 }
1246 
1247 /*
1248  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1249  * initialized at the beginning and shared across all contexts, but this field
1250  * helps us to have multiple batches at different offsets and select them based
1251  * on some criteria. At the moment this batch always starts at the beginning of
1252  * the page and we don't have multiple wa_ctx batch buffers.
1253  *
1254  * The number of WAs applied is not known at the beginning; we use this field
1255  * to return the number of DWORDs written.
1256  *
1257  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
1258  * so it adds NOOPs as padding to make it cacheline aligned.
1259  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and both of them
1260  * together make a complete batch buffer.
1261  */
1262 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1263 {
1264 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1265 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1266 
1267 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1268 	if (IS_BROADWELL(engine->i915))
1269 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1270 
1271 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1272 	/* Actual scratch location is at 128 bytes offset */
1273 	batch = gen8_emit_pipe_control(batch,
1274 				       PIPE_CONTROL_FLUSH_L3 |
1275 				       PIPE_CONTROL_STORE_DATA_INDEX |
1276 				       PIPE_CONTROL_CS_STALL |
1277 				       PIPE_CONTROL_QW_WRITE,
1278 				       LRC_PPHWSP_SCRATCH_ADDR);
1279 
1280 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1281 
1282 	/* Pad to end of cacheline */
1283 	while ((unsigned long)batch % CACHELINE_BYTES)
1284 		*batch++ = MI_NOOP;
1285 
1286 	/*
1287 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1288 	 * execution depends on the length specified in terms of cache lines
1289 	 * in the register CTX_RCS_INDIRECT_CTX
1290 	 */
1291 
1292 	return batch;
1293 }
1294 
1295 struct lri {
1296 	i915_reg_t reg;
1297 	u32 value;
1298 };
1299 
1300 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1301 {
1302 	GEM_BUG_ON(!count || count > 63);
1303 
1304 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1305 	do {
1306 		*batch++ = i915_mmio_reg_offset(lri->reg);
1307 		*batch++ = lri->value;
1308 	} while (lri++, --count);
1309 	*batch++ = MI_NOOP;
1310 
1311 	return batch;
1312 }
1313 
1314 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1315 {
1316 	static const struct lri lri[] = {
1317 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1318 		{
1319 			COMMON_SLICE_CHICKEN2,
1320 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1321 				       0),
1322 		},
1323 
1324 		/* BSpec: 11391 */
1325 		{
1326 			FF_SLICE_CHICKEN,
1327 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1328 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1329 		},
1330 
1331 		/* BSpec: 11299 */
1332 		{
1333 			_3D_CHICKEN3,
1334 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1335 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1336 		}
1337 	};
1338 
1339 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1340 
1341 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1342 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1343 
1344 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1345 	batch = gen8_emit_pipe_control(batch,
1346 				       PIPE_CONTROL_FLUSH_L3 |
1347 				       PIPE_CONTROL_STORE_DATA_INDEX |
1348 				       PIPE_CONTROL_CS_STALL |
1349 				       PIPE_CONTROL_QW_WRITE,
1350 				       LRC_PPHWSP_SCRATCH_ADDR);
1351 
1352 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1353 
1354 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1355 	if (HAS_POOLED_EU(engine->i915)) {
1356 		/*
1357 		 * EU pool configuration is set up along with the golden
1358 		 * context during context initialization. This value depends
1359 		 * on device type (2x6 or 3x6) and needs to be updated based
1360 		 * on which subslice is disabled, especially for 2x6 devices.
1361 		 * However, it is safe to load the default configuration of a
1362 		 * 3x6 device instead of masking off the corresponding bits,
1363 		 * because HW ignores bits of a disabled subslice and drops
1364 		 * down to the appropriate config. Please see
1365 		 * render_state_setup() in i915_gem_render_state.c for the
1366 		 * possible configurations; to avoid duplication they are
1367 		 * not repeated here.
1368 		 */
1369 		*batch++ = GEN9_MEDIA_POOL_STATE;
1370 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1371 		*batch++ = 0x00777000;
1372 		*batch++ = 0;
1373 		*batch++ = 0;
1374 		*batch++ = 0;
1375 	}
1376 
1377 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1378 
1379 	/* Pad to end of cacheline */
1380 	while ((unsigned long)batch % CACHELINE_BYTES)
1381 		*batch++ = MI_NOOP;
1382 
1383 	return batch;
1384 }
1385 
1386 static u32 *
1387 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1388 {
1389 	int i;
1390 
1391 	/*
1392 	 * WaPipeControlBefore3DStateSamplePattern: cnl
1393 	 *
1394 	 * Ensure the engine is idle prior to programming a
1395 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1396 	 */
1397 	batch = gen8_emit_pipe_control(batch,
1398 				       PIPE_CONTROL_CS_STALL,
1399 				       0);
1400 	/*
1401 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1402 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1403 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1404 	 * confusing. Since gen8_emit_pipe_control() already advances the
1405 	 * batch by 6 dwords, we advance the other 10 here, completing a
1406 	 * cacheline. It's not clear if the workaround requires this padding
1407 	 * before other commands, or if it's just the regular padding we would
1408 	 * already have for the workaround bb, so leave it here for now.
1409 	 */
1410 	for (i = 0; i < 10; i++)
1411 		*batch++ = MI_NOOP;
1412 
1413 	/* Pad to end of cacheline */
1414 	while ((unsigned long)batch % CACHELINE_BYTES)
1415 		*batch++ = MI_NOOP;
1416 
1417 	return batch;
1418 }
1419 
1420 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1421 
1422 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
1423 {
1424 	struct drm_i915_gem_object *obj;
1425 	struct i915_vma *vma;
1426 	int err;
1427 
1428 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1429 	if (IS_ERR(obj))
1430 		return PTR_ERR(obj);
1431 
1432 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1433 	if (IS_ERR(vma)) {
1434 		err = PTR_ERR(vma);
1435 		goto err;
1436 	}
1437 
1438 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
1439 	if (err)
1440 		goto err;
1441 
1442 	engine->wa_ctx.vma = vma;
1443 	return 0;
1444 
1445 err:
1446 	i915_gem_object_put(obj);
1447 	return err;
1448 }
1449 
1450 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1451 {
1452 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1453 }
1454 
1455 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1456 
1457 int lrc_init_wa_ctx(struct intel_engine_cs *engine)
1458 {
1459 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1460 	struct i915_wa_ctx_bb *wa_bb[] = {
1461 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1462 	};
1463 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1464 	void *batch, *batch_ptr;
1465 	unsigned int i;
1466 	int ret;
1467 
1468 	if (engine->class != RENDER_CLASS)
1469 		return 0;
1470 
1471 	switch (INTEL_GEN(engine->i915)) {
1472 	case 12:
1473 	case 11:
1474 		return 0;
1475 	case 10:
1476 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
1477 		wa_bb_fn[1] = NULL;
1478 		break;
1479 	case 9:
1480 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1481 		wa_bb_fn[1] = NULL;
1482 		break;
1483 	case 8:
1484 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1485 		wa_bb_fn[1] = NULL;
1486 		break;
1487 	default:
1488 		MISSING_CASE(INTEL_GEN(engine->i915));
1489 		return 0;
1490 	}
1491 
1492 	ret = lrc_setup_wa_ctx(engine);
1493 	if (ret) {
1494 		drm_dbg(&engine->i915->drm,
1495 			"Failed to setup context WA page: %d\n", ret);
1496 		return ret;
1497 	}
1498 
1499 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1500 
1501 	/*
1502 	 * Emit the two workaround batch buffers, recording the offset from the
1503 	 * start of the workaround batch buffer object for each and their
1504 	 * respective sizes.
1505 	 */
1506 	batch_ptr = batch;
1507 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1508 		wa_bb[i]->offset = batch_ptr - batch;
1509 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1510 						  CACHELINE_BYTES))) {
1511 			ret = -EINVAL;
1512 			break;
1513 		}
1514 		if (wa_bb_fn[i])
1515 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1516 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1517 	}
1518 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1519 
1520 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1521 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1522 	if (ret)
1523 		lrc_fini_wa_ctx(engine);
1524 
1525 	return ret;
1526 }
1527 
1528 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1529 {
1530 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1531 	ce->runtime.num_underflow++;
1532 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1533 #endif
1534 }
1535 
1536 void lrc_update_runtime(struct intel_context *ce)
1537 {
1538 	u32 old;
1539 	s32 dt;
1540 
1541 	if (intel_context_is_barrier(ce))
1542 		return;
1543 
1544 	old = ce->runtime.last;
1545 	ce->runtime.last = lrc_get_runtime(ce);
1546 	dt = ce->runtime.last - old;
1547 
1548 	if (unlikely(dt < 0)) {
1549 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1550 			 old, ce->runtime.last, dt);
1551 		st_update_runtime_underflow(ce, dt);
1552 		return;
1553 	}
1554 
1555 	ewma_runtime_add(&ce->runtime.avg, dt);
1556 	ce->runtime.total += dt;
1557 }
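/*
 * Note (added): dt above is an unsigned 32-bit difference interpreted as
 * signed, so a CTX_TIMESTAMP snapshot that has gone backwards (or has not
 * been updated yet) shows up as dt < 0 and is only traced/counted as an
 * underflow instead of being folded into the average and total.
 */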
1558 
1559 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1560 #include "selftest_lrc.c"
1561 #endif
1562