xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision b1a792601f264df7172a728f1a83a05b6b399dfb)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gen8_engine_cs.h"
7 #include "i915_drv.h"
8 #include "i915_perf.h"
9 #include "intel_engine.h"
10 #include "intel_gpu_commands.h"
11 #include "intel_gt.h"
12 #include "intel_lrc.h"
13 #include "intel_lrc_reg.h"
14 #include "intel_ring.h"
15 #include "shmem_utils.h"
16 
17 static void set_offsets(u32 *regs,
18 			const u8 *data,
19 			const struct intel_engine_cs *engine,
20 			bool close)
21 #define NOP(x) (BIT(7) | (x))
22 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
23 #define POSTED BIT(0)
24 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
25 #define REG16(x) \
26 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
27 	(((x) >> 2) & 0x7f)
28 #define END 0
29 {
30 	const u32 base = engine->mmio_base;
31 
32 	while (*data) {
33 		u8 count, flags;
34 
35 		if (*data & BIT(7)) { /* skip */
36 			count = *data++ & ~BIT(7);
37 			regs += count;
38 			continue;
39 		}
40 
41 		count = *data & 0x3f;
42 		flags = *data >> 6;
43 		data++;
44 
45 		*regs = MI_LOAD_REGISTER_IMM(count);
46 		if (flags & POSTED)
47 			*regs |= MI_LRI_FORCE_POSTED;
48 		if (INTEL_GEN(engine->i915) >= 11)
49 			*regs |= MI_LRI_LRM_CS_MMIO;
50 		regs++;
51 
52 		GEM_BUG_ON(!count);
53 		do {
54 			u32 offset = 0;
55 			u8 v;
56 
57 			do {
58 				v = *data++;
59 				offset <<= 7;
60 				offset |= v & ~BIT(7);
61 			} while (v & BIT(7));
62 
63 			regs[0] = base + (offset << 2);
64 			regs += 2;
65 		} while (--count);
66 	}
67 
68 	if (close) {
69 		/* Close the batch; used mainly by live_lrc_layout() */
70 		*regs = MI_BATCH_BUFFER_END;
71 		if (INTEL_GEN(engine->i915) >= 10)
72 			*regs |= BIT(0);
73 	}
74 }
75 
/*
 * Encoded register layout returned by reg_offsets() for non-render engines
 * when the gen is below 9. Decoded by set_offsets(): NOP(n) skips n dwords
 * of the image, LRI(n, flags) starts a load-register-immediate of n
 * registers, and each REG()/REG16() entry is one register offset relative
 * to the engine's mmio base.
 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0), /* 11 registers, POSTED not set */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0), /* 9 registers, POSTED not set */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0), /* 2 registers, POSTED not set */
	REG16(0x200),
	REG(0x028),

	END
};
110 
/*
 * Encoded register layout returned by reg_offsets() for non-render engines
 * on gen9 up to (but not including) gen12. Decoded by set_offsets(): NOP(n)
 * skips n dwords, LRI(n, flags) starts a load-register-immediate of n
 * registers (POSTED requests MI_LRI_FORCE_POSTED), and each REG()/REG16()
 * entry is one register offset relative to the engine's mmio base.
 */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED), /* 14 registers */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED), /* 9 registers */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED), /* 1 register */
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED), /* 44 registers */
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};
194 
/*
 * Encoded register layout returned by reg_offsets() for non-render engines
 * on gen12 and later. Decoded by set_offsets(): NOP(n) skips n dwords,
 * LRI(n, flags) starts a load-register-immediate of n registers (POSTED
 * requests MI_LRI_FORCE_POSTED), and each REG()/REG16() entry is one
 * register offset relative to the engine's mmio base.
 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED), /* 13 registers */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED), /* 9 registers */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
226 
/*
 * Encoded register layout returned by reg_offsets() for the render class
 * when the gen is below 9. Decoded by set_offsets(): NOP(n) skips n dwords,
 * LRI(n, flags) starts a load-register-immediate of n registers (POSTED
 * requests MI_LRI_FORCE_POSTED), and each REG()/REG16() entry is one
 * register offset relative to the engine's mmio base.
 */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED), /* 14 registers */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED), /* 9 registers */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0), /* 1 register, POSTED not set */
	REG(0x0c8),

	END
};
263 
/*
 * Encoded register layout returned by reg_offsets() for the render class on
 * gen9 up to (but not including) gen11. Decoded by set_offsets(): NOP(n)
 * skips n dwords, LRI(n, flags) starts a load-register-immediate of n
 * registers (POSTED requests MI_LRI_FORCE_POSTED), and each REG()/REG16()
 * entry is one register offset relative to the engine's mmio base.
 */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED), /* 14 registers */
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED), /* 9 registers */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0), /* 1 register, POSTED not set */
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED), /* 44 registers */
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};
347 
/*
 * Encoded register layout returned by reg_offsets() for the render class on
 * gen11. Decoded by set_offsets(): NOP(n) skips n dwords, LRI(n, flags)
 * starts a load-register-immediate of n registers (POSTED requests
 * MI_LRI_FORCE_POSTED), and each REG()/REG16() entry is one register offset
 * relative to the engine's mmio base.
 */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED), /* 15 registers */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED), /* 9 registers */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* No NOP here: this LRI follows the previous one directly */
	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0), /* 1 register, POSTED not set */
	REG(0x0c8),

	END
};
388 
/*
 * Encoded register layout returned by reg_offsets() for the render class on
 * gen12 and later. Decoded by set_offsets(): NOP(n) skips n dwords,
 * LRI(n, flags) starts a load-register-immediate of n registers (POSTED
 * requests MI_LRI_FORCE_POSTED), and each REG()/REG16() entry is one
 * register offset relative to the engine's mmio base.
 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED), /* 13 registers */
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED), /* 9 registers */
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	/* No NOP here: this LRI follows the previous one directly */
	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0), /* 1 register, POSTED not set */
	REG(0x0c8),
	NOP(3 + 9 + 1),

	/* 51 registers; the first six entries all name offset 0x588 */
	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};
484 
/*
 * The table-encoding helpers defined alongside set_offsets() are only needed
 * by the *_offsets[] tables above, so drop them again here.
 * NOTE(review): POSTED is defined next to these helpers but is not undefined
 * here - confirm whether that is intentional.
 */
#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP
490 
491 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
492 {
493 	/*
494 	 * The gen12+ lists only have the registers we program in the basic
495 	 * default state. We rely on the context image using relative
496 	 * addressing to automatic fixup the register state between the
497 	 * physical engines for virtual engine.
498 	 */
499 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
500 		   !intel_engine_has_relative_mmio(engine));
501 
502 	if (engine->class == RENDER_CLASS) {
503 		if (INTEL_GEN(engine->i915) >= 12)
504 			return gen12_rcs_offsets;
505 		else if (INTEL_GEN(engine->i915) >= 11)
506 			return gen11_rcs_offsets;
507 		else if (INTEL_GEN(engine->i915) >= 9)
508 			return gen9_rcs_offsets;
509 		else
510 			return gen8_rcs_offsets;
511 	} else {
512 		if (INTEL_GEN(engine->i915) >= 12)
513 			return gen12_xcs_offsets;
514 		else if (INTEL_GEN(engine->i915) >= 9)
515 			return gen9_xcs_offsets;
516 		else
517 			return gen8_xcs_offsets;
518 	}
519 }
520 
521 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
522 {
523 	if (INTEL_GEN(engine->i915) >= 12)
524 		return 0x60;
525 	else if (INTEL_GEN(engine->i915) >= 9)
526 		return 0x54;
527 	else if (engine->class == RENDER_CLASS)
528 		return 0x58;
529 	else
530 		return -1;
531 }
532 
533 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
534 {
535 	if (INTEL_GEN(engine->i915) >= 12)
536 		return 0x74;
537 	else if (INTEL_GEN(engine->i915) >= 9)
538 		return 0x68;
539 	else if (engine->class == RENDER_CLASS)
540 		return 0xd8;
541 	else
542 		return -1;
543 }
544 
545 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
546 {
547 	if (INTEL_GEN(engine->i915) >= 12)
548 		return 0x12;
549 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
550 		return 0x18;
551 	else
552 		return -1;
553 }
554 
/*
 * Index of the indirect-context pointer slot: two dwords past the
 * per-context workaround batch slot. A negative result from
 * lrc_ring_wa_bb_per_ctx() is passed through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	const int per_ctx = lrc_ring_wa_bb_per_ctx(engine);

	return per_ctx < 0 ? per_ctx : per_ctx + 2;
}
565 
/*
 * Index of the indirect-context offset slot: two dwords past the
 * indirect-context pointer slot. A negative result from
 * lrc_ring_indirect_ptr() is passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	const int ptr = lrc_ring_indirect_ptr(engine);

	return ptr < 0 ? ptr : ptr + 2;
}
576 
577 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
578 {
579 	if (engine->class != RENDER_CLASS)
580 		return -1;
581 
582 	if (INTEL_GEN(engine->i915) >= 12)
583 		return 0xb6;
584 	else if (INTEL_GEN(engine->i915) >= 11)
585 		return 0xaa;
586 	else
587 		return -1;
588 }
589 
590 static u32
591 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
592 {
593 	switch (INTEL_GEN(engine->i915)) {
594 	default:
595 		MISSING_CASE(INTEL_GEN(engine->i915));
596 		fallthrough;
597 	case 12:
598 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
599 	case 11:
600 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
601 	case 10:
602 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
603 	case 9:
604 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
605 	case 8:
606 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
607 	}
608 }
609 
610 static void
611 lrc_setup_indirect_ctx(u32 *regs,
612 		       const struct intel_engine_cs *engine,
613 		       u32 ctx_bb_ggtt_addr,
614 		       u32 size)
615 {
616 	GEM_BUG_ON(!size);
617 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
618 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
619 	regs[lrc_ring_indirect_ptr(engine) + 1] =
620 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
621 
622 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
623 	regs[lrc_ring_indirect_offset(engine) + 1] =
624 		lrc_ring_indirect_offset_default(engine) << 6;
625 }
626 
627 static void init_common_regs(u32 * const regs,
628 			     const struct intel_context *ce,
629 			     const struct intel_engine_cs *engine,
630 			     bool inhibit)
631 {
632 	u32 ctl;
633 
634 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
635 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
636 	if (inhibit)
637 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
638 	if (INTEL_GEN(engine->i915) < 11)
639 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
640 					   CTX_CTRL_RS_CTX_ENABLE);
641 	regs[CTX_CONTEXT_CONTROL] = ctl;
642 
643 	regs[CTX_TIMESTAMP] = ce->runtime.last;
644 }
645 
646 static void init_wa_bb_regs(u32 * const regs,
647 			    const struct intel_engine_cs *engine)
648 {
649 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
650 
651 	if (wa_ctx->per_ctx.size) {
652 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
653 
654 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
655 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
656 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
657 	}
658 
659 	if (wa_ctx->indirect_ctx.size) {
660 		lrc_setup_indirect_ctx(regs, engine,
661 				       i915_ggtt_offset(wa_ctx->vma) +
662 				       wa_ctx->indirect_ctx.offset,
663 				       wa_ctx->indirect_ctx.size);
664 	}
665 }
666 
667 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
668 {
669 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
670 		/* 64b PPGTT (48bit canonical)
671 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
672 		 * other PDP Descriptors are ignored.
673 		 */
674 		ASSIGN_CTX_PML4(ppgtt, regs);
675 	} else {
676 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
677 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
678 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
679 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
680 	}
681 }
682 
683 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
684 {
685 	if (i915_is_ggtt(vm))
686 		return i915_vm_to_ggtt(vm)->alias;
687 	else
688 		return i915_vm_to_ppgtt(vm);
689 }
690 
691 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
692 {
693 	int x;
694 
695 	x = lrc_ring_mi_mode(engine);
696 	if (x != -1) {
697 		regs[x + 1] &= ~STOP_RING;
698 		regs[x + 1] |= STOP_RING << 16;
699 	}
700 }
701 
702 static void __lrc_init_regs(u32 *regs,
703 			    const struct intel_context *ce,
704 			    const struct intel_engine_cs *engine,
705 			    bool inhibit)
706 {
707 	/*
708 	 * A context is actually a big batch buffer with several
709 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
710 	 * values we are setting here are only for the first context restore:
711 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
712 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
713 	 * we are not initializing here).
714 	 *
715 	 * Must keep consistent with virtual_update_register_offsets().
716 	 */
717 
718 	if (inhibit)
719 		memset(regs, 0, PAGE_SIZE);
720 
721 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
722 
723 	init_common_regs(regs, ce, engine, inhibit);
724 	init_ppgtt_regs(regs, vm_alias(ce->vm));
725 
726 	init_wa_bb_regs(regs, engine);
727 
728 	__reset_stop_ring(regs, engine);
729 }
730 
731 void lrc_init_regs(const struct intel_context *ce,
732 		   const struct intel_engine_cs *engine,
733 		   bool inhibit)
734 {
735 	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
736 }
737 
738 void lrc_reset_regs(const struct intel_context *ce,
739 		    const struct intel_engine_cs *engine)
740 {
741 	__reset_stop_ring(ce->lrc_reg_state, engine);
742 }
743 
744 static void
745 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
746 {
747 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
748 		return;
749 
750 	vaddr += engine->context_size;
751 
752 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
753 }
754 
755 static void
756 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
757 {
758 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
759 		return;
760 
761 	vaddr += engine->context_size;
762 
763 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
764 		drm_err_once(&engine->i915->drm,
765 			     "%s context redzone overwritten!\n",
766 			     engine->name);
767 }
768 
769 void lrc_init_state(struct intel_context *ce,
770 		    struct intel_engine_cs *engine,
771 		    void *state)
772 {
773 	bool inhibit = true;
774 
775 	set_redzone(state, engine);
776 
777 	if (engine->default_state) {
778 		shmem_read(engine->default_state, 0,
779 			   state, engine->context_size);
780 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
781 		inhibit = false;
782 	}
783 
784 	/* Clear the ppHWSP (inc. per-context counters) */
785 	memset(state, 0, PAGE_SIZE);
786 
787 	/*
788 	 * The second page of the context object contains some registers which
789 	 * must be set up prior to the first execution.
790 	 */
791 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
792 }
793 
794 static struct i915_vma *
795 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
796 {
797 	struct drm_i915_gem_object *obj;
798 	struct i915_vma *vma;
799 	u32 context_size;
800 
801 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
802 
803 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
804 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
805 
806 	if (INTEL_GEN(engine->i915) == 12) {
807 		ce->wa_bb_page = context_size / PAGE_SIZE;
808 		context_size += PAGE_SIZE;
809 	}
810 
811 	obj = i915_gem_object_create_shmem(engine->i915, context_size);
812 	if (IS_ERR(obj))
813 		return ERR_CAST(obj);
814 
815 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
816 	if (IS_ERR(vma)) {
817 		i915_gem_object_put(obj);
818 		return vma;
819 	}
820 
821 	return vma;
822 }
823 
824 static struct intel_timeline *
825 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
826 {
827 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
828 
829 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
830 }
831 
832 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
833 {
834 	struct intel_ring *ring;
835 	struct i915_vma *vma;
836 	int err;
837 
838 	GEM_BUG_ON(ce->state);
839 
840 	vma = __lrc_alloc_state(ce, engine);
841 	if (IS_ERR(vma))
842 		return PTR_ERR(vma);
843 
844 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
845 	if (IS_ERR(ring)) {
846 		err = PTR_ERR(ring);
847 		goto err_vma;
848 	}
849 
850 	if (!page_mask_bits(ce->timeline)) {
851 		struct intel_timeline *tl;
852 
853 		/*
854 		 * Use the static global HWSP for the kernel context, and
855 		 * a dynamically allocated cacheline for everyone else.
856 		 */
857 		if (unlikely(ce->timeline))
858 			tl = pinned_timeline(ce, engine);
859 		else
860 			tl = intel_timeline_create(engine->gt);
861 		if (IS_ERR(tl)) {
862 			err = PTR_ERR(tl);
863 			goto err_ring;
864 		}
865 
866 		ce->timeline = tl;
867 	}
868 
869 	ce->ring = ring;
870 	ce->state = vma;
871 
872 	return 0;
873 
874 err_ring:
875 	intel_ring_put(ring);
876 err_vma:
877 	i915_vma_put(vma);
878 	return err;
879 }
880 
881 void lrc_reset(struct intel_context *ce)
882 {
883 	GEM_BUG_ON(!intel_context_is_pinned(ce));
884 
885 	intel_ring_reset(ce->ring, ce->ring->emit);
886 
887 	/* Scrub away the garbage */
888 	lrc_init_regs(ce, ce->engine, true);
889 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
890 }
891 
892 int
893 lrc_pre_pin(struct intel_context *ce,
894 	    struct intel_engine_cs *engine,
895 	    struct i915_gem_ww_ctx *ww,
896 	    void **vaddr)
897 {
898 	GEM_BUG_ON(!ce->state);
899 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
900 
901 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
902 					 i915_coherent_map_type(ce->engine->i915) |
903 					 I915_MAP_OVERRIDE);
904 
905 	return PTR_ERR_OR_ZERO(*vaddr);
906 }
907 
908 int
909 lrc_pin(struct intel_context *ce,
910 	struct intel_engine_cs *engine,
911 	void *vaddr)
912 {
913 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
914 
915 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
916 		lrc_init_state(ce, engine, vaddr);
917 
918 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
919 	return 0;
920 }
921 
922 void lrc_unpin(struct intel_context *ce)
923 {
924 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
925 		      ce->engine);
926 }
927 
928 void lrc_post_unpin(struct intel_context *ce)
929 {
930 	i915_gem_object_unpin_map(ce->state->obj);
931 }
932 
933 void lrc_fini(struct intel_context *ce)
934 {
935 	if (!ce->state)
936 		return;
937 
938 	intel_ring_put(fetch_and_zero(&ce->ring));
939 	i915_vma_put(fetch_and_zero(&ce->state));
940 }
941 
942 void lrc_destroy(struct kref *kref)
943 {
944 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
945 
946 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
947 	GEM_BUG_ON(intel_context_is_pinned(ce));
948 
949 	lrc_fini(ce);
950 
951 	intel_context_fini(ce);
952 	intel_context_free(ce);
953 }
954 
955 static u32 *
956 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
957 {
958 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
959 		MI_SRM_LRM_GLOBAL_GTT |
960 		MI_LRI_LRM_CS_MMIO;
961 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
962 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
963 		CTX_TIMESTAMP * sizeof(u32);
964 	*cs++ = 0;
965 
966 	*cs++ = MI_LOAD_REGISTER_REG |
967 		MI_LRR_SOURCE_CS_MMIO |
968 		MI_LRI_LRM_CS_MMIO;
969 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
970 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
971 
972 	*cs++ = MI_LOAD_REGISTER_REG |
973 		MI_LRR_SOURCE_CS_MMIO |
974 		MI_LRI_LRM_CS_MMIO;
975 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
976 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
977 
978 	return cs;
979 }
980 
981 static u32 *
982 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
983 {
984 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
985 
986 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
987 		MI_SRM_LRM_GLOBAL_GTT |
988 		MI_LRI_LRM_CS_MMIO;
989 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
990 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
991 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
992 	*cs++ = 0;
993 
994 	return cs;
995 }
996 
997 static u32 *
998 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
999 {
1000 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1001 
1002 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1003 		MI_SRM_LRM_GLOBAL_GTT |
1004 		MI_LRI_LRM_CS_MMIO;
1005 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1006 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1007 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1008 	*cs++ = 0;
1009 
1010 	*cs++ = MI_LOAD_REGISTER_REG |
1011 		MI_LRR_SOURCE_CS_MMIO |
1012 		MI_LRI_LRM_CS_MMIO;
1013 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1014 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1015 
1016 	return cs;
1017 }
1018 
1019 static u32 *
1020 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1021 {
1022 	cs = gen12_emit_timestamp_wa(ce, cs);
1023 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1024 	cs = gen12_emit_restore_scratch(ce, cs);
1025 
1026 	return cs;
1027 }
1028 
1029 static u32 *
1030 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1031 {
1032 	cs = gen12_emit_timestamp_wa(ce, cs);
1033 	cs = gen12_emit_restore_scratch(ce, cs);
1034 
1035 	return cs;
1036 }
1037 
1038 static u32 context_wa_bb_offset(const struct intel_context *ce)
1039 {
1040 	return PAGE_SIZE * ce->wa_bb_page;
1041 }
1042 
1043 static u32 *context_indirect_bb(const struct intel_context *ce)
1044 {
1045 	void *ptr;
1046 
1047 	GEM_BUG_ON(!ce->wa_bb_page);
1048 
1049 	ptr = ce->lrc_reg_state;
1050 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1051 	ptr += context_wa_bb_offset(ce);
1052 
1053 	return ptr;
1054 }
1055 
1056 static void
1057 setup_indirect_ctx_bb(const struct intel_context *ce,
1058 		      const struct intel_engine_cs *engine,
1059 		      u32 *(*emit)(const struct intel_context *, u32 *))
1060 {
1061 	u32 * const start = context_indirect_bb(ce);
1062 	u32 *cs;
1063 
1064 	cs = emit(ce, start);
1065 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1066 	while ((unsigned long)cs % CACHELINE_BYTES)
1067 		*cs++ = MI_NOOP;
1068 
1069 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1070 			       i915_ggtt_offset(ce->state) +
1071 			       context_wa_bb_offset(ce),
1072 			       (cs - start) * sizeof(*cs));
1073 }
1074 
1075 /*
1076  * The context descriptor encodes various attributes of a context,
1077  * including its GTT address and some flags. Because it's fairly
1078  * expensive to calculate, we'll just do it once and cache the result,
1079  * which remains valid until the context is unpinned.
1080  *
1081  * This is what a descriptor looks like, from LSB to MSB::
1082  *
1083  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1084  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1085  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1086  *      bits 53-54:    mbz, reserved for use by hardware
1087  *      bits 55-63:    group ID, currently unused and set to 0
1088  *
1089  * Starting from Gen11, the upper dword of the descriptor has a new format:
1090  *
1091  *      bits 32-36:    reserved
1092  *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
1094  *      bit 54:        mbz, reserved for use by hardware
1095  *      bits 55-60:    SW counter
1096  *      bits 61-63:    engine class
1097  *
1098  * engine info, SW context ID and SW counter need to form a unique number
1099  * (Context ID) per lrc.
1100  */
1101 static u32 lrc_descriptor(const struct intel_context *ce)
1102 {
1103 	u32 desc;
1104 
1105 	desc = INTEL_LEGACY_32B_CONTEXT;
1106 	if (i915_vm_is_4lvl(ce->vm))
1107 		desc = INTEL_LEGACY_64B_CONTEXT;
1108 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1109 
1110 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1111 	if (IS_GEN(ce->vm->i915, 8))
1112 		desc |= GEN8_CTX_L3LLC_COHERENT;
1113 
1114 	return i915_ggtt_offset(ce->state) | desc;
1115 }
1116 
1117 u32 lrc_update_regs(const struct intel_context *ce,
1118 		    const struct intel_engine_cs *engine,
1119 		    u32 head)
1120 {
1121 	struct intel_ring *ring = ce->ring;
1122 	u32 *regs = ce->lrc_reg_state;
1123 
1124 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1125 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1126 
1127 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1128 	regs[CTX_RING_HEAD] = head;
1129 	regs[CTX_RING_TAIL] = ring->tail;
1130 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1131 
1132 	/* RPCS */
1133 	if (engine->class == RENDER_CLASS) {
1134 		regs[CTX_R_PWR_CLK_STATE] =
1135 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1136 
1137 		i915_oa_init_reg_state(ce, engine);
1138 	}
1139 
1140 	if (ce->wa_bb_page) {
1141 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1142 
1143 		fn = gen12_emit_indirect_ctx_xcs;
1144 		if (ce->engine->class == RENDER_CLASS)
1145 			fn = gen12_emit_indirect_ctx_rcs;
1146 
1147 		/* Mutually exclusive wrt to global indirect bb */
1148 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1149 		setup_indirect_ctx_bb(ce, engine, fn);
1150 	}
1151 
1152 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1153 }
1154 
1155 void lrc_update_offsets(struct intel_context *ce,
1156 			struct intel_engine_cs *engine)
1157 {
1158 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1159 }
1160 
1161 void lrc_check_regs(const struct intel_context *ce,
1162 		    const struct intel_engine_cs *engine,
1163 		    const char *when)
1164 {
1165 	const struct intel_ring *ring = ce->ring;
1166 	u32 *regs = ce->lrc_reg_state;
1167 	bool valid = true;
1168 	int x;
1169 
1170 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1171 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1172 		       engine->name,
1173 		       regs[CTX_RING_START],
1174 		       i915_ggtt_offset(ring->vma));
1175 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1176 		valid = false;
1177 	}
1178 
1179 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1180 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1181 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1182 		       engine->name,
1183 		       regs[CTX_RING_CTL],
1184 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1185 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1186 		valid = false;
1187 	}
1188 
1189 	x = lrc_ring_mi_mode(engine);
1190 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1191 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1192 		       engine->name, regs[x + 1]);
1193 		regs[x + 1] &= ~STOP_RING;
1194 		regs[x + 1] |= STOP_RING << 16;
1195 		valid = false;
1196 	}
1197 
1198 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1199 }
1200 
1201 /*
1202  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1203  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1204  * but there is a slight complication as this is applied in WA batch where the
1205  * values are only initialized once so we cannot take register value at the
1206  * beginning and reuse it further; hence we save its value to memory, upload a
1207  * constant value with bit21 set and then we restore it back with the saved value.
1208  * To simplify the WA, a constant value is formed by using the default value
1209  * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
1211  * use additional instructions that read the actual value of the register
1212  * at that time and set our bit of interest but it makes the WA complicated.
1213  *
1214  * This WA is also required for Gen9 so extracting as a function avoids
1215  * code duplication.
1216  */
1217 static u32 *
1218 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1219 {
1220 	/* NB no one else is allowed to scribble over scratch + 256! */
1221 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1222 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1223 	*batch++ = intel_gt_scratch_offset(engine->gt,
1224 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1225 	*batch++ = 0;
1226 
1227 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1228 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1229 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1230 
1231 	batch = gen8_emit_pipe_control(batch,
1232 				       PIPE_CONTROL_CS_STALL |
1233 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1234 				       0);
1235 
1236 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1237 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1238 	*batch++ = intel_gt_scratch_offset(engine->gt,
1239 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1240 	*batch++ = 0;
1241 
1242 	return batch;
1243 }
1244 
1245 /*
1246  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1247  * initialized at the beginning and shared across all contexts but this field
1248  * helps us to have multiple batches at different offsets and select them based
1249  * on a criteria. At the moment this batch always start at the beginning of the page
1250  * and at this point we don't have multiple wa_ctx batch buffers.
1251  *
1252  * The number of WA applied are not known at the beginning; we use this field
1253  * to return the no of DWORDS written.
1254  *
1255  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1256  * so it adds NOOPs as padding to make it cacheline aligned.
1257  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1258  * makes a complete batch buffer.
1259  */
1260 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1261 {
1262 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1263 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1264 
1265 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1266 	if (IS_BROADWELL(engine->i915))
1267 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1268 
1269 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1270 	/* Actual scratch location is at 128 bytes offset */
1271 	batch = gen8_emit_pipe_control(batch,
1272 				       PIPE_CONTROL_FLUSH_L3 |
1273 				       PIPE_CONTROL_STORE_DATA_INDEX |
1274 				       PIPE_CONTROL_CS_STALL |
1275 				       PIPE_CONTROL_QW_WRITE,
1276 				       LRC_PPHWSP_SCRATCH_ADDR);
1277 
1278 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1279 
1280 	/* Pad to end of cacheline */
1281 	while ((unsigned long)batch % CACHELINE_BYTES)
1282 		*batch++ = MI_NOOP;
1283 
1284 	/*
1285 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1286 	 * execution depends on the length specified in terms of cache lines
1287 	 * in the register CTX_RCS_INDIRECT_CTX
1288 	 */
1289 
1290 	return batch;
1291 }
1292 
1293 struct lri {
1294 	i915_reg_t reg;
1295 	u32 value;
1296 };
1297 
1298 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1299 {
1300 	GEM_BUG_ON(!count || count > 63);
1301 
1302 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1303 	do {
1304 		*batch++ = i915_mmio_reg_offset(lri->reg);
1305 		*batch++ = lri->value;
1306 	} while (lri++, --count);
1307 	*batch++ = MI_NOOP;
1308 
1309 	return batch;
1310 }
1311 
1312 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1313 {
1314 	static const struct lri lri[] = {
1315 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1316 		{
1317 			COMMON_SLICE_CHICKEN2,
1318 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1319 				       0),
1320 		},
1321 
1322 		/* BSpec: 11391 */
1323 		{
1324 			FF_SLICE_CHICKEN,
1325 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1326 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1327 		},
1328 
1329 		/* BSpec: 11299 */
1330 		{
1331 			_3D_CHICKEN3,
1332 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1333 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1334 		}
1335 	};
1336 
1337 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1338 
1339 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1340 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1341 
1342 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1343 	batch = gen8_emit_pipe_control(batch,
1344 				       PIPE_CONTROL_FLUSH_L3 |
1345 				       PIPE_CONTROL_STORE_DATA_INDEX |
1346 				       PIPE_CONTROL_CS_STALL |
1347 				       PIPE_CONTROL_QW_WRITE,
1348 				       LRC_PPHWSP_SCRATCH_ADDR);
1349 
1350 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1351 
1352 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1353 	if (HAS_POOLED_EU(engine->i915)) {
1354 		/*
1355 		 * EU pool configuration is setup along with golden context
1356 		 * during context initialization. This value depends on
1357 		 * device type (2x6 or 3x6) and needs to be updated based
1358 		 * on which subslice is disabled especially for 2x6
1359 		 * devices, however it is safe to load default
1360 		 * configuration of 3x6 device instead of masking off
1361 		 * corresponding bits because HW ignores bits of a disabled
1362 		 * subslice and drops down to appropriate config. Please
1363 		 * see render_state_setup() in i915_gem_render_state.c for
1364 		 * possible configurations, to avoid duplication they are
1365 		 * not shown here again.
1366 		 */
1367 		*batch++ = GEN9_MEDIA_POOL_STATE;
1368 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1369 		*batch++ = 0x00777000;
1370 		*batch++ = 0;
1371 		*batch++ = 0;
1372 		*batch++ = 0;
1373 	}
1374 
1375 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1376 
1377 	/* Pad to end of cacheline */
1378 	while ((unsigned long)batch % CACHELINE_BYTES)
1379 		*batch++ = MI_NOOP;
1380 
1381 	return batch;
1382 }
1383 
1384 static u32 *
1385 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1386 {
1387 	int i;
1388 
1389 	/*
1390 	 * WaPipeControlBefore3DStateSamplePattern: cnl
1391 	 *
1392 	 * Ensure the engine is idle prior to programming a
1393 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1394 	 */
1395 	batch = gen8_emit_pipe_control(batch,
1396 				       PIPE_CONTROL_CS_STALL,
1397 				       0);
1398 	/*
1399 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1400 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1401 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1402 	 * confusing. Since gen8_emit_pipe_control() already advances the
1403 	 * batch by 6 dwords, we advance the other 10 here, completing a
1404 	 * cacheline. It's not clear if the workaround requires this padding
1405 	 * before other commands, or if it's just the regular padding we would
1406 	 * already have for the workaround bb, so leave it here for now.
1407 	 */
1408 	for (i = 0; i < 10; i++)
1409 		*batch++ = MI_NOOP;
1410 
1411 	/* Pad to end of cacheline */
1412 	while ((unsigned long)batch % CACHELINE_BYTES)
1413 		*batch++ = MI_NOOP;
1414 
1415 	return batch;
1416 }
1417 
1418 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1419 
1420 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
1421 {
1422 	struct drm_i915_gem_object *obj;
1423 	struct i915_vma *vma;
1424 	int err;
1425 
1426 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1427 	if (IS_ERR(obj))
1428 		return PTR_ERR(obj);
1429 
1430 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1431 	if (IS_ERR(vma)) {
1432 		err = PTR_ERR(vma);
1433 		goto err;
1434 	}
1435 
1436 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
1437 	if (err)
1438 		goto err;
1439 
1440 	engine->wa_ctx.vma = vma;
1441 	return 0;
1442 
1443 err:
1444 	i915_gem_object_put(obj);
1445 	return err;
1446 }
1447 
1448 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1449 {
1450 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1451 
1452 	/* Called on error unwind, clear all flags to prevent further use */
1453 	memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
1454 }
1455 
1456 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1457 
1458 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1459 {
1460 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1461 	struct i915_wa_ctx_bb *wa_bb[] = {
1462 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1463 	};
1464 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1465 	void *batch, *batch_ptr;
1466 	unsigned int i;
1467 	int err;
1468 
1469 	if (engine->class != RENDER_CLASS)
1470 		return;
1471 
1472 	switch (INTEL_GEN(engine->i915)) {
1473 	case 12:
1474 	case 11:
1475 		return;
1476 	case 10:
1477 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
1478 		wa_bb_fn[1] = NULL;
1479 		break;
1480 	case 9:
1481 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1482 		wa_bb_fn[1] = NULL;
1483 		break;
1484 	case 8:
1485 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1486 		wa_bb_fn[1] = NULL;
1487 		break;
1488 	default:
1489 		MISSING_CASE(INTEL_GEN(engine->i915));
1490 		return;
1491 	}
1492 
1493 	err = lrc_setup_wa_ctx(engine);
1494 	if (err) {
1495 		/*
1496 		 * We continue even if we fail to initialize WA batch
1497 		 * because we only expect rare glitches but nothing
1498 		 * critical to prevent us from using GPU
1499 		 */
1500 		drm_err(&engine->i915->drm,
1501 			"Ignoring context switch w/a allocation error:%d\n",
1502 			err);
1503 		return;
1504 	}
1505 
1506 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1507 
1508 	/*
1509 	 * Emit the two workaround batch buffers, recording the offset from the
1510 	 * start of the workaround batch buffer object for each and their
1511 	 * respective sizes.
1512 	 */
1513 	batch_ptr = batch;
1514 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1515 		wa_bb[i]->offset = batch_ptr - batch;
1516 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1517 						  CACHELINE_BYTES))) {
1518 			err = -EINVAL;
1519 			break;
1520 		}
1521 		if (wa_bb_fn[i])
1522 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1523 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1524 	}
1525 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1526 
1527 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1528 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1529 
1530 	/* Verify that we can handle failure to setup the wa_ctx */
1531 	if (err || i915_inject_probe_error(engine->i915, -ENODEV))
1532 		lrc_fini_wa_ctx(engine);
1533 }
1534 
1535 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1536 {
1537 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1538 	ce->runtime.num_underflow++;
1539 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1540 #endif
1541 }
1542 
1543 void lrc_update_runtime(struct intel_context *ce)
1544 {
1545 	u32 old;
1546 	s32 dt;
1547 
1548 	if (intel_context_is_barrier(ce))
1549 		return;
1550 
1551 	old = ce->runtime.last;
1552 	ce->runtime.last = lrc_get_runtime(ce);
1553 	dt = ce->runtime.last - old;
1554 
1555 	if (unlikely(dt < 0)) {
1556 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1557 			 old, ce->runtime.last, dt);
1558 		st_update_runtime_underflow(ce, dt);
1559 		return;
1560 	}
1561 
1562 	ewma_runtime_add(&ce->runtime.avg, dt);
1563 	ce->runtime.total += dt;
1564 }
1565 
1566 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1567 #include "selftest_lrc.c"
1568 #endif
1569