xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 8781e5df)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences from the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
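/*
 * Illustrative sketch only (not part of the driver): the ELSP pairing rule
 * described above -- discard same-context requests from the head of the
 * queue until either only one request remains or the first two belong to
 * different contexts -- can be pictured, assuming hypothetical
 * pop_head()/peek_head()/same_context() helpers, as:
 *
 *	elsp[0] = pop_head(queue);
 *	while (peek_head(queue) && same_context(elsp[0], peek_head(queue)))
 *		elsp[0] = pop_head(queue);
 *	elsp[1] = pop_head(queue);
 *
 * where a NULL elsp[1] means a single-context execution list is submitted.
 */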
134 #include <linux/interrupt.h>
135 
136 #include "gem/i915_gem_context.h"
137 
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
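/*
 * Dwords of padding appended after each request's breadcrumb for
 * WaIdleLiteRestore; see the comment in execlists_dequeue() and
 * unwind_wa_tail() below.
 */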
179 #define WA_TAIL_DWORDS 2
180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[0];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine);
241 
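/* Mark an incomplete request as failed: set the fence error to -EIO and flag it complete. */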
242 static void mark_eio(struct i915_request *rq)
243 {
244 	if (i915_request_completed(rq))
245 		return;
246 
247 	GEM_BUG_ON(i915_request_signaled(rq));
248 
249 	dma_fence_set_error(&rq->fence, -EIO);
250 	i915_request_mark_complete(rq);
251 }
252 
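/*
 * Walk backwards along the timeline from @rq to find the oldest request
 * that has not yet completed, i.e. the point to which the ring must be
 * rewound so that we do not restart in the middle of an earlier,
 * incomplete request.
 */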
253 static struct i915_request *
254 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
255 {
256 	struct i915_request *active = rq;
257 
258 	rcu_read_lock();
259 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
260 		if (i915_request_completed(rq))
261 			break;
262 
263 		active = rq;
264 	}
265 	rcu_read_unlock();
266 
267 	return active;
268 }
269 
270 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
271 {
272 	return (i915_ggtt_offset(engine->status_page.vma) +
273 		I915_GEM_HWS_PREEMPT_ADDR);
274 }
275 
276 static inline void
277 ring_set_paused(const struct intel_engine_cs *engine, int state)
278 {
279 	/*
280 	 * We inspect HWS_PREEMPT with a semaphore inside
281 	 * engine->emit_fini_breadcrumb. If the dword is true,
282 	 * the ring is paused as the semaphore will busywait
283 	 * until the dword is false.
284 	 */
285 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
286 	if (state)
287 		wmb();
288 }
289 
290 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
291 {
292 	return rb_entry(rb, struct i915_priolist, node);
293 }
294 
295 static inline int rq_prio(const struct i915_request *rq)
296 {
297 	return rq->sched.attr.priority;
298 }
299 
300 static int effective_prio(const struct i915_request *rq)
301 {
302 	int prio = rq_prio(rq);
303 
304 	/*
305 	 * If this request is special and must not be interrupted at any
306 	 * cost, so be it. Note we are only checking the most recent request
307 	 * in the context and so may be masking an earlier vip request. It
308 	 * is hoped that under the conditions where nopreempt is used, this
309 	 * will not matter (i.e. all requests to that context will be
310 	 * nopreempt for as long as desired).
311 	 */
312 	if (i915_request_has_nopreempt(rq))
313 		prio = I915_PRIORITY_UNPREEMPTABLE;
314 
315 	/*
316 	 * On unwinding the active request, we give it a priority bump
317 	 * if it has completed waiting on any semaphore. If we know that
318 	 * the request has already started, we can prevent an unwanted
319 	 * preempt-to-idle cycle by taking that into account now.
320 	 */
321 	if (__i915_request_has_started(rq))
322 		prio |= I915_PRIORITY_NOSEMAPHORE;
323 
324 	/* Restrict mere WAIT boosts from triggering preemption */
325 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
326 	return prio | __NO_PREEMPTION;
327 }
328 
329 static int queue_prio(const struct intel_engine_execlists *execlists)
330 {
331 	struct i915_priolist *p;
332 	struct rb_node *rb;
333 
334 	rb = rb_first_cached(&execlists->queue);
335 	if (!rb)
336 		return INT_MIN;
337 
338 	/*
339 	 * As the priolist[] is inverted, with the highest priority in [0],
340 	 * we have to flip the index value to compute the priority.
341 	 */
342 	p = to_priolist(rb);
343 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
344 }
345 
346 static inline bool need_preempt(const struct intel_engine_cs *engine,
347 				const struct i915_request *rq,
348 				struct rb_node *rb)
349 {
350 	int last_prio;
351 
352 	if (!intel_engine_has_semaphores(engine))
353 		return false;
354 
355 	/*
356 	 * Check if the current priority hint merits a preemption attempt.
357 	 *
358 	 * We record the highest value priority we saw during rescheduling
359 	 * prior to this dequeue, therefore we know that if it is strictly
360 	 * less than the current tail of ELSP[0], we do not need to force
361 	 * a preempt-to-idle cycle.
362 	 *
363 	 * However, the priority hint is a mere hint that we may need to
364 	 * preempt. If that hint is stale or we may be trying to preempt
365 	 * ourselves, ignore the request.
366 	 *
367 	 * More naturally we would write
368 	 *      prio >= max(0, last);
369 	 * except that we wish to prevent triggering preemption at the same
370 	 * priority level: the task that is running should remain running
371 	 * to preserve FIFO ordering of dependencies.
372 	 */
373 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
374 	if (engine->execlists.queue_priority_hint <= last_prio)
375 		return false;
376 
377 	/*
378 	 * Check against the first request in ELSP[1], it will, thanks to the
379 	 * power of PI, be the highest priority of that context.
380 	 */
381 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
382 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
383 		return true;
384 
385 	if (rb) {
386 		struct virtual_engine *ve =
387 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
388 		bool preempt = false;
389 
390 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
391 			struct i915_request *next;
392 
393 			rcu_read_lock();
394 			next = READ_ONCE(ve->request);
395 			if (next)
396 				preempt = rq_prio(next) > last_prio;
397 			rcu_read_unlock();
398 		}
399 
400 		if (preempt)
401 			return preempt;
402 	}
403 
404 	/*
405 	 * If the inflight context did not trigger the preemption, then maybe
406 	 * it was the set of queued requests? Pick the highest priority in
407 	 * the queue (the first active priolist) and see if it deserves to be
408 	 * running instead of ELSP[0].
409 	 *
410 	 * The highest priority request in the queue can not be either
411 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
412 	 * context, its priority would not exceed ELSP[0] aka last_prio.
413 	 */
414 	return queue_prio(&engine->execlists) > last_prio;
415 }
416 
417 __maybe_unused static inline bool
418 assert_priority_queue(const struct i915_request *prev,
419 		      const struct i915_request *next)
420 {
421 	/*
422 	 * Without preemption, the prev may refer to the still active element
423 	 * which we refuse to let go.
424 	 *
425 	 * Even with preemption, there are times when we think it is better not
426 	 * to preempt and leave an ostensibly lower priority request in flight.
427 	 */
428 	if (i915_request_is_active(prev))
429 		return true;
430 
431 	return rq_prio(prev) >= rq_prio(next);
432 }
433 
434 /*
435  * The context descriptor encodes various attributes of a context,
436  * including its GTT address and some flags. Because it's fairly
437  * expensive to calculate, we'll just do it once and cache the result,
438  * which remains valid until the context is unpinned.
439  *
440  * This is what a descriptor looks like, from LSB to MSB::
441  *
442  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
443  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
444  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
445  *      bits 53-54:    mbz, reserved for use by hardware
446  *      bits 55-63:    group ID, currently unused and set to 0
447  *
448  * Starting from Gen11, the upper dword of the descriptor has a new format:
449  *
450  *      bits 32-36:    reserved
451  *      bits 37-47:    SW context ID
452  *      bits 48-53:    engine instance
453  *      bit 54:        mbz, reserved for use by hardware
454  *      bits 55-60:    SW counter
455  *      bits 61-63:    engine class
456  *
457  * engine info, SW context ID and SW counter need to form a unique number
458  * (Context ID) per lrc.
459  */
460 static u64
461 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
462 {
463 	u64 desc;
464 
465 	desc = INTEL_LEGACY_32B_CONTEXT;
466 	if (i915_vm_is_4lvl(ce->vm))
467 		desc = INTEL_LEGACY_64B_CONTEXT;
468 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
469 
470 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
471 	if (IS_GEN(engine->i915, 8))
472 		desc |= GEN8_CTX_L3LLC_COHERENT;
473 
474 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
475 	/*
476 	 * The following 32bits are copied into the OA reports (dword 2).
477 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
478 	 * anything below.
479 	 */
480 	if (INTEL_GEN(engine->i915) >= 11) {
481 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
482 								/* bits 48-53 */
483 
484 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
485 								/* bits 61-63 */
486 	}
487 
488 	return desc;
489 }
490 
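/*
 * The reg_offsets tables below use a compact encoding which set_offsets()
 * expands into the context image: NOP(x) skips x dwords, LRI(count, flags)
 * emits an MI_LOAD_REGISTER_IMM(count) header (optionally force-posted),
 * REG()/REG16() encode a register offset relative to the engine's mmio
 * base in 7-bit chunks, and END() terminates the table.
 */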
491 static u32 *set_offsets(u32 *regs,
492 			const u8 *data,
493 			const struct intel_engine_cs *engine)
494 #define NOP(x) (BIT(7) | (x))
495 #define LRI(count, flags) ((flags) << 6 | (count))
496 #define POSTED BIT(0)
497 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
498 #define REG16(x) \
499 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
500 	(((x) >> 2) & 0x7f)
501 #define END() 0
502 {
503 	const u32 base = engine->mmio_base;
504 
505 	while (*data) {
506 		u8 count, flags;
507 
508 		if (*data & BIT(7)) { /* skip */
509 			regs += *data++ & ~BIT(7);
510 			continue;
511 		}
512 
513 		count = *data & 0x3f;
514 		flags = *data >> 6;
515 		data++;
516 
517 		*regs = MI_LOAD_REGISTER_IMM(count);
518 		if (flags & POSTED)
519 			*regs |= MI_LRI_FORCE_POSTED;
520 		if (INTEL_GEN(engine->i915) >= 11)
521 			*regs |= MI_LRI_CS_MMIO;
522 		regs++;
523 
524 		GEM_BUG_ON(!count);
525 		do {
526 			u32 offset = 0;
527 			u8 v;
528 
529 			do {
530 				v = *data++;
531 				offset <<= 7;
532 				offset |= v & ~BIT(7);
533 			} while (v & BIT(7));
534 
535 			*regs = base + (offset << 2);
536 			regs += 2;
537 		} while (--count);
538 	}
539 
540 	return regs;
541 }
542 
543 static const u8 gen8_xcs_offsets[] = {
544 	NOP(1),
545 	LRI(11, 0),
546 	REG16(0x244),
547 	REG(0x034),
548 	REG(0x030),
549 	REG(0x038),
550 	REG(0x03c),
551 	REG(0x168),
552 	REG(0x140),
553 	REG(0x110),
554 	REG(0x11c),
555 	REG(0x114),
556 	REG(0x118),
557 
558 	NOP(9),
559 	LRI(9, 0),
560 	REG16(0x3a8),
561 	REG16(0x28c),
562 	REG16(0x288),
563 	REG16(0x284),
564 	REG16(0x280),
565 	REG16(0x27c),
566 	REG16(0x278),
567 	REG16(0x274),
568 	REG16(0x270),
569 
570 	NOP(13),
571 	LRI(2, 0),
572 	REG16(0x200),
573 	REG(0x028),
574 
575 	END(),
576 };
577 
578 static const u8 gen9_xcs_offsets[] = {
579 	NOP(1),
580 	LRI(14, POSTED),
581 	REG16(0x244),
582 	REG(0x034),
583 	REG(0x030),
584 	REG(0x038),
585 	REG(0x03c),
586 	REG(0x168),
587 	REG(0x140),
588 	REG(0x110),
589 	REG(0x11c),
590 	REG(0x114),
591 	REG(0x118),
592 	REG(0x1c0),
593 	REG(0x1c4),
594 	REG(0x1c8),
595 
596 	NOP(3),
597 	LRI(9, POSTED),
598 	REG16(0x3a8),
599 	REG16(0x28c),
600 	REG16(0x288),
601 	REG16(0x284),
602 	REG16(0x280),
603 	REG16(0x27c),
604 	REG16(0x278),
605 	REG16(0x274),
606 	REG16(0x270),
607 
608 	NOP(13),
609 	LRI(1, POSTED),
610 	REG16(0x200),
611 
612 	NOP(13),
613 	LRI(44, POSTED),
614 	REG(0x028),
615 	REG(0x09c),
616 	REG(0x0c0),
617 	REG(0x178),
618 	REG(0x17c),
619 	REG16(0x358),
620 	REG(0x170),
621 	REG(0x150),
622 	REG(0x154),
623 	REG(0x158),
624 	REG16(0x41c),
625 	REG16(0x600),
626 	REG16(0x604),
627 	REG16(0x608),
628 	REG16(0x60c),
629 	REG16(0x610),
630 	REG16(0x614),
631 	REG16(0x618),
632 	REG16(0x61c),
633 	REG16(0x620),
634 	REG16(0x624),
635 	REG16(0x628),
636 	REG16(0x62c),
637 	REG16(0x630),
638 	REG16(0x634),
639 	REG16(0x638),
640 	REG16(0x63c),
641 	REG16(0x640),
642 	REG16(0x644),
643 	REG16(0x648),
644 	REG16(0x64c),
645 	REG16(0x650),
646 	REG16(0x654),
647 	REG16(0x658),
648 	REG16(0x65c),
649 	REG16(0x660),
650 	REG16(0x664),
651 	REG16(0x668),
652 	REG16(0x66c),
653 	REG16(0x670),
654 	REG16(0x674),
655 	REG16(0x678),
656 	REG16(0x67c),
657 	REG(0x068),
658 
659 	END(),
660 };
661 
662 static const u8 gen12_xcs_offsets[] = {
663 	NOP(1),
664 	LRI(13, POSTED),
665 	REG16(0x244),
666 	REG(0x034),
667 	REG(0x030),
668 	REG(0x038),
669 	REG(0x03c),
670 	REG(0x168),
671 	REG(0x140),
672 	REG(0x110),
673 	REG(0x1c0),
674 	REG(0x1c4),
675 	REG(0x1c8),
676 	REG(0x180),
677 	REG16(0x2b4),
678 
679 	NOP(5),
680 	LRI(9, POSTED),
681 	REG16(0x3a8),
682 	REG16(0x28c),
683 	REG16(0x288),
684 	REG16(0x284),
685 	REG16(0x280),
686 	REG16(0x27c),
687 	REG16(0x278),
688 	REG16(0x274),
689 	REG16(0x270),
690 
691 	END(),
692 };
693 
694 static const u8 gen8_rcs_offsets[] = {
695 	NOP(1),
696 	LRI(14, POSTED),
697 	REG16(0x244),
698 	REG(0x034),
699 	REG(0x030),
700 	REG(0x038),
701 	REG(0x03c),
702 	REG(0x168),
703 	REG(0x140),
704 	REG(0x110),
705 	REG(0x11c),
706 	REG(0x114),
707 	REG(0x118),
708 	REG(0x1c0),
709 	REG(0x1c4),
710 	REG(0x1c8),
711 
712 	NOP(3),
713 	LRI(9, POSTED),
714 	REG16(0x3a8),
715 	REG16(0x28c),
716 	REG16(0x288),
717 	REG16(0x284),
718 	REG16(0x280),
719 	REG16(0x27c),
720 	REG16(0x278),
721 	REG16(0x274),
722 	REG16(0x270),
723 
724 	NOP(13),
725 	LRI(1, 0),
726 	REG(0x0c8),
727 
728 	END(),
729 };
730 
731 static const u8 gen11_rcs_offsets[] = {
732 	NOP(1),
733 	LRI(15, POSTED),
734 	REG16(0x244),
735 	REG(0x034),
736 	REG(0x030),
737 	REG(0x038),
738 	REG(0x03c),
739 	REG(0x168),
740 	REG(0x140),
741 	REG(0x110),
742 	REG(0x11c),
743 	REG(0x114),
744 	REG(0x118),
745 	REG(0x1c0),
746 	REG(0x1c4),
747 	REG(0x1c8),
748 	REG(0x180),
749 
750 	NOP(1),
751 	LRI(9, POSTED),
752 	REG16(0x3a8),
753 	REG16(0x28c),
754 	REG16(0x288),
755 	REG16(0x284),
756 	REG16(0x280),
757 	REG16(0x27c),
758 	REG16(0x278),
759 	REG16(0x274),
760 	REG16(0x270),
761 
762 	LRI(1, POSTED),
763 	REG(0x1b0),
764 
765 	NOP(10),
766 	LRI(1, 0),
767 	REG(0x0c8),
768 
769 	END(),
770 };
771 
772 static const u8 gen12_rcs_offsets[] = {
773 	NOP(1),
774 	LRI(13, POSTED),
775 	REG16(0x244),
776 	REG(0x034),
777 	REG(0x030),
778 	REG(0x038),
779 	REG(0x03c),
780 	REG(0x168),
781 	REG(0x140),
782 	REG(0x110),
783 	REG(0x1c0),
784 	REG(0x1c4),
785 	REG(0x1c8),
786 	REG(0x180),
787 	REG16(0x2b4),
788 
789 	NOP(5),
790 	LRI(9, POSTED),
791 	REG16(0x3a8),
792 	REG16(0x28c),
793 	REG16(0x288),
794 	REG16(0x284),
795 	REG16(0x280),
796 	REG16(0x27c),
797 	REG16(0x278),
798 	REG16(0x274),
799 	REG16(0x270),
800 
801 	LRI(3, POSTED),
802 	REG(0x1b0),
803 	REG16(0x5a8),
804 	REG16(0x5ac),
805 
806 	NOP(6),
807 	LRI(1, 0),
808 	REG(0x0c8),
809 
810 	END(),
811 };
812 
813 #undef END
814 #undef REG16
815 #undef REG
816 #undef LRI
817 #undef NOP
818 
819 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
820 {
821 	/*
822 	 * The gen12+ lists only have the registers we program in the basic
823 	 * default state. We rely on the context image using relative
824  * addressing to automatically fix up the register state between the
825  * physical engines of a virtual engine.
826 	 */
827 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
828 		   !intel_engine_has_relative_mmio(engine));
829 
830 	if (engine->class == RENDER_CLASS) {
831 		if (INTEL_GEN(engine->i915) >= 12)
832 			return gen12_rcs_offsets;
833 		else if (INTEL_GEN(engine->i915) >= 11)
834 			return gen11_rcs_offsets;
835 		else
836 			return gen8_rcs_offsets;
837 	} else {
838 		if (INTEL_GEN(engine->i915) >= 12)
839 			return gen12_xcs_offsets;
840 		else if (INTEL_GEN(engine->i915) >= 9)
841 			return gen9_xcs_offsets;
842 		else
843 			return gen8_xcs_offsets;
844 	}
845 }
846 
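/*
 * Reset rq->tail to the end of the request proper, i.e. just before the
 * WaIdleLiteRestore padding appended after the breadcrumb (rq->wa_tail
 * points past that padding).
 */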
847 static void unwind_wa_tail(struct i915_request *rq)
848 {
849 	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
850 	assert_ring_tail_valid(rq->ring, rq->tail);
851 }
852 
853 static struct i915_request *
854 __unwind_incomplete_requests(struct intel_engine_cs *engine)
855 {
856 	struct i915_request *rq, *rn, *active = NULL;
857 	struct list_head *uninitialized_var(pl);
858 	int prio = I915_PRIORITY_INVALID;
859 
860 	lockdep_assert_held(&engine->active.lock);
861 
862 	list_for_each_entry_safe_reverse(rq, rn,
863 					 &engine->active.requests,
864 					 sched.link) {
865 
866 		if (i915_request_completed(rq))
867 			continue; /* XXX */
868 
869 		__i915_request_unsubmit(rq);
870 		unwind_wa_tail(rq);
871 
872 		/*
873 		 * Push the request back into the queue for later resubmission.
874 		 * If this request is not native to this physical engine (i.e.
875 		 * it came from a virtual source), push it back onto the virtual
876 		 * engine so that it can be moved across onto another physical
877 		 * engine as load dictates.
878 		 */
879 		if (likely(rq->execution_mask == engine->mask)) {
880 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
881 			if (rq_prio(rq) != prio) {
882 				prio = rq_prio(rq);
883 				pl = i915_sched_lookup_priolist(engine, prio);
884 			}
885 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
886 
887 			list_move(&rq->sched.link, pl);
888 			active = rq;
889 		} else {
890 			struct intel_engine_cs *owner = rq->hw_context->engine;
891 
892 			/*
893 			 * Decouple the virtual breadcrumb before moving it
894 			 * back to the virtual engine -- we don't want the
895 			 * request to complete in the background and try
896 			 * and cancel the breadcrumb on the virtual engine
897 			 * (instead of the old engine where it is linked)!
898 			 */
899 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
900 				     &rq->fence.flags)) {
901 				spin_lock_nested(&rq->lock,
902 						 SINGLE_DEPTH_NESTING);
903 				i915_request_cancel_breadcrumb(rq);
904 				spin_unlock(&rq->lock);
905 			}
906 			rq->engine = owner;
907 			owner->submit_request(rq);
908 			active = NULL;
909 		}
910 	}
911 
912 	return active;
913 }
914 
915 struct i915_request *
916 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
917 {
918 	struct intel_engine_cs *engine =
919 		container_of(execlists, typeof(*engine), execlists);
920 
921 	return __unwind_incomplete_requests(engine);
922 }
923 
924 static inline void
925 execlists_context_status_change(struct i915_request *rq, unsigned long status)
926 {
927 	/*
928 	 * This is only used when GVT-g is enabled. When GVT-g is disabled,
929 	 * the compiler should eliminate this function as dead code.
930 	 */
931 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
932 		return;
933 
934 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
935 				   status, rq);
936 }
937 
938 static void intel_engine_context_in(struct intel_engine_cs *engine)
939 {
940 	unsigned long flags;
941 
942 	if (READ_ONCE(engine->stats.enabled) == 0)
943 		return;
944 
945 	write_seqlock_irqsave(&engine->stats.lock, flags);
946 
947 	if (engine->stats.enabled > 0) {
948 		if (engine->stats.active++ == 0)
949 			engine->stats.start = ktime_get();
950 		GEM_BUG_ON(engine->stats.active == 0);
951 	}
952 
953 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
954 }
955 
956 static void intel_engine_context_out(struct intel_engine_cs *engine)
957 {
958 	unsigned long flags;
959 
960 	if (READ_ONCE(engine->stats.enabled) == 0)
961 		return;
962 
963 	write_seqlock_irqsave(&engine->stats.lock, flags);
964 
965 	if (engine->stats.enabled > 0) {
966 		ktime_t last;
967 
968 		if (engine->stats.active && --engine->stats.active == 0) {
969 			/*
970 			 * Decrement the active context count and, in case the GPU
971 			 * is now idle, add the elapsed time to the running total.
972 			 */
973 			last = ktime_sub(ktime_get(), engine->stats.start);
974 
975 			engine->stats.total = ktime_add(engine->stats.total,
976 							last);
977 		} else if (engine->stats.active == 0) {
978 			/*
979 			 * After turning on engine stats, context out might be
980 			 * the first event, in which case we account from the
981 			 * time stats gathering was turned on.
982 			 */
983 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
984 
985 			engine->stats.total = ktime_add(engine->stats.total,
986 							last);
987 		}
988 	}
989 
990 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
991 }
992 
993 static inline struct intel_engine_cs *
994 __execlists_schedule_in(struct i915_request *rq)
995 {
996 	struct intel_engine_cs * const engine = rq->engine;
997 	struct intel_context * const ce = rq->hw_context;
998 
999 	intel_context_get(ce);
1000 
1001 	if (ce->tag) {
1002 		/* Use a fixed tag for OA and friends */
1003 		ce->lrc_desc |= (u64)ce->tag << 32;
1004 	} else {
1005 		/* We don't need a strict matching tag, just different values */
1006 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1007 		ce->lrc_desc |=
1008 			(u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
1009 			GEN11_SW_CTX_ID_SHIFT;
1010 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1011 	}
1012 
1013 	intel_gt_pm_get(engine->gt);
1014 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1015 	intel_engine_context_in(engine);
1016 
1017 	return engine;
1018 }
1019 
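/*
 * ce->inflight packs the engine on which the context is executing together
 * with a submission count in the low pointer bits: the first request to be
 * scheduled in installs the engine pointer, while subsequent requests for
 * the same context merely bump the count (see ptr_inc() here and
 * ptr_unmask_bits()/ptr_dec() in execlists_schedule_out()).
 */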
1020 static inline struct i915_request *
1021 execlists_schedule_in(struct i915_request *rq, int idx)
1022 {
1023 	struct intel_context * const ce = rq->hw_context;
1024 	struct intel_engine_cs *old;
1025 
1026 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1027 	trace_i915_request_in(rq, idx);
1028 
1029 	old = READ_ONCE(ce->inflight);
1030 	do {
1031 		if (!old) {
1032 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1033 			break;
1034 		}
1035 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1036 
1037 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1038 	return i915_request_get(rq);
1039 }
1040 
1041 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1042 {
1043 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1044 	struct i915_request *next = READ_ONCE(ve->request);
1045 
1046 	if (next && next->execution_mask & ~rq->execution_mask)
1047 		tasklet_schedule(&ve->base.execlists.tasklet);
1048 }
1049 
1050 static void restore_default_state(struct intel_context *ce,
1051 				  struct intel_engine_cs *engine)
1052 {
1053 	u32 *regs = ce->lrc_reg_state;
1054 
1055 	if (engine->pinned_default_state)
1056 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1057 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1058 		       engine->context_size - PAGE_SIZE);
1059 
1060 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1061 }
1062 
1063 static void reset_active(struct i915_request *rq,
1064 			 struct intel_engine_cs *engine)
1065 {
1066 	struct intel_context * const ce = rq->hw_context;
1067 	u32 head;
1068 
1069 	/*
1070 	 * The executing context has been cancelled. We want to prevent
1071 	 * further execution along this context and propagate the error on
1072 	 * to anything depending on its results.
1073 	 *
1074 	 * In __i915_request_submit(), we apply the -EIO and remove the
1075 	 * requests' payloads for any banned requests. But first, we must
1076 	 * rewind the context back to the start of the incomplete request so
1077 	 * that we do not jump back into the middle of the batch.
1078 	 *
1079 	 * We preserve the breadcrumbs and semaphores of the incomplete
1080 	 * requests so that inter-timeline dependencies (i.e other timelines)
1081 	 * remain correctly ordered. And we defer to __i915_request_submit()
1082 	 * so that all asynchronous waits are correctly handled.
1083 	 */
1084 	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
1085 		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
1086 
1087 	/* On resubmission of the active request, payload will be scrubbed */
1088 	if (i915_request_completed(rq))
1089 		head = rq->tail;
1090 	else
1091 		head = active_request(ce->timeline, rq)->head;
1092 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1093 	intel_ring_update_space(ce->ring);
1094 
1095 	/* Scrub the context image to prevent replaying the previous batch */
1096 	restore_default_state(ce, engine);
1097 	__execlists_update_reg_state(ce, engine);
1098 
1099 	/* We've switched away, so this should be a no-op, but intent matters */
1100 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1101 }
1102 
1103 static inline void
1104 __execlists_schedule_out(struct i915_request *rq,
1105 			 struct intel_engine_cs * const engine)
1106 {
1107 	struct intel_context * const ce = rq->hw_context;
1108 
1109 	intel_engine_context_out(engine);
1110 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1111 	intel_gt_pm_put(engine->gt);
1112 
1113 	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
1114 		reset_active(rq, engine);
1115 
1116 	/*
1117 	 * If this is part of a virtual engine, its next request may
1118 	 * have been blocked waiting for access to the active context.
1119 	 * We have to kick all the siblings again in case we need to
1120 	 * switch (e.g. the next request is not runnable on this
1121 	 * engine). Hopefully, we will already have submitted the next
1122 	 * request before the tasklet runs and do not need to rebuild
1123 	 * each virtual tree and kick everyone again.
1124 	 */
1125 	if (ce->engine != engine)
1126 		kick_siblings(rq, ce);
1127 
1128 	intel_context_put(ce);
1129 }
1130 
1131 static inline void
1132 execlists_schedule_out(struct i915_request *rq)
1133 {
1134 	struct intel_context * const ce = rq->hw_context;
1135 	struct intel_engine_cs *cur, *old;
1136 
1137 	trace_i915_request_out(rq);
1138 
1139 	old = READ_ONCE(ce->inflight);
1140 	do
1141 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1142 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1143 	if (!cur)
1144 		__execlists_schedule_out(rq, old);
1145 
1146 	i915_request_put(rq);
1147 }
1148 
1149 static u64 execlists_update_context(const struct i915_request *rq)
1150 {
1151 	struct intel_context *ce = rq->hw_context;
1152 	u64 desc;
1153 
1154 	ce->lrc_reg_state[CTX_RING_TAIL] =
1155 		intel_ring_set_tail(rq->ring, rq->tail);
1156 
1157 	/*
1158 	 * Make sure the context image is complete before we submit it to HW.
1159 	 *
1160 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1161 	 * an uncached write such as our mmio register access, the empirical
1162 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1163 	 * may not be visible to the HW prior to the completion of the UC
1164 	 * register write and that we may begin execution from the context
1165 	 * before its image is complete leading to invalid PD chasing.
1166 	 *
1167 	 * Furthermore, Braswell, at least, wants a full mb to be sure that
1168 	 * the writes are coherent in memory (visible to the GPU) prior to
1169 	 * execution, and not just visible to other CPUs (as is the result of
1170 	 * wmb).
1171 	 */
1172 	mb();
1173 
1174 	desc = ce->lrc_desc;
1175 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1176 
1177 	/* Wa_1607138340:tgl */
1178 	if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
1179 		desc |= CTX_DESC_FORCE_RESTORE;
1180 
1181 	return desc;
1182 }
1183 
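/*
 * Write one execlist descriptor to the hardware: via the submit queue when
 * a control register is present, otherwise directly to the legacy ELSP,
 * which expects the upper dword to be written before the lower.
 */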
1184 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1185 {
1186 	if (execlists->ctrl_reg) {
1187 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1188 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1189 	} else {
1190 		writel(upper_32_bits(desc), execlists->submit_reg);
1191 		writel(lower_32_bits(desc), execlists->submit_reg);
1192 	}
1193 }
1194 
1195 static __maybe_unused void
1196 trace_ports(const struct intel_engine_execlists *execlists,
1197 	    const char *msg,
1198 	    struct i915_request * const *ports)
1199 {
1200 	const struct intel_engine_cs *engine =
1201 		container_of(execlists, typeof(*engine), execlists);
1202 
1203 	if (!ports[0])
1204 		return;
1205 
1206 	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
1207 		  engine->name, msg,
1208 		  ports[0]->fence.context,
1209 		  ports[0]->fence.seqno,
1210 		  i915_request_completed(ports[0]) ? "!" :
1211 		  i915_request_started(ports[0]) ? "*" :
1212 		  "",
1213 		  ports[1] ? ports[1]->fence.context : 0,
1214 		  ports[1] ? ports[1]->fence.seqno : 0);
1215 }
1216 
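/*
 * Sanity check the pending[] array before it is promoted to active: it must
 * terminate within the number of ports, and every incomplete request must
 * carry a distinct, active and pinned context.
 */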
1217 static __maybe_unused bool
1218 assert_pending_valid(const struct intel_engine_execlists *execlists,
1219 		     const char *msg)
1220 {
1221 	struct i915_request * const *port, *rq;
1222 	struct intel_context *ce = NULL;
1223 
1224 	trace_ports(execlists, msg, execlists->pending);
1225 
1226 	if (!execlists->pending[0]) {
1227 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1228 		return false;
1229 	}
1230 
1231 	if (execlists->pending[execlists_num_ports(execlists)]) {
1232 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1233 			      execlists_num_ports(execlists));
1234 		return false;
1235 	}
1236 
1237 	for (port = execlists->pending; (rq = *port); port++) {
1238 		if (ce == rq->hw_context) {
1239 			GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
1240 				      port - execlists->pending);
1241 			return false;
1242 		}
1243 
1244 		ce = rq->hw_context;
1245 		if (i915_request_completed(rq))
1246 			continue;
1247 
1248 		if (i915_active_is_idle(&ce->active)) {
1249 			GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
1250 				      port - execlists->pending);
1251 			return false;
1252 		}
1253 
1254 		if (!i915_vma_is_pinned(ce->state)) {
1255 			GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
1256 				      port - execlists->pending);
1257 			return false;
1258 		}
1259 
1260 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1261 			GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
1262 				      port - execlists->pending);
1263 			return false;
1264 		}
1265 	}
1266 
1267 	return ce;
1268 }
1269 
1270 static void execlists_submit_ports(struct intel_engine_cs *engine)
1271 {
1272 	struct intel_engine_execlists *execlists = &engine->execlists;
1273 	unsigned int n;
1274 
1275 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1276 
1277 	/*
1278 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1279 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1280 	 * not be relinquished until the device is idle (see
1281 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1282 	 * that all ELSP are drained i.e. we have processed the CSB,
1283 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1284 	 */
1285 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1286 
1287 	/*
1288 	 * ELSQ note: the submit queue is not cleared after being submitted
1289 	 * to the HW so we need to make sure we always clean it up. This is
1290 	 * currently ensured by the fact that we always write the same number
1291 	 * of elsq entries, keep this in mind before changing the loop below.
1292 	 */
1293 	for (n = execlists_num_ports(execlists); n--; ) {
1294 		struct i915_request *rq = execlists->pending[n];
1295 
1296 		write_desc(execlists,
1297 			   rq ? execlists_update_context(rq) : 0,
1298 			   n);
1299 	}
1300 
1301 	/* we need to manually load the submit queue */
1302 	if (execlists->ctrl_reg)
1303 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1304 }
1305 
1306 static bool ctx_single_port_submission(const struct intel_context *ce)
1307 {
1308 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1309 		i915_gem_context_force_single_submission(ce->gem_context));
1310 }
1311 
1312 static bool can_merge_ctx(const struct intel_context *prev,
1313 			  const struct intel_context *next)
1314 {
1315 	if (prev != next)
1316 		return false;
1317 
1318 	if (ctx_single_port_submission(prev))
1319 		return false;
1320 
1321 	return true;
1322 }
1323 
1324 static bool can_merge_rq(const struct i915_request *prev,
1325 			 const struct i915_request *next)
1326 {
1327 	GEM_BUG_ON(prev == next);
1328 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1329 
1330 	/*
1331 	 * We do not submit known completed requests. Therefore if the next
1332 	 * request is already completed, we can pretend to merge it in
1333 	 * with the previous context (and we will skip updating the ELSP
1334 	 * and tracking). Thus hopefully keeping the ELSP full with active
1335 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1336 	 * us.
1337 	 */
1338 	if (i915_request_completed(next))
1339 		return true;
1340 
1341 	if (unlikely((prev->flags ^ next->flags) &
1342 		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
1343 		return false;
1344 
1345 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
1346 		return false;
1347 
1348 	return true;
1349 }
1350 
1351 static void virtual_update_register_offsets(u32 *regs,
1352 					    struct intel_engine_cs *engine)
1353 {
1354 	set_offsets(regs, reg_offsets(engine), engine);
1355 }
1356 
1357 static bool virtual_matches(const struct virtual_engine *ve,
1358 			    const struct i915_request *rq,
1359 			    const struct intel_engine_cs *engine)
1360 {
1361 	const struct intel_engine_cs *inflight;
1362 
1363 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1364 		return false;
1365 
1366 	/*
1367 	 * We track when the HW has completed saving the context image
1368 	 * (i.e. when we have seen the final CS event switching out of
1369 	 * the context) and must not overwrite the context image before
1370 	 * then. This restricts us to only using the active engine
1371 	 * while the previous virtualized request is inflight (so
1372 	 * we reuse the register offsets). This is a very small
1373 	 * hysteresis on the greedy selection algorithm.
1374 	 */
1375 	inflight = intel_context_inflight(&ve->context);
1376 	if (inflight && inflight != engine)
1377 		return false;
1378 
1379 	return true;
1380 }
1381 
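/*
 * Move the virtual context's breadcrumb signaling from the previous
 * physical engine (siblings[0]) to the engine about to execute it, so that
 * completion signals are tracked on the right engine.
 */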
1382 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1383 				     struct intel_engine_cs *engine)
1384 {
1385 	struct intel_engine_cs *old = ve->siblings[0];
1386 
1387 	/* All unattached (rq->engine == old) must already be completed */
1388 
1389 	spin_lock(&old->breadcrumbs.irq_lock);
1390 	if (!list_empty(&ve->context.signal_link)) {
1391 		list_move_tail(&ve->context.signal_link,
1392 			       &engine->breadcrumbs.signalers);
1393 		intel_engine_queue_breadcrumbs(engine);
1394 	}
1395 	spin_unlock(&old->breadcrumbs.irq_lock);
1396 }
1397 
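/*
 * Find the request currently being executed: the first entry in the
 * active[] ports that has not yet completed (or NULL if all have).
 */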
1398 static struct i915_request *
1399 last_active(const struct intel_engine_execlists *execlists)
1400 {
1401 	struct i915_request * const *last = READ_ONCE(execlists->active);
1402 
1403 	while (*last && i915_request_completed(*last))
1404 		last++;
1405 
1406 	return *last;
1407 }
1408 
1409 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1410 {
1411 	LIST_HEAD(list);
1412 
1413 	/*
1414 	 * We want to move the interrupted request to the back of
1415 	 * the round-robin list (i.e. its priority level), but
1416 	 * in doing so, we must then move all requests that were in
1417 	 * in doing so, we must also move all the requests that were in
1418 	 * flight and waiting for the interrupted request, so that they
1419 	 * are run after it again.
1420 	do {
1421 		struct i915_dependency *p;
1422 
1423 		GEM_BUG_ON(i915_request_is_active(rq));
1424 		list_move_tail(&rq->sched.link, pl);
1425 
1426 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1427 			struct i915_request *w =
1428 				container_of(p->waiter, typeof(*w), sched);
1429 
1430 			/* Leave semaphores spinning on the other engines */
1431 			if (w->engine != rq->engine)
1432 				continue;
1433 
1434 			/* No waiter should start before its signaler */
1435 			GEM_BUG_ON(i915_request_started(w) &&
1436 				   !i915_request_completed(rq));
1437 
1438 			GEM_BUG_ON(i915_request_is_active(w));
1439 			if (list_empty(&w->sched.link))
1440 				continue; /* Not yet submitted; unready */
1441 
1442 			if (rq_prio(w) < rq_prio(rq))
1443 				continue;
1444 
1445 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1446 			list_move_tail(&w->sched.link, &list);
1447 		}
1448 
1449 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1450 	} while (rq);
1451 }
1452 
1453 static void defer_active(struct intel_engine_cs *engine)
1454 {
1455 	struct i915_request *rq;
1456 
1457 	rq = __unwind_incomplete_requests(engine);
1458 	if (!rq)
1459 		return;
1460 
1461 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1462 }
1463 
1464 static bool
1465 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1466 {
1467 	int hint;
1468 
1469 	if (!intel_engine_has_timeslices(engine))
1470 		return false;
1471 
1472 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1473 		return false;
1474 
1475 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1476 		   engine->execlists.queue_priority_hint);
1477 
1478 	return hint >= effective_prio(rq);
1479 }
1480 
1481 static int
1482 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1483 {
1484 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1485 		return INT_MIN;
1486 
1487 	return rq_prio(list_next_entry(rq, sched.link));
1488 }
1489 
1490 static inline unsigned long
1491 timeslice(const struct intel_engine_cs *engine)
1492 {
1493 	return READ_ONCE(engine->props.timeslice_duration_ms);
1494 }
1495 
1496 static unsigned long
1497 active_timeslice(const struct intel_engine_cs *engine)
1498 {
1499 	const struct i915_request *rq = *engine->execlists.active;
1500 
1501 	if (i915_request_completed(rq))
1502 		return 0;
1503 
1504 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1505 		return 0;
1506 
1507 	return timeslice(engine);
1508 }
1509 
1510 static void set_timeslice(struct intel_engine_cs *engine)
1511 {
1512 	if (!intel_engine_has_timeslices(engine))
1513 		return;
1514 
1515 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1516 }
1517 
1518 static void record_preemption(struct intel_engine_execlists *execlists)
1519 {
1520 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1521 }
1522 
1523 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1524 {
1525 	struct i915_request *rq;
1526 
1527 	rq = last_active(&engine->execlists);
1528 	if (!rq)
1529 		return 0;
1530 
1531 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1532 	if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
1533 		return 1;
1534 
1535 	return READ_ONCE(engine->props.preempt_timeout_ms);
1536 }
1537 
1538 static void set_preempt_timeout(struct intel_engine_cs *engine)
1539 {
1540 	if (!intel_engine_has_preempt_reset(engine))
1541 		return;
1542 
1543 	set_timer_ms(&engine->execlists.preempt,
1544 		     active_preempt_timeout(engine));
1545 }
1546 
1547 static void execlists_dequeue(struct intel_engine_cs *engine)
1548 {
1549 	struct intel_engine_execlists * const execlists = &engine->execlists;
1550 	struct i915_request **port = execlists->pending;
1551 	struct i915_request ** const last_port = port + execlists->port_mask;
1552 	struct i915_request *last;
1553 	struct rb_node *rb;
1554 	bool submit = false;
1555 
1556 	/*
1557 	 * Hardware submission is through 2 ports. Conceptually each port
1558 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1559 	 * static for a context, and unique to each, so we only execute
1560 	 * requests belonging to a single context from each ring. RING_HEAD
1561 	 * is maintained by the CS in the context image, it marks the place
1562 	 * where it got up to last time, and through RING_TAIL we tell the CS
1563 	 * where we want to execute up to this time.
1564 	 *
1565 	 * In this list the requests are in order of execution. Consecutive
1566 	 * requests from the same context are adjacent in the ringbuffer. We
1567 	 * can combine these requests into a single RING_TAIL update:
1568 	 *
1569 	 *              RING_HEAD...req1...req2
1570 	 *                                    ^- RING_TAIL
1571 	 * since to execute req2 the CS must first execute req1.
1572 	 *
1573 	 * Our goal then is to point each port to the end of a consecutive
1574 	 * sequence of requests as being the most optimal (fewest wake ups
1575 	 * and context switches) submission.
1576 	 */
1577 
1578 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1579 		struct virtual_engine *ve =
1580 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1581 		struct i915_request *rq = READ_ONCE(ve->request);
1582 
1583 		if (!rq) { /* lazily cleanup after another engine handled rq */
1584 			rb_erase_cached(rb, &execlists->virtual);
1585 			RB_CLEAR_NODE(rb);
1586 			rb = rb_first_cached(&execlists->virtual);
1587 			continue;
1588 		}
1589 
1590 		if (!virtual_matches(ve, rq, engine)) {
1591 			rb = rb_next(rb);
1592 			continue;
1593 		}
1594 
1595 		break;
1596 	}
1597 
1598 	/*
1599 	 * If the queue is higher priority than the last
1600 	 * request in the currently active context, submit afresh.
1601 	 * We will resubmit again afterwards in case we need to split
1602 	 * the active context to interject the preemption request,
1603 	 * i.e. we will retrigger preemption following the ack in case
1604 	 * of trouble.
1605 	 */
1606 	last = last_active(execlists);
1607 	if (last) {
1608 		if (need_preempt(engine, last, rb)) {
1609 			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1610 				  engine->name,
1611 				  last->fence.context,
1612 				  last->fence.seqno,
1613 				  last->sched.attr.priority,
1614 				  execlists->queue_priority_hint);
1615 			record_preemption(execlists);
1616 
1617 			/*
1618 			 * Don't let the RING_HEAD advance past the breadcrumb
1619 			 * as we unwind (and until we resubmit) so that we do
1620 			 * not accidentally tell it to go backwards.
1621 			 */
1622 			ring_set_paused(engine, 1);
1623 
1624 			/*
1625 			 * Note that we have not stopped the GPU at this point,
1626 			 * so we are unwinding the incomplete requests as they
1627 			 * remain inflight and so by the time we do complete
1628 			 * the preemption, some of the unwound requests may
1629 			 * complete!
1630 			 */
1631 			__unwind_incomplete_requests(engine);
1632 
1633 			/*
1634 			 * If we need to return to the preempted context, we
1635 			 * need to skip the lite-restore and force it to
1636 			 * reload the RING_TAIL. Otherwise, the HW has a
1637 			 * tendency to ignore us rewinding the TAIL to the
1638 			 * end of an earlier request.
1639 			 */
1640 			last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1641 			last = NULL;
1642 		} else if (need_timeslice(engine, last) &&
1643 			   timer_expired(&engine->execlists.timer)) {
1644 			GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1645 				  engine->name,
1646 				  last->fence.context,
1647 				  last->fence.seqno,
1648 				  last->sched.attr.priority,
1649 				  execlists->queue_priority_hint);
1650 
1651 			ring_set_paused(engine, 1);
1652 			defer_active(engine);
1653 
1654 			/*
1655 			 * Unlike for preemption, if we rewind and continue
1656 			 * executing the same context as previously active,
1657 			 * the order of execution will remain the same and
1658 			 * the tail will only advance. We do not need to
1659 			 * force a full context restore, as a lite-restore
1660 			 * is sufficient to resample the monotonic TAIL.
1661 			 *
1662 			 * If we switch to any other context, similarly we
1663 			 * will not rewind TAIL of current context, and
1664 			 * normal save/restore will preserve state and allow
1665 			 * us to later continue executing the same request.
1666 			 */
1667 			last = NULL;
1668 		} else {
1669 			/*
1670 			 * Otherwise if we already have a request pending
1671 			 * for execution after the current one, we can
1672 			 * just wait until the next CS event before
1673 			 * queuing more. In either case we will force a
1674 			 * lite-restore preemption event, but if we wait
1675 			 * we hopefully coalesce several updates into a single
1676 			 * submission.
1677 			 */
1678 			if (!list_is_last(&last->sched.link,
1679 					  &engine->active.requests)) {
1680 				/*
1681 				 * Even if ELSP[1] is occupied and not worthy
1682 				 * of timeslices, our queue might be.
1683 				 */
1684 				if (!execlists->timer.expires &&
1685 				    need_timeslice(engine, last))
1686 					set_timer_ms(&execlists->timer,
1687 						     timeslice(engine));
1688 
1689 				return;
1690 			}
1691 
1692 			/*
1693 			 * WaIdleLiteRestore:bdw,skl
1694 			 * Apply the wa NOOPs to prevent
1695 			 * ring:HEAD == rq:TAIL as we resubmit the
1696 			 * request. See gen8_emit_fini_breadcrumb() for
1697 			 * where we prepare the padding after the
1698 			 * end of the request.
1699 			 */
1700 			last->tail = last->wa_tail;
1701 		}
1702 	}
1703 
1704 	while (rb) { /* XXX virtual is always taking precedence */
1705 		struct virtual_engine *ve =
1706 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1707 		struct i915_request *rq;
1708 
1709 		spin_lock(&ve->base.active.lock);
1710 
1711 		rq = ve->request;
1712 		if (unlikely(!rq)) { /* lost the race to a sibling */
1713 			spin_unlock(&ve->base.active.lock);
1714 			rb_erase_cached(rb, &execlists->virtual);
1715 			RB_CLEAR_NODE(rb);
1716 			rb = rb_first_cached(&execlists->virtual);
1717 			continue;
1718 		}
1719 
1720 		GEM_BUG_ON(rq != ve->request);
1721 		GEM_BUG_ON(rq->engine != &ve->base);
1722 		GEM_BUG_ON(rq->hw_context != &ve->context);
1723 
1724 		if (rq_prio(rq) >= queue_prio(execlists)) {
1725 			if (!virtual_matches(ve, rq, engine)) {
1726 				spin_unlock(&ve->base.active.lock);
1727 				rb = rb_next(rb);
1728 				continue;
1729 			}
1730 
1731 			if (last && !can_merge_rq(last, rq)) {
1732 				spin_unlock(&ve->base.active.lock);
1733 				return; /* leave this for another */
1734 			}
1735 
1736 			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1737 				  engine->name,
1738 				  rq->fence.context,
1739 				  rq->fence.seqno,
1740 				  i915_request_completed(rq) ? "!" :
1741 				  i915_request_started(rq) ? "*" :
1742 				  "",
1743 				  yesno(engine != ve->siblings[0]));
1744 
1745 			ve->request = NULL;
1746 			ve->base.execlists.queue_priority_hint = INT_MIN;
1747 			rb_erase_cached(rb, &execlists->virtual);
1748 			RB_CLEAR_NODE(rb);
1749 
1750 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1751 			rq->engine = engine;
1752 
1753 			if (engine != ve->siblings[0]) {
1754 				u32 *regs = ve->context.lrc_reg_state;
1755 				unsigned int n;
1756 
1757 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1758 
1759 				if (!intel_engine_has_relative_mmio(engine))
1760 					virtual_update_register_offsets(regs,
1761 									engine);
1762 
1763 				if (!list_empty(&ve->context.signals))
1764 					virtual_xfer_breadcrumbs(ve, engine);
1765 
1766 				/*
1767 				 * Move the bound engine to the top of the list
1768 				 * for future execution. We then kick this
1769 				 * tasklet first before checking others, so that
1770 				 * we preferentially reuse this set of bound
1771 				 * registers.
1772 				 */
1773 				for (n = 1; n < ve->num_siblings; n++) {
1774 					if (ve->siblings[n] == engine) {
1775 						swap(ve->siblings[n],
1776 						     ve->siblings[0]);
1777 						break;
1778 					}
1779 				}
1780 
1781 				GEM_BUG_ON(ve->siblings[0] != engine);
1782 			}
1783 
1784 			if (__i915_request_submit(rq)) {
1785 				submit = true;
1786 				last = rq;
1787 			}
1788 			i915_request_put(rq);
1789 
1790 			/*
1791 			 * Hmm, we have a bunch of virtual engine requests,
1792 			 * but the first one was already completed (thanks
1793 			 * preempt-to-busy!). Keep looking at the veng queue
1794 			 * until we have no more relevant requests (i.e.
1795 			 * the normal submit queue has higher priority).
1796 			 */
1797 			if (!submit) {
1798 				spin_unlock(&ve->base.active.lock);
1799 				rb = rb_first_cached(&execlists->virtual);
1800 				continue;
1801 			}
1802 		}
1803 
1804 		spin_unlock(&ve->base.active.lock);
1805 		break;
1806 	}
1807 
1808 	while ((rb = rb_first_cached(&execlists->queue))) {
1809 		struct i915_priolist *p = to_priolist(rb);
1810 		struct i915_request *rq, *rn;
1811 		int i;
1812 
1813 		priolist_for_each_request_consume(rq, rn, p, i) {
1814 			bool merge = true;
1815 
1816 			/*
1817 			 * Can we combine this request with the current port?
1818 			 * It has to be the same context/ringbuffer and not
1819 			 * have any exceptions (e.g. GVT saying never to
1820 			 * combine contexts).
1821 			 *
1822 			 * If we can combine the requests, we can execute both
1823 			 * by updating the RING_TAIL to point to the end of the
1824 			 * second request, and so we never need to tell the
1825 			 * hardware about the first.
1826 			 */
1827 			if (last && !can_merge_rq(last, rq)) {
1828 				/*
1829 				 * If we are on the second port and cannot
1830 				 * combine this request with the last, then we
1831 				 * are done.
1832 				 */
1833 				if (port == last_port)
1834 					goto done;
1835 
1836 				/*
1837 				 * We must not populate both ELSP[] with the
1838 				 * same LRCA, i.e. we must submit 2 different
1839 				 * contexts if we submit 2 ELSP.
1840 				 */
1841 				if (last->hw_context == rq->hw_context)
1842 					goto done;
1843 
1844 				if (i915_request_has_sentinel(last))
1845 					goto done;
1846 
1847 				/*
1848 				 * If GVT overrides us we only ever submit
1849 				 * port[0], leaving port[1] empty. Note that we
1850 				 * also have to be careful that we don't queue
1851 				 * the same context (even though a different
1852 				 * request) to the second port.
1853 				 */
1854 				if (ctx_single_port_submission(last->hw_context) ||
1855 				    ctx_single_port_submission(rq->hw_context))
1856 					goto done;
1857 
1858 				merge = false;
1859 			}
1860 
1861 			if (__i915_request_submit(rq)) {
1862 				if (!merge) {
1863 					*port = execlists_schedule_in(last, port - execlists->pending);
1864 					port++;
1865 					last = NULL;
1866 				}
1867 
1868 				GEM_BUG_ON(last &&
1869 					   !can_merge_ctx(last->hw_context,
1870 							  rq->hw_context));
1871 
1872 				submit = true;
1873 				last = rq;
1874 			}
1875 		}
1876 
1877 		rb_erase_cached(&p->node, &execlists->queue);
1878 		i915_priolist_free(p);
1879 	}
1880 
1881 done:
1882 	/*
1883 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1884 	 *
1885 	 * We choose the priority hint such that if we add a request of greater
1886 	 * priority than this, we kick the submission tasklet to decide on
1887 	 * the right order of submitting the requests to hardware. We must
1888 	 * also be prepared to reorder requests as they are in-flight on the
1889 	 * HW. We derive the priority hint then as the first "hole" in
1890 	 * the HW submission ports and if there are no available slots,
1891 	 * the priority of the lowest executing request, i.e. last.
1892 	 *
1893 	 * When we do receive a higher priority request ready to run from the
1894 	 * user, see queue_request(), the priority hint is bumped to that
1895 	 * request triggering preemption on the next dequeue (or subsequent
1896 	 * interrupt for secondary ports).
1897 	 */
1898 	execlists->queue_priority_hint = queue_prio(execlists);
1899 	GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1900 		  engine->name, execlists->queue_priority_hint,
1901 		  yesno(submit));
1902 
1903 	if (submit) {
1904 		*port = execlists_schedule_in(last, port - execlists->pending);
1905 		execlists->switch_priority_hint =
1906 			switch_prio(engine, *execlists->pending);
1907 
1908 		/*
1909 		 * Skip if we ended up with exactly the same set of requests,
1910 		 * e.g. trying to timeslice a pair of ordered contexts
1911 		 */
1912 		if (!memcmp(execlists->active, execlists->pending,
1913 			    (port - execlists->pending + 1) * sizeof(*port))) {
1914 			do
1915 				execlists_schedule_out(fetch_and_zero(port));
1916 			while (port-- != execlists->pending);
1917 
1918 			goto skip_submit;
1919 		}
1920 
1921 		memset(port + 1, 0, (last_port - port) * sizeof(*port));
1922 		execlists_submit_ports(engine);
1923 
1924 		set_preempt_timeout(engine);
1925 	} else {
1926 skip_submit:
1927 		ring_set_paused(engine, 0);
1928 	}
1929 }
1930 
1931 static void
1932 cancel_port_requests(struct intel_engine_execlists * const execlists)
1933 {
1934 	struct i915_request * const *port, *rq;
1935 
1936 	for (port = execlists->pending; (rq = *port); port++)
1937 		execlists_schedule_out(rq);
1938 	memset(execlists->pending, 0, sizeof(execlists->pending));
1939 
1940 	for (port = execlists->active; (rq = *port); port++)
1941 		execlists_schedule_out(rq);
1942 	execlists->active =
1943 		memset(execlists->inflight, 0, sizeof(execlists->inflight));
1944 }
1945 
1946 static inline void
1947 invalidate_csb_entries(const u32 *first, const u32 *last)
1948 {
1949 	clflush((void *)first);
1950 	clflush((void *)last);
1951 }
1952 
1953 static inline bool
1954 reset_in_progress(const struct intel_engine_execlists *execlists)
1955 {
1956 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1957 }
1958 
1959 /*
1960  * Starting with Gen12, the status has a new format:
1961  *
1962  *     bit  0:     switched to new queue
1963  *     bit  1:     reserved
1964  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1965  *                 switch detail is set to "wait on semaphore"
1966  *     bits 3-5:   engine class
1967  *     bits 6-11:  engine instance
1968  *     bits 12-14: reserved
1969  *     bits 15-25: sw context id of the lrc the GT switched to
1970  *     bits 26-31: sw counter of the lrc the GT switched to
1971  *     bits 32-35: context switch detail
1972  *                  - 0: ctx complete
1973  *                  - 1: wait on sync flip
1974  *                  - 2: wait on vblank
1975  *                  - 3: wait on scanline
1976  *                  - 4: wait on semaphore
1977  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1978  *                       WAIT_FOR_EVENT)
1979  *     bit  36:    reserved
1980  *     bits 37-43: wait detail (for switch detail 1 to 4)
1981  *     bits 44-46: reserved
1982  *     bits 47-57: sw context id of the lrc the GT switched away from
1983  *     bits 58-63: sw counter of the lrc the GT switched away from
1984  */
1985 static inline bool
1986 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1987 {
1988 	u32 lower_dw = csb[0];
1989 	u32 upper_dw = csb[1];
1990 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
1991 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
1992 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
1993 
1994 	/*
1995 	 * The context switch detail is not guaranteed to be 5 when a preemption
1996 	 * occurs, so we can't just check for that. The check below works for
1997 	 * all the cases we care about, including preemptions of WAIT
1998 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
1999 	 * would require some extra handling, but we don't support that.
2000 	 */
2001 	if (!ctx_away_valid || new_queue) {
2002 		GEM_BUG_ON(!ctx_to_valid);
2003 		return true;
2004 	}
2005 
2006 	/*
2007 	 * switch detail = 5 is covered by the case above and we do not expect a
2008 	 * context switch on an unsuccessful wait instruction since we always
2009 	 * use polling mode.
2010 	 */
2011 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2012 	return false;
2013 }
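
/*
 * Illustrative sketch only (not part of the driver): pulling a couple of the
 * fields documented above out of the two status dwords by hand. The parser
 * above relies on the GEN12_CSB_CTX_VALID and GEN12_CTX_SWITCH_DETAIL
 * helpers instead; the function names below are made up for the example.
 */
static inline u32 __maybe_unused example_csb_switch_detail(const u32 *csb)
{
	/* bits 32-35 of the status qword == bits 0-3 of the upper dword */
	return csb[1] & 0xf;
}

static inline u32 __maybe_unused example_csb_to_ctx_id(const u32 *csb)
{
	/* bits 15-25 of the status qword, taken from the lower dword */
	return (csb[0] >> 15) & 0x7ff;
}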
2014 
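/*
 * Pre-gen12 the CSB status dword is a bitmask of GEN8_CTX_STATUS_* flags:
 * an idle->active transition or a preemption event both tell process_csb()
 * to promote the pending[] submission into active[].
 */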
2015 static inline bool
2016 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2017 {
2018 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2019 }
2020 
2021 static void process_csb(struct intel_engine_cs *engine)
2022 {
2023 	struct intel_engine_execlists * const execlists = &engine->execlists;
2024 	const u32 * const buf = execlists->csb_status;
2025 	const u8 num_entries = execlists->csb_size;
2026 	u8 head, tail;
2027 
2028 	/*
2029 	 * As we modify our execlists state tracking we require exclusive
2030 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2031 	 * and we assume that is only inside the reset paths and so serialised.
2032 	 */
2033 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2034 		   !reset_in_progress(execlists));
2035 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2036 
2037 	/*
2038 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2039 	 * When reading from the csb_write mmio register, we have to be
2040 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2041 	 * the low 4 bits. As it happens we know the next 4 bits are always
2042 	 * zero and so we can simply mask off the low u8 of the register
2043 	 * and treat it identically to reading from the HWSP (without having
2044 	 * to use explicit shifting and masking, and probably bifurcating
2045 	 * the code to handle the legacy mmio read).
2046 	 */
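	/* The u8 'tail' below truncates the u32 read, making that mask implicit. */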
2047 	head = execlists->csb_head;
2048 	tail = READ_ONCE(*execlists->csb_write);
2049 	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
2050 	if (unlikely(head == tail))
2051 		return;
2052 
2053 	/*
2054 	 * Hopefully paired with a wmb() in HW!
2055 	 *
2056 	 * We must complete the read of the write pointer before any reads
2057 	 * from the CSB, so that we do not see stale values. Without an rmb
2058 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2059 	 * we perform the READ_ONCE(*csb_write).
2060 	 */
2061 	rmb();
2062 
2063 	do {
2064 		bool promote;
2065 
2066 		if (++head == num_entries)
2067 			head = 0;
2068 
2069 		/*
2070 		 * We are flying near dragons again.
2071 		 *
2072 		 * We hold a reference to the request in execlist_port[]
2073 		 * but no more than that. We are operating in softirq
2074 		 * context and so cannot hold any mutex or sleep. That
2075 		 * means we cannot prevent the requests we are processing
2076 		 * in port[] from being retired simultaneously (the
2077 		 * breadcrumb will be complete before we see the
2078 		 * context-switch). As we only hold the reference to the
2079 		 * request, any pointer chasing underneath the request
2080 		 * is subject to a potential use-after-free. Thus we
2081 		 * store all of the bookkeeping within port[] as
2082 		 * required, and avoid using unguarded pointers beneath
2083 		 * request itself. The same applies to the atomic
2084 		 * status notifier.
2085 		 */
2086 
2087 		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
2088 			  engine->name, head,
2089 			  buf[2 * head + 0], buf[2 * head + 1]);
2090 
2091 		if (INTEL_GEN(engine->i915) >= 12)
2092 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2093 		else
2094 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2095 		if (promote) {
2096 			if (!inject_preempt_hang(execlists))
2097 				ring_set_paused(engine, 0);
2098 
2099 			/* cancel old inflight, prepare for switch */
2100 			trace_ports(execlists, "preempted", execlists->active);
2101 			while (*execlists->active)
2102 				execlists_schedule_out(*execlists->active++);
2103 
2104 			/* switch pending to inflight */
2105 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2106 			execlists->active =
2107 				memcpy(execlists->inflight,
2108 				       execlists->pending,
2109 				       execlists_num_ports(execlists) *
2110 				       sizeof(*execlists->pending));
2111 
2112 			set_timeslice(engine);
2113 
2114 			WRITE_ONCE(execlists->pending[0], NULL);
2115 		} else {
2116 			GEM_BUG_ON(!*execlists->active);
2117 
2118 			/* port0 completed, advanced to port1 */
2119 			trace_ports(execlists, "completed", execlists->active);
2120 
2121 			/*
2122 			 * We rely on the hardware being strongly
2123 			 * ordered, that the breadcrumb write is
2124 			 * coherent (visible from the CPU) before the
2125 			 * user interrupt and CSB is processed.
2126 			 */
2127 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2128 				   !reset_in_progress(execlists));
2129 			execlists_schedule_out(*execlists->active++);
2130 
2131 			GEM_BUG_ON(execlists->active - execlists->inflight >
2132 				   execlists_num_ports(execlists));
2133 		}
2134 	} while (head != tail);
2135 
2136 	execlists->csb_head = head;
2137 
2138 	/*
2139 	 * Gen11 has proven to fail wrt global observation point between
2140 	 * entry and tail update, failing on the ordering and thus
2141 	 * we see an old entry in the context status buffer.
2142 	 *
2143 	 * Forcibly evict the cached entries before the next gpu csb update,
2144 	 * to increase the odds that we read fresh entries even from
2145 	 * non-working hardware. The cost of doing so comes out mostly in
2146 	 * the wash, as the hardware, working or not, will need to do the
2147 	 * invalidation beforehand.
2148 	 */
2149 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2150 }
2151 
2152 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2153 {
2154 	lockdep_assert_held(&engine->active.lock);
2155 	if (!engine->execlists.pending[0]) {
2156 		rcu_read_lock(); /* protect peeking at execlists->active */
2157 		execlists_dequeue(engine);
2158 		rcu_read_unlock();
2159 	}
2160 }
2161 
2162 static noinline void preempt_reset(struct intel_engine_cs *engine)
2163 {
2164 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2165 	unsigned long *lock = &engine->gt->reset.flags;
2166 
2167 	if (i915_modparams.reset < 3)
2168 		return;
2169 
2170 	if (test_and_set_bit(bit, lock))
2171 		return;
2172 
2173 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2174 	tasklet_disable_nosync(&engine->execlists.tasklet);
2175 
2176 	GEM_TRACE("%s: preempt timeout %lu+%ums\n",
2177 		  engine->name,
2178 		  READ_ONCE(engine->props.preempt_timeout_ms),
2179 		  jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2180 	intel_engine_reset(engine, "preemption time out");
2181 
2182 	tasklet_enable(&engine->execlists.tasklet);
2183 	clear_and_wake_up_bit(bit, lock);
2184 }
2185 
2186 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2187 {
2188 	const struct timer_list *t = &engine->execlists.preempt;
2189 
2190 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2191 		return false;
2192 
2193 	if (!timer_expired(t))
2194 		return false;
2195 
2196 	return READ_ONCE(engine->execlists.pending[0]);
2197 }
2198 
2199 /*
2200  * Check the unread Context Status Buffers and manage the submission of new
2201  * contexts to the ELSP accordingly.
2202  */
2203 static void execlists_submission_tasklet(unsigned long data)
2204 {
2205 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2206 	bool timeout = preempt_timeout(engine);
2207 
2208 	process_csb(engine);
2209 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2210 		unsigned long flags;
2211 
2212 		spin_lock_irqsave(&engine->active.lock, flags);
2213 		__execlists_submission_tasklet(engine);
2214 		spin_unlock_irqrestore(&engine->active.lock, flags);
2215 
2216 		/* Recheck after serialising with direct-submission */
2217 		if (timeout && preempt_timeout(engine))
2218 			preempt_reset(engine);
2219 	}
2220 }
2221 
2222 static void __execlists_kick(struct intel_engine_execlists *execlists)
2223 {
2224 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2225 	tasklet_hi_schedule(&execlists->tasklet);
2226 }
2227 
2228 #define execlists_kick(t, member) \
2229 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2230 
2231 static void execlists_timeslice(struct timer_list *timer)
2232 {
2233 	execlists_kick(timer, timer);
2234 }
2235 
2236 static void execlists_preempt(struct timer_list *timer)
2237 {
2238 	execlists_kick(timer, preempt);
2239 }
2240 
2241 static void queue_request(struct intel_engine_cs *engine,
2242 			  struct i915_sched_node *node,
2243 			  int prio)
2244 {
2245 	GEM_BUG_ON(!list_empty(&node->link));
2246 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2247 }
2248 
2249 static void __submit_queue_imm(struct intel_engine_cs *engine)
2250 {
2251 	struct intel_engine_execlists * const execlists = &engine->execlists;
2252 
2253 	if (reset_in_progress(execlists))
2254 		return; /* defer until we restart the engine following reset */
2255 
2256 	if (execlists->tasklet.func == execlists_submission_tasklet)
2257 		__execlists_submission_tasklet(engine);
2258 	else
2259 		tasklet_hi_schedule(&execlists->tasklet);
2260 }
2261 
2262 static void submit_queue(struct intel_engine_cs *engine,
2263 			 const struct i915_request *rq)
2264 {
2265 	struct intel_engine_execlists *execlists = &engine->execlists;
2266 
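	/*
	 * Only kick submission when this request raises the priority hint;
	 * anything at or below it will be picked up on the next natural
	 * dequeue (or CS event).
	 */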
2267 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2268 		return;
2269 
2270 	execlists->queue_priority_hint = rq_prio(rq);
2271 	__submit_queue_imm(engine);
2272 }
2273 
2274 static void execlists_submit_request(struct i915_request *request)
2275 {
2276 	struct intel_engine_cs *engine = request->engine;
2277 	unsigned long flags;
2278 
2279 	/* Will be called from irq-context when using foreign fences. */
2280 	spin_lock_irqsave(&engine->active.lock, flags);
2281 
2282 	queue_request(engine, &request->sched, rq_prio(request));
2283 
2284 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2285 	GEM_BUG_ON(list_empty(&request->sched.link));
2286 
2287 	submit_queue(engine, request);
2288 
2289 	spin_unlock_irqrestore(&engine->active.lock, flags);
2290 }
2291 
2292 static void __execlists_context_fini(struct intel_context *ce)
2293 {
2294 	intel_ring_put(ce->ring);
2295 	i915_vma_put(ce->state);
2296 }
2297 
2298 static void execlists_context_destroy(struct kref *kref)
2299 {
2300 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2301 
2302 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2303 	GEM_BUG_ON(intel_context_is_pinned(ce));
2304 
2305 	if (ce->state)
2306 		__execlists_context_fini(ce);
2307 
2308 	intel_context_fini(ce);
2309 	intel_context_free(ce);
2310 }
2311 
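/*
 * Debug-only redzone: with CONFIG_DRM_I915_DEBUG_GEM the page following
 * engine->context_size is filled with POISON_INUSE, and check_redzone()
 * (called from execlists_context_unpin() below) complains if anything has
 * scribbled past the end of the context image.
 */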
2312 static void
2313 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2314 {
2315 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2316 		return;
2317 
2318 	vaddr += engine->context_size;
2319 
2320 	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
2321 }
2322 
2323 static void
2324 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2325 {
2326 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2327 		return;
2328 
2329 	vaddr += engine->context_size;
2330 
2331 	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
2332 		dev_err_once(engine->i915->drm.dev,
2333 			     "%s context redzone overwritten!\n",
2334 			     engine->name);
2335 }
2336 
2337 static void execlists_context_unpin(struct intel_context *ce)
2338 {
2339 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2340 		      ce->engine);
2341 
2342 	i915_gem_object_unpin_map(ce->state->obj);
2343 	intel_ring_reset(ce->ring, ce->ring->tail);
2344 }
2345 
2346 static void
2347 __execlists_update_reg_state(const struct intel_context *ce,
2348 			     const struct intel_engine_cs *engine)
2349 {
2350 	struct intel_ring *ring = ce->ring;
2351 	u32 *regs = ce->lrc_reg_state;
2352 
2353 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2354 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2355 
2356 	regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma);
2357 	regs[CTX_RING_HEAD] = ring->head;
2358 	regs[CTX_RING_TAIL] = ring->tail;
2359 
2360 	/* RPCS */
2361 	if (engine->class == RENDER_CLASS) {
2362 		regs[CTX_R_PWR_CLK_STATE] =
2363 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2364 
2365 		i915_oa_init_reg_state(ce, engine);
2366 	}
2367 }
2368 
2369 static int
2370 __execlists_context_pin(struct intel_context *ce,
2371 			struct intel_engine_cs *engine)
2372 {
2373 	void *vaddr;
2374 	int ret;
2375 
2376 	GEM_BUG_ON(!ce->state);
2377 
2378 	ret = intel_context_active_acquire(ce);
2379 	if (ret)
2380 		goto err;
2381 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2382 
2383 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2384 					i915_coherent_map_type(engine->i915) |
2385 					I915_MAP_OVERRIDE);
2386 	if (IS_ERR(vaddr)) {
2387 		ret = PTR_ERR(vaddr);
2388 		goto unpin_active;
2389 	}
2390 
2391 	ce->lrc_desc = lrc_descriptor(ce, engine);
2392 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2393 	__execlists_update_reg_state(ce, engine);
2394 
2395 	return 0;
2396 
2397 unpin_active:
2398 	intel_context_active_release(ce);
2399 err:
2400 	return ret;
2401 }
2402 
2403 static int execlists_context_pin(struct intel_context *ce)
2404 {
2405 	return __execlists_context_pin(ce, ce->engine);
2406 }
2407 
2408 static int execlists_context_alloc(struct intel_context *ce)
2409 {
2410 	return __execlists_context_alloc(ce, ce->engine);
2411 }
2412 
2413 static void execlists_context_reset(struct intel_context *ce)
2414 {
2415 	/*
2416 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2417 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2418 	 * that stored in context. As we only write new commands from
2419 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2420 	 * starts reading its RING_HEAD from the context, it may try to
2421 	 * execute that junk and die.
2422 	 *
2423 	 * The contexts that are still pinned on resume belong to the
2424 	 * kernel, and are local to each engine. All other contexts will
2425 	 * have their head/tail sanitized upon pinning before use, so they
2426 	 * will never see garbage.
2427 	 *
2428 	 * So to avoid that we reset the context images upon resume. For
2429 	 * simplicity, we just zero everything out.
2430 	 */
2431 	intel_ring_reset(ce->ring, 0);
2432 	__execlists_update_reg_state(ce, ce->engine);
2433 }
2434 
2435 static const struct intel_context_ops execlists_context_ops = {
2436 	.alloc = execlists_context_alloc,
2437 
2438 	.pin = execlists_context_pin,
2439 	.unpin = execlists_context_unpin,
2440 
2441 	.enter = intel_context_enter_engine,
2442 	.exit = intel_context_exit_engine,
2443 
2444 	.reset = execlists_context_reset,
2445 	.destroy = execlists_context_destroy,
2446 };
2447 
2448 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2449 {
2450 	u32 *cs;
2451 
2452 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2453 
2454 	cs = intel_ring_begin(rq, 6);
2455 	if (IS_ERR(cs))
2456 		return PTR_ERR(cs);
2457 
2458 	/*
2459 	 * Check if we have been preempted before we even get started.
2460 	 *
2461 	 * After this point i915_request_started() reports true, even if
2462 	 * we get preempted and so are no longer running.
2463 	 */
2464 	*cs++ = MI_ARB_CHECK;
2465 	*cs++ = MI_NOOP;
2466 
2467 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
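	/*
	 * Advance the timeline hwsp to seqno-1: this is what makes
	 * i915_request_started() report true for this request from now on.
	 */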
2468 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2469 	*cs++ = 0;
2470 	*cs++ = rq->fence.seqno - 1;
2471 
2472 	intel_ring_advance(rq, cs);
2473 
2474 	/* Record the updated position of the request's payload */
2475 	rq->infix = intel_ring_offset(rq, cs);
2476 
2477 	return 0;
2478 }
2479 
2480 static int execlists_request_alloc(struct i915_request *request)
2481 {
2482 	int ret;
2483 
2484 	GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
2485 
2486 	/*
2487 	 * Flush enough space to reduce the likelihood of waiting after
2488 	 * we start building the request - in which case we will just
2489 	 * have to repeat work.
2490 	 */
2491 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2492 
2493 	/*
2494 	 * Note that after this point, we have committed to using
2495 	 * this request as it is being used to both track the
2496 	 * state of engine initialisation and liveness of the
2497 	 * golden renderstate above. Think twice before you try
2498 	 * to cancel/unwind this request now.
2499 	 */
2500 
2501 	/* Unconditionally invalidate GPU caches and TLBs. */
2502 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2503 	if (ret)
2504 		return ret;
2505 
2506 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2507 	return 0;
2508 }
2509 
2510 /*
2511  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2512  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2513  * but there is a slight complication as this is applied in WA batch where the
2514  * values are only initialized once so we cannot take register value at the
2515  * values are only initialized once, so we cannot take the register value at the
2516  * constant value with bit21 set and then we restore it back with the saved value.
2517  * To simplify the WA, a constant value is formed by using the default value
2518  * of this register. This shouldn't be a problem because we are only modifying
2519  * it for a short period and this batch is non-preemptible. We can of course
2520  * use additional instructions that read the actual value of the register
2521  * at that time and set our bit of interest but it makes the WA complicated.
2522  *
2523  * This WA is also required for Gen9 so extracting as a function avoids
2524  * code duplication.
2525  */
2526 static u32 *
2527 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2528 {
2529 	/* NB no one else is allowed to scribble over scratch + 256! */
2530 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2531 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2532 	*batch++ = intel_gt_scratch_offset(engine->gt,
2533 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2534 	*batch++ = 0;
2535 
2536 	*batch++ = MI_LOAD_REGISTER_IMM(1);
2537 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2538 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2539 
2540 	batch = gen8_emit_pipe_control(batch,
2541 				       PIPE_CONTROL_CS_STALL |
2542 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
2543 				       0);
2544 
2545 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2546 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2547 	*batch++ = intel_gt_scratch_offset(engine->gt,
2548 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2549 	*batch++ = 0;
2550 
2551 	return batch;
2552 }
2553 
2554 /*
2555  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2556  * initialized at the beginning and shared across all contexts but this field
2557  * helps us to have multiple batches at different offsets and select them based
2558  * on some criteria. At the moment this batch always starts at the beginning of the page
2559  * and at this point we don't have multiple wa_ctx batch buffers.
2560  *
2561  * The number of WAs applied is not known at the beginning; we use this field
2562  * to return the number of DWORDS written.
2563  *
2564  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2565  * so it adds NOOPs as padding to make it cacheline aligned.
2566  * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together
2567  * make a complete batch buffer.
2568  */
2569 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2570 {
2571 	/* WaDisableCtxRestoreArbitration:bdw,chv */
2572 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2573 
2574 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2575 	if (IS_BROADWELL(engine->i915))
2576 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2577 
2578 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2579 	/* Actual scratch location is at 128 bytes offset */
2580 	batch = gen8_emit_pipe_control(batch,
2581 				       PIPE_CONTROL_FLUSH_L3 |
2582 				       PIPE_CONTROL_STORE_DATA_INDEX |
2583 				       PIPE_CONTROL_CS_STALL |
2584 				       PIPE_CONTROL_QW_WRITE,
2585 				       LRC_PPHWSP_SCRATCH_ADDR);
2586 
2587 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2588 
2589 	/* Pad to end of cacheline */
2590 	while ((unsigned long)batch % CACHELINE_BYTES)
2591 		*batch++ = MI_NOOP;
2592 
2593 	/*
2594 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2595 	 * execution depends on the length specified in terms of cache lines
2596 	 * in the register CTX_RCS_INDIRECT_CTX
2597 	 */
2598 
2599 	return batch;
2600 }
2601 
2602 struct lri {
2603 	i915_reg_t reg;
2604 	u32 value;
2605 };
2606 
2607 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2608 {
2609 	GEM_BUG_ON(!count || count > 63);
2610 
2611 	*batch++ = MI_LOAD_REGISTER_IMM(count);
2612 	do {
2613 		*batch++ = i915_mmio_reg_offset(lri->reg);
2614 		*batch++ = lri->value;
2615 	} while (lri++, --count);
2616 	*batch++ = MI_NOOP;
2617 
2618 	return batch;
2619 }
2620 
2621 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2622 {
2623 	static const struct lri lri[] = {
2624 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2625 		{
2626 			COMMON_SLICE_CHICKEN2,
2627 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2628 				       0),
2629 		},
2630 
2631 		/* BSpec: 11391 */
2632 		{
2633 			FF_SLICE_CHICKEN,
2634 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2635 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2636 		},
2637 
2638 		/* BSpec: 11299 */
2639 		{
2640 			_3D_CHICKEN3,
2641 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2642 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2643 		}
2644 	};
2645 
2646 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2647 
2648 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2649 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2650 
2651 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2652 
2653 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
2654 	if (HAS_POOLED_EU(engine->i915)) {
2655 		/*
2656 		 * EU pool configuration is set up along with the golden context
2657 		 * during context initialization. This value depends on
2658 		 * device type (2x6 or 3x6) and needs to be updated based
2659 		 * on which subslice is disabled, especially for 2x6
2660 		 * devices; however, it is safe to load the default
2661 		 * configuration of a 3x6 device instead of masking off
2662 		 * the corresponding bits, because HW ignores bits of a disabled
2663 		 * subslice and drops down to appropriate config. Please
2664 		 * see render_state_setup() in i915_gem_render_state.c for
2665 		 * possible configurations, to avoid duplication they are
2666 		 * not shown here again.
2667 		 */
2668 		*batch++ = GEN9_MEDIA_POOL_STATE;
2669 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
2670 		*batch++ = 0x00777000;
2671 		*batch++ = 0;
2672 		*batch++ = 0;
2673 		*batch++ = 0;
2674 	}
2675 
2676 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2677 
2678 	/* Pad to end of cacheline */
2679 	while ((unsigned long)batch % CACHELINE_BYTES)
2680 		*batch++ = MI_NOOP;
2681 
2682 	return batch;
2683 }
2684 
2685 static u32 *
2686 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2687 {
2688 	int i;
2689 
2690 	/*
2691 	 * WaPipeControlBefore3DStateSamplePattern: cnl
2692 	 *
2693 	 * Ensure the engine is idle prior to programming a
2694 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2695 	 */
2696 	batch = gen8_emit_pipe_control(batch,
2697 				       PIPE_CONTROL_CS_STALL,
2698 				       0);
2699 	/*
2700 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2701 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2702 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2703 	 * confusing. Since gen8_emit_pipe_control() already advances the
2704 	 * batch by 6 dwords, we advance the other 10 here, completing a
2705 	 * cacheline. It's not clear if the workaround requires this padding
2706 	 * before other commands, or if it's just the regular padding we would
2707 	 * already have for the workaround bb, so leave it here for now.
2708 	 */
2709 	for (i = 0; i < 10; i++)
2710 		*batch++ = MI_NOOP;
2711 
2712 	/* Pad to end of cacheline */
2713 	while ((unsigned long)batch % CACHELINE_BYTES)
2714 		*batch++ = MI_NOOP;
2715 
2716 	return batch;
2717 }
2718 
2719 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2720 
2721 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2722 {
2723 	struct drm_i915_gem_object *obj;
2724 	struct i915_vma *vma;
2725 	int err;
2726 
2727 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2728 	if (IS_ERR(obj))
2729 		return PTR_ERR(obj);
2730 
2731 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2732 	if (IS_ERR(vma)) {
2733 		err = PTR_ERR(vma);
2734 		goto err;
2735 	}
2736 
2737 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2738 	if (err)
2739 		goto err;
2740 
2741 	engine->wa_ctx.vma = vma;
2742 	return 0;
2743 
2744 err:
2745 	i915_gem_object_put(obj);
2746 	return err;
2747 }
2748 
2749 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2750 {
2751 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2752 }
2753 
2754 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2755 
2756 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2757 {
2758 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2759 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2760 					    &wa_ctx->per_ctx };
2761 	wa_bb_func_t wa_bb_fn[2];
2762 	struct page *page;
2763 	void *batch, *batch_ptr;
2764 	unsigned int i;
2765 	int ret;
2766 
2767 	if (engine->class != RENDER_CLASS)
2768 		return 0;
2769 
2770 	switch (INTEL_GEN(engine->i915)) {
2771 	case 12:
2772 	case 11:
2773 		return 0;
2774 	case 10:
2775 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
2776 		wa_bb_fn[1] = NULL;
2777 		break;
2778 	case 9:
2779 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
2780 		wa_bb_fn[1] = NULL;
2781 		break;
2782 	case 8:
2783 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
2784 		wa_bb_fn[1] = NULL;
2785 		break;
2786 	default:
2787 		MISSING_CASE(INTEL_GEN(engine->i915));
2788 		return 0;
2789 	}
2790 
2791 	ret = lrc_setup_wa_ctx(engine);
2792 	if (ret) {
2793 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2794 		return ret;
2795 	}
2796 
2797 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2798 	batch = batch_ptr = kmap_atomic(page);
2799 
2800 	/*
2801 	 * Emit the two workaround batch buffers, recording the offset from the
2802 	 * start of the workaround batch buffer object for each and their
2803 	 * respective sizes.
2804 	 */
2805 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2806 		wa_bb[i]->offset = batch_ptr - batch;
2807 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2808 						  CACHELINE_BYTES))) {
2809 			ret = -EINVAL;
2810 			break;
2811 		}
2812 		if (wa_bb_fn[i])
2813 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2814 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2815 	}
2816 
2817 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2818 
2819 	kunmap_atomic(batch);
2820 	if (ret)
2821 		lrc_destroy_wa_ctx(engine);
2822 
2823 	return ret;
2824 }
2825 
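/*
 * Illustrative sketch only: what the masked-register helpers used in
 * enable_execlists() below boil down to. The upper 16 bits of the written
 * value select which of the lower 16 bits take effect, leaving unrelated
 * bits untouched. The driver's real macros are _MASKED_BIT_ENABLE() and
 * _MASKED_BIT_DISABLE(); the helper name here is made up.
 */
static inline u32 __maybe_unused example_masked_bit(u32 bit, bool enable)
{
	return (bit << 16) | (enable ? bit : 0);
}
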
2826 static void enable_execlists(struct intel_engine_cs *engine)
2827 {
2828 	u32 mode;
2829 
2830 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2831 
2832 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2833 
2834 	if (INTEL_GEN(engine->i915) >= 11)
2835 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2836 	else
2837 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2838 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2839 
2840 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2841 
2842 	ENGINE_WRITE_FW(engine,
2843 			RING_HWS_PGA,
2844 			i915_ggtt_offset(engine->status_page.vma));
2845 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2846 }
2847 
2848 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2849 {
2850 	bool unexpected = false;
2851 
2852 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2853 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2854 		unexpected = true;
2855 	}
2856 
2857 	return unexpected;
2858 }
2859 
2860 static int execlists_resume(struct intel_engine_cs *engine)
2861 {
2862 	intel_engine_apply_workarounds(engine);
2863 	intel_engine_apply_whitelist(engine);
2864 
2865 	intel_mocs_init_engine(engine);
2866 
2867 	intel_engine_reset_breadcrumbs(engine);
2868 
2869 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2870 		struct drm_printer p = drm_debug_printer(__func__);
2871 
2872 		intel_engine_dump(engine, &p, NULL);
2873 	}
2874 
2875 	enable_execlists(engine);
2876 
2877 	return 0;
2878 }
2879 
2880 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2881 {
2882 	struct intel_engine_execlists * const execlists = &engine->execlists;
2883 	unsigned long flags;
2884 
2885 	GEM_TRACE("%s: depth<-%d\n", engine->name,
2886 		  atomic_read(&execlists->tasklet.count));
2887 
2888 	/*
2889 	 * Prevent request submission to the hardware until we have
2890 	 * completed the reset in i915_gem_reset_finish(). If a request
2891 	 * is completed by one engine, it may then queue a request
2892 	 * to a second via its execlists->tasklet *just* as we are
2893 	 * calling engine->resume() and also writing the ELSP.
2894 	 * Turning off the execlists->tasklet until the reset is over
2895 	 * prevents the race.
2896 	 */
2897 	__tasklet_disable_sync_once(&execlists->tasklet);
2898 	GEM_BUG_ON(!reset_in_progress(execlists));
2899 
2900 	/* And flush any current direct submission. */
2901 	spin_lock_irqsave(&engine->active.lock, flags);
2902 	spin_unlock_irqrestore(&engine->active.lock, flags);
2903 
2904 	/*
2905 	 * We stop the engines, otherwise we might get a failed reset and a
2906 	 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
2907 	 * from a system hang if a batchbuffer is progressing when
2908 	 * the reset is issued, regardless of the READY_TO_RESET ack.
2909 	 * Thus assume it is best to stop the engines on all gens
2910 	 * where we have a gpu reset.
2911 	 *
2912 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2913 	 *
2914 	 * FIXME: Wa for more modern gens needs to be validated
2915 	 */
2916 	intel_engine_stop_cs(engine);
2917 }
2918 
2919 static void reset_csb_pointers(struct intel_engine_cs *engine)
2920 {
2921 	struct intel_engine_execlists * const execlists = &engine->execlists;
2922 	const unsigned int reset_value = execlists->csb_size - 1;
2923 
2924 	ring_set_paused(engine, 0);
2925 
2926 	/*
2927 	 * After a reset, the HW starts writing into CSB entry [0]. We
2928 	 * therefore have to set our HEAD pointer back one entry so that
2929 	 * the *first* entry we check is entry 0. To complicate this further,
2930 	 * as we don't wait for the first interrupt after reset, we have to
2931 	 * fake the HW write to point back to the last entry so that our
2932 	 * inline comparison of our cached head position against the last HW
2933 	 * write works even before the first interrupt.
2934 	 */
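	/*
	 * In other words, head == tail == csb_size - 1 reads as "empty", and
	 * the first ++head in process_csb() wraps straight back to entry 0.
	 */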
2935 	execlists->csb_head = reset_value;
2936 	WRITE_ONCE(*execlists->csb_write, reset_value);
2937 	wmb(); /* Make sure this is visible to HW (paranoia?) */
2938 
2939 	invalidate_csb_entries(&execlists->csb_status[0],
2940 			       &execlists->csb_status[reset_value]);
2941 }
2942 
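/*
 * Dword offset of the RING_MI_MODE load within the lrc register state (the
 * register value itself sits at offset + 1), or -1 if we do not know where
 * MI_MODE lives for this engine.
 */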
2943 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
2944 {
2945 	if (INTEL_GEN(engine->i915) >= 12)
2946 		return 0x60;
2947 	else if (INTEL_GEN(engine->i915) >= 9)
2948 		return 0x54;
2949 	else if (engine->class == RENDER_CLASS)
2950 		return 0x58;
2951 	else
2952 		return -1;
2953 }
2954 
2955 static void __execlists_reset_reg_state(const struct intel_context *ce,
2956 					const struct intel_engine_cs *engine)
2957 {
2958 	u32 *regs = ce->lrc_reg_state;
2959 	int x;
2960 
2961 	x = lrc_ring_mi_mode(engine);
2962 	if (x != -1) {
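		/*
		 * RING_MI_MODE is a masked register: the high word selects
		 * which bits of the low word are written, so this clears
		 * STOP_RING when the context image is next restored.
		 */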
2963 		regs[x + 1] &= ~STOP_RING;
2964 		regs[x + 1] |= STOP_RING << 16;
2965 	}
2966 }
2967 
2968 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2969 {
2970 	struct intel_engine_execlists * const execlists = &engine->execlists;
2971 	struct intel_context *ce;
2972 	struct i915_request *rq;
2973 
2974 	mb(); /* paranoia: read the CSB pointers from after the reset */
2975 	clflush(execlists->csb_write);
2976 	mb();
2977 
2978 	process_csb(engine); /* drain preemption events */
2979 
2980 	/* Following the reset, we need to reload the CSB read/write pointers */
2981 	reset_csb_pointers(engine);
2982 
2983 	/*
2984 	 * Save the currently executing context; even if we completed
2985 	 * its request, it was still running at the time of the
2986 	 * reset and will have been clobbered.
2987 	 */
2988 	rq = execlists_active(execlists);
2989 	if (!rq)
2990 		goto unwind;
2991 
2992 	/* We still have requests in-flight; the engine should be active */
2993 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
2994 
2995 	ce = rq->hw_context;
2996 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2997 
2998 	if (i915_request_completed(rq)) {
2999 		/* Idle context; tidy up the ring so we can restart afresh */
3000 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3001 		goto out_replay;
3002 	}
3003 
3004 	/* Context has requests still in-flight; it should not be idle! */
3005 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3006 	rq = active_request(ce->timeline, rq);
3007 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3008 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3009 
3010 	/*
3011 	 * If this request hasn't started yet, e.g. it is waiting on a
3012 	 * semaphore, we need to avoid skipping the request or else we
3013 	 * break the signaling chain. However, if the context is corrupt
3014 	 * the request will not restart and we will be stuck with a wedged
3015 	 * device. It is quite often the case that if we issue a reset
3016 	 * while the GPU is loading the context image, that the context
3017 	 * image becomes corrupt.
3018 	 *
3019 	 * Otherwise, if we have not started yet, the request should replay
3020 	 * perfectly and we do not need to flag the result as being erroneous.
3021 	 */
3022 	if (!i915_request_started(rq))
3023 		goto out_replay;
3024 
3025 	/*
3026 	 * If the request was innocent, we leave the request in the ELSP
3027 	 * and will try to replay it on restarting. The context image may
3028 	 * have been corrupted by the reset, in which case we may have
3029 	 * to service a new GPU hang, but more likely we can continue on
3030 	 * without impact.
3031 	 *
3032 	 * If the request was guilty, we presume the context is corrupt
3033 	 * and have to at least restore the RING register in the context
3034 	 * image back to the expected values to skip over the guilty request.
3035 	 */
3036 	__i915_request_reset(rq, stalled);
3037 	if (!stalled)
3038 		goto out_replay;
3039 
3040 	/*
3041 	 * We want a simple context + ring to execute the breadcrumb update.
3042 	 * We cannot rely on the context being intact across the GPU hang,
3043 	 * so clear it and rebuild just what we need for the breadcrumb.
3044 	 * All pending requests for this context will be zapped, and any
3045 	 * future request will be after userspace has had the opportunity
3046 	 * to recreate its own state.
3047 	 */
3048 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3049 	restore_default_state(ce, engine);
3050 
3051 out_replay:
3052 	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
3053 		  engine->name, ce->ring->head, ce->ring->tail);
3054 	intel_ring_update_space(ce->ring);
3055 	__execlists_reset_reg_state(ce, engine);
3056 	__execlists_update_reg_state(ce, engine);
3057 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3058 
3059 unwind:
3060 	/* Push back any incomplete requests for replay after the reset. */
3061 	cancel_port_requests(execlists);
3062 	__unwind_incomplete_requests(engine);
3063 }
3064 
3065 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
3066 {
3067 	unsigned long flags;
3068 
3069 	GEM_TRACE("%s\n", engine->name);
3070 
3071 	spin_lock_irqsave(&engine->active.lock, flags);
3072 
3073 	__execlists_reset(engine, stalled);
3074 
3075 	spin_unlock_irqrestore(&engine->active.lock, flags);
3076 }
3077 
3078 static void nop_submission_tasklet(unsigned long data)
3079 {
3080 	/* The driver is wedged; don't process any more events. */
3081 }
3082 
3083 static void execlists_cancel_requests(struct intel_engine_cs *engine)
3084 {
3085 	struct intel_engine_execlists * const execlists = &engine->execlists;
3086 	struct i915_request *rq, *rn;
3087 	struct rb_node *rb;
3088 	unsigned long flags;
3089 
3090 	GEM_TRACE("%s\n", engine->name);
3091 
3092 	/*
3093 	 * Before we call engine->cancel_requests(), we should have exclusive
3094 	 * access to the submission state. This is arranged for us by the
3095 	 * caller disabling the interrupt generation, the tasklet and other
3096 	 * threads that may then access the same state, giving us a free hand
3097 	 * to reset state. However, we still need to let lockdep be aware that
3098 	 * we know this state may be accessed in hardirq context, so we
3099 	 * disable the irq around this manipulation and we want to keep
3100 	 * the spinlock focused on its duties and not accidentally conflate
3101 	 * coverage to the submission's irq state. (Similarly, although we
3102 	 * shouldn't need to disable irq around the manipulation of the
3103 	 * submission's irq state, we also wish to remind ourselves that
3104 	 * it is irq state.)
3105 	 */
3106 	spin_lock_irqsave(&engine->active.lock, flags);
3107 
3108 	__execlists_reset(engine, true);
3109 
3110 	/* Mark all executing requests as skipped. */
3111 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3112 		mark_eio(rq);
3113 
3114 	/* Flush the queued requests to the timeline list (for retiring). */
3115 	while ((rb = rb_first_cached(&execlists->queue))) {
3116 		struct i915_priolist *p = to_priolist(rb);
3117 		int i;
3118 
3119 		priolist_for_each_request_consume(rq, rn, p, i) {
3120 			mark_eio(rq);
3121 			__i915_request_submit(rq);
3122 		}
3123 
3124 		rb_erase_cached(&p->node, &execlists->queue);
3125 		i915_priolist_free(p);
3126 	}
3127 
3128 	/* Cancel all attached virtual engines */
3129 	while ((rb = rb_first_cached(&execlists->virtual))) {
3130 		struct virtual_engine *ve =
3131 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3132 
3133 		rb_erase_cached(rb, &execlists->virtual);
3134 		RB_CLEAR_NODE(rb);
3135 
3136 		spin_lock(&ve->base.active.lock);
3137 		rq = fetch_and_zero(&ve->request);
3138 		if (rq) {
3139 			mark_eio(rq);
3140 
3141 			rq->engine = engine;
3142 			__i915_request_submit(rq);
3143 			i915_request_put(rq);
3144 
3145 			ve->base.execlists.queue_priority_hint = INT_MIN;
3146 		}
3147 		spin_unlock(&ve->base.active.lock);
3148 	}
3149 
3150 	/* Remaining _unready_ requests will be nop'ed when submitted */
3151 
3152 	execlists->queue_priority_hint = INT_MIN;
3153 	execlists->queue = RB_ROOT_CACHED;
3154 
3155 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3156 	execlists->tasklet.func = nop_submission_tasklet;
3157 
3158 	spin_unlock_irqrestore(&engine->active.lock, flags);
3159 }
3160 
3161 static void execlists_reset_finish(struct intel_engine_cs *engine)
3162 {
3163 	struct intel_engine_execlists * const execlists = &engine->execlists;
3164 
3165 	/*
3166 	 * After a GPU reset, we may have requests to replay. Do so now while
3167 	 * we still have the forcewake to be sure that the GPU is not allowed
3168 	 * to sleep before we restart and reload a context.
3169 	 */
3170 	GEM_BUG_ON(!reset_in_progress(execlists));
3171 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3172 		execlists->tasklet.func(execlists->tasklet.data);
3173 
3174 	if (__tasklet_enable(&execlists->tasklet))
3175 		/* And kick in case we missed a new request submission. */
3176 		tasklet_hi_schedule(&execlists->tasklet);
3177 	GEM_TRACE("%s: depth->%d\n", engine->name,
3178 		  atomic_read(&execlists->tasklet.count));
3179 }
3180 
3181 static int gen8_emit_bb_start(struct i915_request *rq,
3182 			      u64 offset, u32 len,
3183 			      const unsigned int flags)
3184 {
3185 	u32 *cs;
3186 
3187 	cs = intel_ring_begin(rq, 4);
3188 	if (IS_ERR(cs))
3189 		return PTR_ERR(cs);
3190 
3191 	/*
3192 	 * WaDisableCtxRestoreArbitration:bdw,chv
3193 	 *
3194 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3195 	 * particular all the gens that do not need the w/a at all!); if we
3196 	 * took care to make sure that on every switch into this context
3197 	 * (both ordinary and for preemption) arbitration was enabled,
3198 	 * we would be fine.  However, for gen8 there is another w/a that
3199 	 * requires us to not preempt inside GPGPU execution, so we keep
3200 	 * arbitration disabled for gen8 batches. Arbitration will be
3201 	 * re-enabled before we close the request
3202 	 * (engine->emit_fini_breadcrumb).
3203 	 */
3204 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3205 
3206 	/* FIXME(BDW+): Address space and security selectors. */
3207 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3208 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3209 	*cs++ = lower_32_bits(offset);
3210 	*cs++ = upper_32_bits(offset);
3211 
3212 	intel_ring_advance(rq, cs);
3213 
3214 	return 0;
3215 }
3216 
3217 static int gen9_emit_bb_start(struct i915_request *rq,
3218 			      u64 offset, u32 len,
3219 			      const unsigned int flags)
3220 {
3221 	u32 *cs;
3222 
3223 	cs = intel_ring_begin(rq, 6);
3224 	if (IS_ERR(cs))
3225 		return PTR_ERR(cs);
3226 
3227 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3228 
3229 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3230 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3231 	*cs++ = lower_32_bits(offset);
3232 	*cs++ = upper_32_bits(offset);
3233 
3234 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3235 	*cs++ = MI_NOOP;
3236 
3237 	intel_ring_advance(rq, cs);
3238 
3239 	return 0;
3240 }
3241 
3242 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3243 {
3244 	ENGINE_WRITE(engine, RING_IMR,
3245 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3246 	ENGINE_POSTING_READ(engine, RING_IMR);
3247 }
3248 
3249 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3250 {
3251 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3252 }
3253 
3254 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3255 {
3256 	u32 cmd, *cs;
3257 
3258 	cs = intel_ring_begin(request, 4);
3259 	if (IS_ERR(cs))
3260 		return PTR_ERR(cs);
3261 
3262 	cmd = MI_FLUSH_DW + 1;
3263 
3264 	/* We always require a command barrier so that subsequent
3265 	 * commands, such as breadcrumb interrupts, are strictly ordered
3266 	 * wrt the contents of the write cache being flushed to memory
3267 	 * (and thus being coherent from the CPU).
3268 	 */
3269 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3270 
3271 	if (mode & EMIT_INVALIDATE) {
3272 		cmd |= MI_INVALIDATE_TLB;
3273 		if (request->engine->class == VIDEO_DECODE_CLASS)
3274 			cmd |= MI_INVALIDATE_BSD;
3275 	}
3276 
3277 	*cs++ = cmd;
3278 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3279 	*cs++ = 0; /* upper addr */
3280 	*cs++ = 0; /* value */
3281 	intel_ring_advance(request, cs);
3282 
3283 	return 0;
3284 }
3285 
3286 static int gen8_emit_flush_render(struct i915_request *request,
3287 				  u32 mode)
3288 {
3289 	bool vf_flush_wa = false, dc_flush_wa = false;
3290 	u32 *cs, flags = 0;
3291 	int len;
3292 
3293 	flags |= PIPE_CONTROL_CS_STALL;
3294 
3295 	if (mode & EMIT_FLUSH) {
3296 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3297 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3298 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3299 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3300 	}
3301 
3302 	if (mode & EMIT_INVALIDATE) {
3303 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3304 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3305 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3306 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3307 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3308 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3309 		flags |= PIPE_CONTROL_QW_WRITE;
3310 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3311 
3312 		/*
3313 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3314 		 * pipe control.
3315 		 */
3316 		if (IS_GEN(request->i915, 9))
3317 			vf_flush_wa = true;
3318 
3319 		/* WaForGAMHang:kbl */
3320 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3321 			dc_flush_wa = true;
3322 	}
3323 
3324 	len = 6;
3325 
3326 	if (vf_flush_wa)
3327 		len += 6;
3328 
3329 	if (dc_flush_wa)
3330 		len += 12;
3331 
3332 	cs = intel_ring_begin(request, len);
3333 	if (IS_ERR(cs))
3334 		return PTR_ERR(cs);
3335 
3336 	if (vf_flush_wa)
3337 		cs = gen8_emit_pipe_control(cs, 0, 0);
3338 
3339 	if (dc_flush_wa)
3340 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3341 					    0);
3342 
3343 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3344 
3345 	if (dc_flush_wa)
3346 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3347 
3348 	intel_ring_advance(request, cs);
3349 
3350 	return 0;
3351 }
3352 
3353 static int gen11_emit_flush_render(struct i915_request *request,
3354 				   u32 mode)
3355 {
3356 	if (mode & EMIT_FLUSH) {
3357 		u32 *cs;
3358 		u32 flags = 0;
3359 
3360 		flags |= PIPE_CONTROL_CS_STALL;
3361 
3362 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3363 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3364 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3365 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3366 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3367 		flags |= PIPE_CONTROL_QW_WRITE;
3368 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3369 
3370 		cs = intel_ring_begin(request, 6);
3371 		if (IS_ERR(cs))
3372 			return PTR_ERR(cs);
3373 
3374 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3375 		intel_ring_advance(request, cs);
3376 	}
3377 
3378 	if (mode & EMIT_INVALIDATE) {
3379 		u32 *cs;
3380 		u32 flags = 0;
3381 
3382 		flags |= PIPE_CONTROL_CS_STALL;
3383 
3384 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3385 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3386 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3387 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3388 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3389 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3390 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3391 		flags |= PIPE_CONTROL_QW_WRITE;
3392 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3393 
3394 		cs = intel_ring_begin(request, 6);
3395 		if (IS_ERR(cs))
3396 			return PTR_ERR(cs);
3397 
3398 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3399 		intel_ring_advance(request, cs);
3400 	}
3401 
3402 	return 0;
3403 }
3404 
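/*
 * Gen12 extends MI_ARB_CHECK with a pre-parser (pre-fetch) disable toggle;
 * see the EMIT_INVALIDATE path in gen12_emit_flush_render() below for why we
 * need it around the TLB invalidation.
 */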
3405 static u32 preparser_disable(bool state)
3406 {
3407 	return MI_ARB_CHECK | 1 << 8 | state;
3408 }
3409 
3410 static int gen12_emit_flush_render(struct i915_request *request,
3411 				   u32 mode)
3412 {
3413 	if (mode & EMIT_FLUSH) {
3414 		u32 flags = 0;
3415 		u32 *cs;
3416 
3417 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3418 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3419 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3420 		/* Wa_1409600907:tgl */
3421 		flags |= PIPE_CONTROL_DEPTH_STALL;
3422 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3423 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3424 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3425 
3426 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3427 		flags |= PIPE_CONTROL_QW_WRITE;
3428 
3429 		flags |= PIPE_CONTROL_CS_STALL;
3430 
3431 		cs = intel_ring_begin(request, 6);
3432 		if (IS_ERR(cs))
3433 			return PTR_ERR(cs);
3434 
3435 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3436 		intel_ring_advance(request, cs);
3437 	}
3438 
3439 	if (mode & EMIT_INVALIDATE) {
3440 		u32 flags = 0;
3441 		u32 *cs;
3442 
3443 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3444 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3445 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3446 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3447 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3448 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3449 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3450 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3451 
3452 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3453 		flags |= PIPE_CONTROL_QW_WRITE;
3454 
3455 		flags |= PIPE_CONTROL_CS_STALL;
3456 
3457 		cs = intel_ring_begin(request, 8);
3458 		if (IS_ERR(cs))
3459 			return PTR_ERR(cs);
3460 
3461 		/*
3462 		 * Prevent the pre-parser from skipping past the TLB
3463 		 * invalidate and loading a stale page for the batch
3464 		 * buffer / request payload.
3465 		 */
3466 		*cs++ = preparser_disable(true);
3467 
3468 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3469 
3470 		*cs++ = preparser_disable(false);
3471 		intel_ring_advance(request, cs);
3472 
3473 		/*
3474 		 * Wa_1604544889:tgl
3475 		 */
3476 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3477 			flags = 0;
3478 			flags |= PIPE_CONTROL_CS_STALL;
3479 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3480 
3481 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3482 			flags |= PIPE_CONTROL_QW_WRITE;
3483 
3484 			cs = intel_ring_begin(request, 6);
3485 			if (IS_ERR(cs))
3486 				return PTR_ERR(cs);
3487 
3488 			cs = gen8_emit_pipe_control(cs, flags,
3489 						    LRC_PPHWSP_SCRATCH_ADDR);
3490 			intel_ring_advance(request, cs);
3491 		}
3492 	}
3493 
3494 	return 0;
3495 }
3496 
3497 /*
3498  * Reserve space for 2 NOOPs at the end of each request to be
3499  * used as a workaround for not being allowed to do lite
3500  * restore with HEAD==TAIL (WaIdleLiteRestore).
3501  */
3502 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3503 {
3504 	/* Ensure there's always at least one preemption point per-request. */
3505 	*cs++ = MI_ARB_CHECK;
3506 	*cs++ = MI_NOOP;
3507 	request->wa_tail = intel_ring_offset(request, cs);
3508 
3509 	return cs;
3510 }
3511 
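/*
 * Poll the preemption semaphore in the engine's HWSP: with SAD_EQ_SDD and
 * a semaphore data dword of 0, the wait completes once the dword reads
 * back as zero, i.e. once any preempt-to-busy pause has been released.
 */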
3512 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3513 {
3514 	*cs++ = MI_SEMAPHORE_WAIT |
3515 		MI_SEMAPHORE_GLOBAL_GTT |
3516 		MI_SEMAPHORE_POLL |
3517 		MI_SEMAPHORE_SAD_EQ_SDD;
3518 	*cs++ = 0;
3519 	*cs++ = intel_hws_preempt_address(request->engine);
3520 	*cs++ = 0;
3521 
3522 	return cs;
3523 }
3524 
3525 static __always_inline u32 *
3526 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3527 				 u32 *cs)
3528 {
3529 	*cs++ = MI_USER_INTERRUPT;
3530 
3531 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3532 	if (intel_engine_has_semaphores(request->engine))
3533 		cs = emit_preempt_busywait(request, cs);
3534 
3535 	request->tail = intel_ring_offset(request, cs);
3536 	assert_ring_tail_valid(request->ring, request->tail);
3537 
3538 	return gen8_emit_wa_tail(request, cs);
3539 }
3540 
3541 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3542 {
3543 	cs = gen8_emit_ggtt_write(cs,
3544 				  request->fence.seqno,
3545 				  i915_request_active_timeline(request)->hwsp_offset,
3546 				  0);
3547 
3548 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3549 }
3550 
3551 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3552 {
3553 	cs = gen8_emit_pipe_control(cs,
3554 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3555 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3556 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
3557 				    0);
3558 
3559 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3560 	cs = gen8_emit_ggtt_write_rcs(cs,
3561 				      request->fence.seqno,
3562 				      i915_request_active_timeline(request)->hwsp_offset,
3563 				      PIPE_CONTROL_FLUSH_ENABLE |
3564 				      PIPE_CONTROL_CS_STALL);
3565 
3566 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3567 }
3568 
3569 static u32 *
3570 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3571 {
3572 	cs = gen8_emit_ggtt_write_rcs(cs,
3573 				      request->fence.seqno,
3574 				      i915_request_active_timeline(request)->hwsp_offset,
3575 				      PIPE_CONTROL_CS_STALL |
3576 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3577 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3578 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3579 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3580 				      PIPE_CONTROL_FLUSH_ENABLE);
3581 
3582 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3583 }
3584 
3585 /*
3586  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3587  * flush and will continue pre-fetching the instructions after it before the
3588  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3589  * BB_START/END instructions, so, even though we might pre-fetch the preamble
3590  * of the next request before the memory has been flushed, we're guaranteed that
3591  * we won't access the batch itself too early.
3592  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3593  * so, if the current request is modifying an instruction in the next request on
3594  * the same intel_context, we might pre-fetch and then execute the pre-update
3595  * instruction. To avoid this, the users of self-modifying code should either
3596  * disable the parser around the code emitting the memory writes, via a new flag
3597  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3598  * the in-kernel use-cases we've opted to use a separate context, see
3599  * reloc_gpu() as an example.
3600  * All the above applies only to the instructions themselves. Non-inline data
3601  * used by the instructions is not pre-fetched.
3602  */
3603 
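/*
 * Gen12 uses the token variant of MI_SEMAPHORE_WAIT, which carries an
 * extra dword; the trailing MI_NOOP keeps the emission an even number
 * of dwords.
 */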
3604 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3605 {
3606 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3607 		MI_SEMAPHORE_GLOBAL_GTT |
3608 		MI_SEMAPHORE_POLL |
3609 		MI_SEMAPHORE_SAD_EQ_SDD;
3610 	*cs++ = 0;
3611 	*cs++ = intel_hws_preempt_address(request->engine);
3612 	*cs++ = 0;
3613 	*cs++ = 0;
3614 	*cs++ = MI_NOOP;
3615 
3616 	return cs;
3617 }
3618 
3619 static __always_inline u32 *
3620 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3621 {
3622 	*cs++ = MI_USER_INTERRUPT;
3623 
3624 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3625 	if (intel_engine_has_semaphores(request->engine))
3626 		cs = gen12_emit_preempt_busywait(request, cs);
3627 
3628 	request->tail = intel_ring_offset(request, cs);
3629 	assert_ring_tail_valid(request->ring, request->tail);
3630 
3631 	return gen8_emit_wa_tail(request, cs);
3632 }
3633 
3634 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3635 {
3636 	cs = gen8_emit_ggtt_write(cs,
3637 				  request->fence.seqno,
3638 				  i915_request_active_timeline(request)->hwsp_offset,
3639 				  0);
3640 
3641 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3642 }
3643 
3644 static u32 *
3645 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3646 {
3647 	cs = gen8_emit_ggtt_write_rcs(cs,
3648 				      request->fence.seqno,
3649 				      i915_request_active_timeline(request)->hwsp_offset,
3650 				      PIPE_CONTROL_CS_STALL |
3651 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3652 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3653 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3654 				      /* Wa_1409600907:tgl */
3655 				      PIPE_CONTROL_DEPTH_STALL |
3656 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3657 				      PIPE_CONTROL_FLUSH_ENABLE |
3658 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3659 
3660 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3661 }
3662 
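/* Stop the timeslice and preemption timers as the engine idles (parks). */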
3663 static void execlists_park(struct intel_engine_cs *engine)
3664 {
3665 	cancel_timer(&engine->execlists.timer);
3666 	cancel_timer(&engine->execlists.preempt);
3667 }
3668 
3669 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3670 {
3671 	engine->submit_request = execlists_submit_request;
3672 	engine->cancel_requests = execlists_cancel_requests;
3673 	engine->schedule = i915_schedule;
3674 	engine->execlists.tasklet.func = execlists_submission_tasklet;
3675 
3676 	engine->reset.prepare = execlists_reset_prepare;
3677 	engine->reset.reset = execlists_reset;
3678 	engine->reset.finish = execlists_reset_finish;
3679 
3680 	engine->park = execlists_park;
3681 	engine->unpark = NULL;
3682 
3683 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3684 	if (!intel_vgpu_active(engine->i915)) {
3685 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3686 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3687 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3688 	}
3689 
3690 	if (INTEL_GEN(engine->i915) >= 12)
3691 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3692 }
3693 
3694 static void execlists_destroy(struct intel_engine_cs *engine)
3695 {
3696 	intel_engine_cleanup_common(engine);
3697 	lrc_destroy_wa_ctx(engine);
3698 	kfree(engine);
3699 }
3700 
3701 static void
3702 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3703 {
3704 	/* Default vfuncs which can be overriden by each engine. */
3705 
3706 	engine->destroy = execlists_destroy;
3707 	engine->resume = execlists_resume;
3708 
3709 	engine->reset.prepare = execlists_reset_prepare;
3710 	engine->reset.reset = execlists_reset;
3711 	engine->reset.finish = execlists_reset_finish;
3712 
3713 	engine->cops = &execlists_context_ops;
3714 	engine->request_alloc = execlists_request_alloc;
3715 
3716 	engine->emit_flush = gen8_emit_flush;
3717 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3718 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3719 	if (INTEL_GEN(engine->i915) >= 12)
3720 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3721 
3722 	engine->set_default_submission = intel_execlists_set_default_submission;
3723 
3724 	if (INTEL_GEN(engine->i915) < 11) {
3725 		engine->irq_enable = gen8_logical_ring_enable_irq;
3726 		engine->irq_disable = gen8_logical_ring_disable_irq;
3727 	} else {
3728 		/*
3729 		 * TODO: On Gen11 interrupt masks need to be clear
3730 		 * to allow C6 entry. Keep interrupts enabled and take
3731 		 * the hit of generating extra interrupts
3732 		 * until a more refined solution exists.
3733 		 */
3734 	}
3735 	if (IS_GEN(engine->i915, 8))
3736 		engine->emit_bb_start = gen8_emit_bb_start;
3737 	else
3738 		engine->emit_bb_start = gen9_emit_bb_start;
3739 }
3740 
3741 static inline void
3742 logical_ring_default_irqs(struct intel_engine_cs *engine)
3743 {
3744 	unsigned int shift = 0;
3745 
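	/*
	 * Gen8-10 pack the engine interrupt bits into shared GT registers,
	 * hence the per-engine shifts; gen11+ uses per-engine interrupt
	 * registers, so no shift is needed.
	 */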
3746 	if (INTEL_GEN(engine->i915) < 11) {
3747 		const u8 irq_shifts[] = {
3748 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
3749 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
3750 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3751 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3752 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
3753 		};
3754 
3755 		shift = irq_shifts[engine->id];
3756 	}
3757 
3758 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3759 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3760 }
3761 
3762 static void rcs_submission_override(struct intel_engine_cs *engine)
3763 {
3764 	switch (INTEL_GEN(engine->i915)) {
3765 	case 12:
3766 		engine->emit_flush = gen12_emit_flush_render;
3767 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3768 		break;
3769 	case 11:
3770 		engine->emit_flush = gen11_emit_flush_render;
3771 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3772 		break;
3773 	default:
3774 		engine->emit_flush = gen8_emit_flush_render;
3775 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3776 		break;
3777 	}
3778 }
3779 
3780 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3781 {
3782 	tasklet_init(&engine->execlists.tasklet,
3783 		     execlists_submission_tasklet, (unsigned long)engine);
3784 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3785 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3786 
3787 	logical_ring_default_vfuncs(engine);
3788 	logical_ring_default_irqs(engine);
3789 
3790 	if (engine->class == RENDER_CLASS)
3791 		rcs_submission_override(engine);
3792 
3793 	return 0;
3794 }
3795 
3796 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3797 {
3798 	struct intel_engine_execlists * const execlists = &engine->execlists;
3799 	struct drm_i915_private *i915 = engine->i915;
3800 	struct intel_uncore *uncore = engine->uncore;
3801 	u32 base = engine->mmio_base;
3802 	int ret;
3803 
3804 	ret = intel_engine_init_common(engine);
3805 	if (ret)
3806 		return ret;
3807 
3808 	if (intel_init_workaround_bb(engine))
3809 		/*
3810 		 * We continue even if we fail to initialize WA batch
3811 		 * because we only expect rare glitches, nothing
3812 		 * critical that would prevent us from using the GPU.
3813 		 */
3814 		DRM_ERROR("WA batch buffer initialization failed\n");
3815 
3816 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3817 		execlists->submit_reg = uncore->regs +
3818 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3819 		execlists->ctrl_reg = uncore->regs +
3820 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3821 	} else {
3822 		execlists->submit_reg = uncore->regs +
3823 			i915_mmio_reg_offset(RING_ELSP(base));
3824 	}
3825 
3826 	execlists->csb_status =
3827 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3828 
3829 	execlists->csb_write =
3830 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
3831 
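	/* The CSB grew from 6 entries on gen8-10 to 12 on gen11+. */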
3832 	if (INTEL_GEN(i915) < 11)
3833 		execlists->csb_size = GEN8_CSB_ENTRIES;
3834 	else
3835 		execlists->csb_size = GEN11_CSB_ENTRIES;
3836 
3837 	reset_csb_pointers(engine);
3838 
3839 	return 0;
3840 }
3841 
3842 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
3843 {
3844 	u32 indirect_ctx_offset;
3845 
3846 	switch (INTEL_GEN(engine->i915)) {
3847 	default:
3848 		MISSING_CASE(INTEL_GEN(engine->i915));
3849 		/* fall through */
3850 	case 12:
3851 		indirect_ctx_offset =
3852 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3853 		break;
3854 	case 11:
3855 		indirect_ctx_offset =
3856 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3857 		break;
3858 	case 10:
3859 		indirect_ctx_offset =
3860 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3861 		break;
3862 	case 9:
3863 		indirect_ctx_offset =
3864 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3865 		break;
3866 	case 8:
3867 		indirect_ctx_offset =
3868 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3869 		break;
3870 	}
3871 
3872 	return indirect_ctx_offset;
3873 }
3874 
3876 static void init_common_reg_state(u32 * const regs,
3877 				  const struct intel_engine_cs *engine,
3878 				  const struct intel_ring *ring)
3879 {
3880 	regs[CTX_CONTEXT_CONTROL] =
3881 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3882 		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
3883 	if (INTEL_GEN(engine->i915) < 11)
3884 		regs[CTX_CONTEXT_CONTROL] |=
3885 			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3886 					    CTX_CTRL_RS_CTX_ENABLE);
3887 
3888 	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3889 	regs[CTX_BB_STATE] = RING_BB_PPGTT;
3890 }
3891 
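/*
 * Point the context image at the workaround batch buffers. The low bits of
 * these pointers carry encodings: bit 0 of the per-context pointer is taken
 * to be its valid bit, and the indirect-ctx size is expressed in cachelines
 * alongside its address.
 */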
3892 static void init_wa_bb_reg_state(u32 * const regs,
3893 				 const struct intel_engine_cs *engine,
3894 				 u32 pos_bb_per_ctx)
3895 {
3896 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
3897 
3898 	if (wa_ctx->per_ctx.size) {
3899 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3900 
3901 		regs[pos_bb_per_ctx] =
3902 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3903 	}
3904 
3905 	if (wa_ctx->indirect_ctx.size) {
3906 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3907 
3908 		regs[pos_bb_per_ctx + 2] =
3909 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
3910 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3911 
3912 		regs[pos_bb_per_ctx + 4] =
3913 			intel_lr_indirect_ctx_offset(engine) << 6;
3914 	}
3915 }
3916 
3917 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
3918 {
3919 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
3920 		/* 64b PPGTT (48bit canonical)
3921 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
3922 		 * other PDP Descriptors are ignored.
3923 		 */
3924 		ASSIGN_CTX_PML4(ppgtt, regs);
3925 	} else {
3926 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
3927 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
3928 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
3929 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
3930 	}
3931 }
3932 
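/*
 * Contexts bound to the GGTT use its aliasing PPGTT for their page
 * directories; a full PPGTT is used directly.
 */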
3933 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
3934 {
3935 	if (i915_is_ggtt(vm))
3936 		return i915_vm_to_ggtt(vm)->alias;
3937 	else
3938 		return i915_vm_to_ppgtt(vm);
3939 }
3940 
3941 static void execlists_init_reg_state(u32 *regs,
3942 				     const struct intel_context *ce,
3943 				     const struct intel_engine_cs *engine,
3944 				     const struct intel_ring *ring,
3945 				     bool close)
3946 {
3947 	/*
3948 	 * A context is actually a big batch buffer with several
3949 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3950 	 * values we are setting here are only for the first context restore:
3951 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
3952 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3953 	 * we are not initializing here).
3954 	 *
3955 	 * Must keep consistent with virtual_update_register_offsets().
3956 	 */
3957 	u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);
3958 
3959 	if (close) { /* Close the batch; used mainly by live_lrc_layout() */
3960 		*bbe = MI_BATCH_BUFFER_END;
3961 		if (INTEL_GEN(engine->i915) >= 10)
3962 			*bbe |= BIT(0);
3963 	}
3964 
3965 	init_common_reg_state(regs, engine, ring);
3966 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
3967 
3968 	init_wa_bb_reg_state(regs, engine,
3969 			     INTEL_GEN(engine->i915) >= 12 ?
3970 			     GEN12_CTX_BB_PER_CTX_PTR :
3971 			     CTX_BB_PER_CTX_PTR);
3972 }
3973 
3974 static int
3975 populate_lr_context(struct intel_context *ce,
3976 		    struct drm_i915_gem_object *ctx_obj,
3977 		    struct intel_engine_cs *engine,
3978 		    struct intel_ring *ring)
3979 {
3980 	bool inhibit = true;
3981 	void *vaddr;
3982 	u32 *regs;
3983 	int ret;
3984 
3985 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3986 	if (IS_ERR(vaddr)) {
3987 		ret = PTR_ERR(vaddr);
3988 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3989 		return ret;
3990 	}
3991 
3992 	set_redzone(vaddr, engine);
3993 
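	/*
	 * Prefer the engine's recorded default ("golden") context image as
	 * the initial state; without one, leave inhibit set so the first
	 * context restore is inhibited below.
	 */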
3994 	if (engine->default_state) {
3995 		void *defaults;
3996 
3997 		defaults = i915_gem_object_pin_map(engine->default_state,
3998 						   I915_MAP_WB);
3999 		if (IS_ERR(defaults)) {
4000 			ret = PTR_ERR(defaults);
4001 			goto err_unpin_ctx;
4002 		}
4003 
4004 		memcpy(vaddr, defaults, engine->context_size);
4005 		i915_gem_object_unpin_map(engine->default_state);
4006 		inhibit = false;
4007 	}
4008 
4009 	/* The second page of the context object contains some fields which must
4010 	 * be set up prior to the first execution. */
4011 	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
4012 	execlists_init_reg_state(regs, ce, engine, ring, inhibit);
4013 	if (inhibit)
4014 		regs[CTX_CONTEXT_CONTROL] |=
4015 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4016 
4017 	ret = 0;
4018 err_unpin_ctx:
4019 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4020 	i915_gem_object_unpin_map(ctx_obj);
4021 	return ret;
4022 }
4023 
4024 static int __execlists_context_alloc(struct intel_context *ce,
4025 				     struct intel_engine_cs *engine)
4026 {
4027 	struct drm_i915_gem_object *ctx_obj;
4028 	struct intel_ring *ring;
4029 	struct i915_vma *vma;
4030 	u32 context_size;
4031 	int ret;
4032 
4033 	GEM_BUG_ON(ce->state);
4034 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4035 
4036 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4037 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4038 
4039 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4040 	if (IS_ERR(ctx_obj))
4041 		return PTR_ERR(ctx_obj);
4042 
4043 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4044 	if (IS_ERR(vma)) {
4045 		ret = PTR_ERR(vma);
4046 		goto error_deref_obj;
4047 	}
4048 
4049 	if (!ce->timeline) {
4050 		struct intel_timeline *tl;
4051 
4052 		tl = intel_timeline_create(engine->gt, NULL);
4053 		if (IS_ERR(tl)) {
4054 			ret = PTR_ERR(tl);
4055 			goto error_deref_obj;
4056 		}
4057 
4058 		ce->timeline = tl;
4059 	}
4060 
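	/*
	 * Note: until the ring is created, ce->ring holds the requested ring
	 * size encoded as a pointer value, which is what is decoded here.
	 */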
4061 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4062 	if (IS_ERR(ring)) {
4063 		ret = PTR_ERR(ring);
4064 		goto error_deref_obj;
4065 	}
4066 
4067 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4068 	if (ret) {
4069 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4070 		goto error_ring_free;
4071 	}
4072 
4073 	ce->ring = ring;
4074 	ce->state = vma;
4075 
4076 	return 0;
4077 
4078 error_ring_free:
4079 	intel_ring_put(ring);
4080 error_deref_obj:
4081 	i915_gem_object_put(ctx_obj);
4082 	return ret;
4083 }
4084 
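/*
 * A virtual engine queues at most one request at a time; the request list of
 * its otherwise unused default priolist serves as that single-entry queue.
 */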
4085 static struct list_head *virtual_queue(struct virtual_engine *ve)
4086 {
4087 	return &ve->base.execlists.default_priolist.requests[0];
4088 }
4089 
4090 static void virtual_context_destroy(struct kref *kref)
4091 {
4092 	struct virtual_engine *ve =
4093 		container_of(kref, typeof(*ve), context.ref);
4094 	unsigned int n;
4095 
4096 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4097 	GEM_BUG_ON(ve->request);
4098 	GEM_BUG_ON(ve->context.inflight);
4099 
4100 	for (n = 0; n < ve->num_siblings; n++) {
4101 		struct intel_engine_cs *sibling = ve->siblings[n];
4102 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4103 
4104 		if (RB_EMPTY_NODE(node))
4105 			continue;
4106 
4107 		spin_lock_irq(&sibling->active.lock);
4108 
4109 		/* Detachment is lazily performed in the execlists tasklet */
4110 		if (!RB_EMPTY_NODE(node))
4111 			rb_erase_cached(node, &sibling->execlists.virtual);
4112 
4113 		spin_unlock_irq(&sibling->active.lock);
4114 	}
4115 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4116 
4117 	if (ve->context.state)
4118 		__execlists_context_fini(&ve->context);
4119 	intel_context_fini(&ve->context);
4120 
4121 	kfree(ve->bonds);
4122 	kfree(ve);
4123 }
4124 
4125 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4126 {
4127 	int swp;
4128 
4129 	/*
4130 	 * Pick a random sibling when starting, to help spread the load around.
4131 	 *
4132 	 * New contexts are typically created with exactly the same order
4133 	 * of siblings, and often started in batches. Due to the way we iterate
4134 	 * the array of sibling when submitting requests, sibling[0] is
4135 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4136 	 * randomised across the system, we also help spread the load by the
4137 	 * first engine we inspect being different each time.
4138 	 *
4139 	 * NB This does not force us to execute on this engine, it will just
4140 	 * typically be the first we inspect for submission.
4141 	 */
4142 	swp = prandom_u32_max(ve->num_siblings);
4143 	if (!swp)
4144 		return;
4145 
4146 	swap(ve->siblings[swp], ve->siblings[0]);
4147 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4148 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4149 						ve->siblings[0]);
4150 }
4151 
4152 static int virtual_context_pin(struct intel_context *ce)
4153 {
4154 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4155 	int err;
4156 
4157 	/* Note: we must use a real engine class for setting up reg state */
4158 	err = __execlists_context_pin(ce, ve->siblings[0]);
4159 	if (err)
4160 		return err;
4161 
4162 	virtual_engine_initial_hint(ve);
4163 	return 0;
4164 }
4165 
4166 static void virtual_context_enter(struct intel_context *ce)
4167 {
4168 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4169 	unsigned int n;
4170 
4171 	for (n = 0; n < ve->num_siblings; n++)
4172 		intel_engine_pm_get(ve->siblings[n]);
4173 
4174 	intel_timeline_enter(ce->timeline);
4175 }
4176 
4177 static void virtual_context_exit(struct intel_context *ce)
4178 {
4179 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4180 	unsigned int n;
4181 
4182 	intel_timeline_exit(ce->timeline);
4183 
4184 	for (n = 0; n < ve->num_siblings; n++)
4185 		intel_engine_pm_put(ve->siblings[n]);
4186 }
4187 
4188 static const struct intel_context_ops virtual_context_ops = {
4189 	.pin = virtual_context_pin,
4190 	.unpin = execlists_context_unpin,
4191 
4192 	.enter = virtual_context_enter,
4193 	.exit = virtual_context_exit,
4194 
4195 	.destroy = virtual_context_destroy,
4196 };
4197 
4198 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4199 {
4200 	struct i915_request *rq;
4201 	intel_engine_mask_t mask;
4202 
4203 	rq = READ_ONCE(ve->request);
4204 	if (!rq)
4205 		return 0;
4206 
4207 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4208 	mask = rq->execution_mask;
4209 	if (unlikely(!mask)) {
4210 		/* Invalid selection, submit to a random engine in error */
4211 		i915_request_skip(rq, -ENODEV);
4212 		mask = ve->siblings[0]->mask;
4213 	}
4214 
4215 	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
4216 		  ve->base.name,
4217 		  rq->fence.context, rq->fence.seqno,
4218 		  mask, ve->base.execlists.queue_priority_hint);
4219 
4220 	return mask;
4221 }
4222 
4223 static void virtual_submission_tasklet(unsigned long data)
4224 {
4225 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4226 	const int prio = ve->base.execlists.queue_priority_hint;
4227 	intel_engine_mask_t mask;
4228 	unsigned int n;
4229 
4230 	rcu_read_lock();
4231 	mask = virtual_submission_mask(ve);
4232 	rcu_read_unlock();
4233 	if (unlikely(!mask))
4234 		return;
4235 
4236 	local_irq_disable();
4237 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4238 		struct intel_engine_cs *sibling = ve->siblings[n];
4239 		struct ve_node * const node = &ve->nodes[sibling->id];
4240 		struct rb_node **parent, *rb;
4241 		bool first;
4242 
4243 		if (unlikely(!(mask & sibling->mask))) {
4244 			if (!RB_EMPTY_NODE(&node->rb)) {
4245 				spin_lock(&sibling->active.lock);
4246 				rb_erase_cached(&node->rb,
4247 						&sibling->execlists.virtual);
4248 				RB_CLEAR_NODE(&node->rb);
4249 				spin_unlock(&sibling->active.lock);
4250 			}
4251 			continue;
4252 		}
4253 
4254 		spin_lock(&sibling->active.lock);
4255 
4256 		if (!RB_EMPTY_NODE(&node->rb)) {
4257 			/*
4258 			 * Cheat and avoid rebalancing the tree if we can
4259 			 * reuse this node in situ.
4260 			 */
4261 			first = rb_first_cached(&sibling->execlists.virtual) ==
4262 				&node->rb;
4263 			if (prio == node->prio || (prio > node->prio && first))
4264 				goto submit_engine;
4265 
4266 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4267 		}
4268 
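		/*
		 * Insert keyed on priority: higher priority to the left so
		 * that rb_first_cached() returns the highest priority node,
		 * with equal priorities kept in FIFO order to the right.
		 */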
4269 		rb = NULL;
4270 		first = true;
4271 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4272 		while (*parent) {
4273 			struct ve_node *other;
4274 
4275 			rb = *parent;
4276 			other = rb_entry(rb, typeof(*other), rb);
4277 			if (prio > other->prio) {
4278 				parent = &rb->rb_left;
4279 			} else {
4280 				parent = &rb->rb_right;
4281 				first = false;
4282 			}
4283 		}
4284 
4285 		rb_link_node(&node->rb, rb, parent);
4286 		rb_insert_color_cached(&node->rb,
4287 				       &sibling->execlists.virtual,
4288 				       first);
4289 
4290 submit_engine:
4291 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4292 		node->prio = prio;
4293 		if (first && prio > sibling->execlists.queue_priority_hint) {
4294 			sibling->execlists.queue_priority_hint = prio;
4295 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4296 		}
4297 
4298 		spin_unlock(&sibling->active.lock);
4299 	}
4300 	local_irq_enable();
4301 }
4302 
4303 static void virtual_submit_request(struct i915_request *rq)
4304 {
4305 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4306 	struct i915_request *old;
4307 	unsigned long flags;
4308 
4309 	GEM_TRACE("%s: rq=%llx:%lld\n",
4310 		  ve->base.name,
4311 		  rq->fence.context,
4312 		  rq->fence.seqno);
4313 
4314 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4315 
4316 	spin_lock_irqsave(&ve->base.active.lock, flags);
4317 
4318 	old = ve->request;
4319 	if (old) { /* background completion event from preempt-to-busy */
4320 		GEM_BUG_ON(!i915_request_completed(old));
4321 		__i915_request_submit(old);
4322 		i915_request_put(old);
4323 	}
4324 
4325 	if (i915_request_completed(rq)) {
4326 		__i915_request_submit(rq);
4327 
4328 		ve->base.execlists.queue_priority_hint = INT_MIN;
4329 		ve->request = NULL;
4330 	} else {
4331 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4332 		ve->request = i915_request_get(rq);
4333 
4334 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4335 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4336 
4337 		tasklet_schedule(&ve->base.execlists.tasklet);
4338 	}
4339 
4340 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4341 }
4342 
4343 static struct ve_bond *
4344 virtual_find_bond(struct virtual_engine *ve,
4345 		  const struct intel_engine_cs *master)
4346 {
4347 	int i;
4348 
4349 	for (i = 0; i < ve->num_bonds; i++) {
4350 		if (ve->bonds[i].master == master)
4351 			return &ve->bonds[i];
4352 	}
4353 
4354 	return NULL;
4355 }
4356 
4357 static void
4358 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4359 {
4360 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4361 	intel_engine_mask_t allowed, exec;
4362 	struct ve_bond *bond;
4363 
4364 	allowed = ~to_request(signal)->engine->mask;
4365 
4366 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4367 	if (bond)
4368 		allowed &= bond->sibling_mask;
4369 
4370 	/* Restrict the bonded request to run on only the available engines */
4371 	exec = READ_ONCE(rq->execution_mask);
4372 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4373 		;
4374 
4375 	/* Prevent the master from being re-run on the bonded engines */
4376 	to_request(signal)->execution_mask &= ~allowed;
4377 }
4378 
4379 struct intel_context *
4380 intel_execlists_create_virtual(struct i915_gem_context *ctx,
4381 			       struct intel_engine_cs **siblings,
4382 			       unsigned int count)
4383 {
4384 	struct virtual_engine *ve;
4385 	unsigned int n;
4386 	int err;
4387 
4388 	if (count == 0)
4389 		return ERR_PTR(-EINVAL);
4390 
4391 	if (count == 1)
4392 		return intel_context_create(ctx, siblings[0]);
4393 
4394 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4395 	if (!ve)
4396 		return ERR_PTR(-ENOMEM);
4397 
4398 	ve->base.i915 = ctx->i915;
4399 	ve->base.gt = siblings[0]->gt;
4400 	ve->base.uncore = siblings[0]->uncore;
4401 	ve->base.id = -1;
4402 	ve->base.class = OTHER_CLASS;
4403 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4404 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4405 
4406 	/*
4407 	 * The decision on whether to submit a request using semaphores
4408 	 * depends on the saturated state of the engine. We only compute
4409 	 * this during HW submission of the request, and we need this
4410 	 * state to be globally applied to all requests being submitted
4411 	 * to this engine. Virtual engines encompass more than one physical
4412 	 * engine and so we cannot accurately tell in advance if one of those
4413 	 * engines is already saturated and so cannot afford to use a semaphore
4414 	 * and be pessimized in priority for doing so -- if we are the only
4415 	 * context using semaphores after all other clients have stopped, we
4416 	 * will be starved on the saturated system. Such a global switch for
4417 	 * semaphores is less than ideal, but alas is the current compromise.
4418 	 */
4419 	ve->base.saturated = ALL_ENGINES;
4420 
4421 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4422 
4423 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4424 	intel_engine_init_breadcrumbs(&ve->base);
4425 
4426 	intel_engine_init_execlists(&ve->base);
4427 
4428 	ve->base.cops = &virtual_context_ops;
4429 	ve->base.request_alloc = execlists_request_alloc;
4430 
4431 	ve->base.schedule = i915_schedule;
4432 	ve->base.submit_request = virtual_submit_request;
4433 	ve->base.bond_execute = virtual_bond_execute;
4434 
4435 	INIT_LIST_HEAD(virtual_queue(ve));
4436 	ve->base.execlists.queue_priority_hint = INT_MIN;
4437 	tasklet_init(&ve->base.execlists.tasklet,
4438 		     virtual_submission_tasklet,
4439 		     (unsigned long)ve);
4440 
4441 	intel_context_init(&ve->context, ctx, &ve->base);
4442 
4443 	for (n = 0; n < count; n++) {
4444 		struct intel_engine_cs *sibling = siblings[n];
4445 
4446 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4447 		if (sibling->mask & ve->base.mask) {
4448 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4449 				  sibling->name);
4450 			err = -EINVAL;
4451 			goto err_put;
4452 		}
4453 
4454 		/*
4455 		 * The virtual engine implementation is tightly coupled to
4456 		 * the execlists backend -- we push out requests directly
4457 		 * into a tree inside each physical engine. We could support
4458 		 * layering if we handle cloning of the requests and
4459 		 * submitting a copy into each backend.
4460 		 */
4461 		if (sibling->execlists.tasklet.func !=
4462 		    execlists_submission_tasklet) {
4463 			err = -ENODEV;
4464 			goto err_put;
4465 		}
4466 
4467 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4468 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4469 
4470 		ve->siblings[ve->num_siblings++] = sibling;
4471 		ve->base.mask |= sibling->mask;
4472 
4473 		/*
4474 		 * All physical engines must be compatible for their emission
4475 		 * functions (as we build the instructions during request
4476 		 * construction and do not alter them before submission
4477 		 * on the physical engine). We use the engine class as a guide
4478 		 * here, although that could be refined.
4479 		 */
4480 		if (ve->base.class != OTHER_CLASS) {
4481 			if (ve->base.class != sibling->class) {
4482 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4483 					  sibling->class, ve->base.class);
4484 				err = -EINVAL;
4485 				goto err_put;
4486 			}
4487 			continue;
4488 		}
4489 
4490 		ve->base.class = sibling->class;
4491 		ve->base.uabi_class = sibling->uabi_class;
4492 		snprintf(ve->base.name, sizeof(ve->base.name),
4493 			 "v%dx%d", ve->base.class, count);
4494 		ve->base.context_size = sibling->context_size;
4495 
4496 		ve->base.emit_bb_start = sibling->emit_bb_start;
4497 		ve->base.emit_flush = sibling->emit_flush;
4498 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4499 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4500 		ve->base.emit_fini_breadcrumb_dw =
4501 			sibling->emit_fini_breadcrumb_dw;
4502 
4503 		ve->base.flags = sibling->flags;
4504 	}
4505 
4506 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4507 
4508 	err = __execlists_context_alloc(&ve->context, siblings[0]);
4509 	if (err)
4510 		goto err_put;
4511 
4512 	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
4513 
4514 	return &ve->context;
4515 
4516 err_put:
4517 	intel_context_put(&ve->context);
4518 	return ERR_PTR(err);
4519 }
4520 
4521 struct intel_context *
4522 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
4523 			      struct intel_engine_cs *src)
4524 {
4525 	struct virtual_engine *se = to_virtual_engine(src);
4526 	struct intel_context *dst;
4527 
4528 	dst = intel_execlists_create_virtual(ctx,
4529 					     se->siblings,
4530 					     se->num_siblings);
4531 	if (IS_ERR(dst))
4532 		return dst;
4533 
4534 	if (se->num_bonds) {
4535 		struct virtual_engine *de = to_virtual_engine(dst->engine);
4536 
4537 		de->bonds = kmemdup(se->bonds,
4538 				    sizeof(*se->bonds) * se->num_bonds,
4539 				    GFP_KERNEL);
4540 		if (!de->bonds) {
4541 			intel_context_put(dst);
4542 			return ERR_PTR(-ENOMEM);
4543 		}
4544 
4545 		de->num_bonds = se->num_bonds;
4546 	}
4547 
4548 	return dst;
4549 }
4550 
4551 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4552 				     const struct intel_engine_cs *master,
4553 				     const struct intel_engine_cs *sibling)
4554 {
4555 	struct virtual_engine *ve = to_virtual_engine(engine);
4556 	struct ve_bond *bond;
4557 	int n;
4558 
4559 	/* Sanity check the sibling is part of the virtual engine */
4560 	for (n = 0; n < ve->num_siblings; n++)
4561 		if (sibling == ve->siblings[n])
4562 			break;
4563 	if (n == ve->num_siblings)
4564 		return -EINVAL;
4565 
4566 	bond = virtual_find_bond(ve, master);
4567 	if (bond) {
4568 		bond->sibling_mask |= sibling->mask;
4569 		return 0;
4570 	}
4571 
4572 	bond = krealloc(ve->bonds,
4573 			sizeof(*bond) * (ve->num_bonds + 1),
4574 			GFP_KERNEL);
4575 	if (!bond)
4576 		return -ENOMEM;
4577 
4578 	bond[ve->num_bonds].master = master;
4579 	bond[ve->num_bonds].sibling_mask = sibling->mask;
4580 
4581 	ve->bonds = bond;
4582 	ve->num_bonds++;
4583 
4584 	return 0;
4585 }
4586 
4587 struct intel_engine_cs *
4588 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4589 				 unsigned int sibling)
4590 {
4591 	struct virtual_engine *ve = to_virtual_engine(engine);
4592 
4593 	if (sibling >= ve->num_siblings)
4594 		return NULL;
4595 
4596 	return ve->siblings[sibling];
4597 }
4598 
4599 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4600 				   struct drm_printer *m,
4601 				   void (*show_request)(struct drm_printer *m,
4602 							struct i915_request *rq,
4603 							const char *prefix),
4604 				   unsigned int max)
4605 {
4606 	const struct intel_engine_execlists *execlists = &engine->execlists;
4607 	struct i915_request *rq, *last;
4608 	unsigned long flags;
4609 	unsigned int count;
4610 	struct rb_node *rb;
4611 
4612 	spin_lock_irqsave(&engine->active.lock, flags);
4613 
4614 	last = NULL;
4615 	count = 0;
4616 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
4617 		if (count++ < max - 1)
4618 			show_request(m, rq, "\t\tE ");
4619 		else
4620 			last = rq;
4621 	}
4622 	if (last) {
4623 		if (count > max) {
4624 			drm_printf(m,
4625 				   "\t\t...skipping %d executing requests...\n",
4626 				   count - max);
4627 		}
4628 		show_request(m, last, "\t\tE ");
4629 	}
4630 
4631 	last = NULL;
4632 	count = 0;
4633 	if (execlists->queue_priority_hint != INT_MIN)
4634 		drm_printf(m, "\t\tQueue priority hint: %d\n",
4635 			   execlists->queue_priority_hint);
4636 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4637 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4638 		int i;
4639 
4640 		priolist_for_each_request(rq, p, i) {
4641 			if (count++ < max - 1)
4642 				show_request(m, rq, "\t\tQ ");
4643 			else
4644 				last = rq;
4645 		}
4646 	}
4647 	if (last) {
4648 		if (count > max) {
4649 			drm_printf(m,
4650 				   "\t\t...skipping %d queued requests...\n",
4651 				   count - max);
4652 		}
4653 		show_request(m, last, "\t\tQ ");
4654 	}
4655 
4656 	last = NULL;
4657 	count = 0;
4658 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4659 		struct virtual_engine *ve =
4660 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4661 		struct i915_request *rq = READ_ONCE(ve->request);
4662 
4663 		if (rq) {
4664 			if (count++ < max - 1)
4665 				show_request(m, rq, "\t\tV ");
4666 			else
4667 				last = rq;
4668 		}
4669 	}
4670 	if (last) {
4671 		if (count > max) {
4672 			drm_printf(m,
4673 				   "\t\t...skipping %d virtual requests...\n",
4674 				   count - max);
4675 		}
4676 		show_request(m, last, "\t\tV ");
4677 	}
4678 
4679 	spin_unlock_irqrestore(&engine->active.lock, flags);
4680 }
4681 
4682 void intel_lr_context_reset(struct intel_engine_cs *engine,
4683 			    struct intel_context *ce,
4684 			    u32 head,
4685 			    bool scrub)
4686 {
4687 	GEM_BUG_ON(!intel_context_is_pinned(ce));
4688 
4689 	/*
4690 	 * We want a simple context + ring to execute the breadcrumb update.
4691 	 * We cannot rely on the context being intact across the GPU hang,
4692 	 * so clear it and rebuild just what we need for the breadcrumb.
4693 	 * All pending requests for this context will be zapped, and any
4694 	 * future request will be after userspace has had the opportunity
4695 	 * to recreate its own state.
4696 	 */
4697 	if (scrub)
4698 		restore_default_state(ce, engine);
4699 
4700 	/* Rerun the request; its payload has been neutered (if guilty). */
4701 	ce->ring->head = head;
4702 	intel_ring_update_space(ce->ring);
4703 
4704 	__execlists_update_reg_state(ce, engine);
4705 }
4706 
4707 bool
4708 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4709 {
4710 	return engine->set_default_submission ==
4711 	       intel_execlists_set_default_submission;
4712 }
4713 
4714 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4715 #include "selftest_lrc.c"
4716 #endif
4717