xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision a01822e94ee53e8ebc9632fe2764048b81921254)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a contexts is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bits submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one requests is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 
180 struct virtual_engine {
181 	struct intel_engine_cs base;
182 	struct intel_context context;
183 
184 	/*
185 	 * We allow only a single request through the virtual engine at a time
186 	 * (each request in the timeline waits for the completion fence of
187 	 * the previous before being submitted). By restricting ourselves to
188 	 * only submitting a single request, each request is placed on to a
189 	 * physical to maximise load spreading (by virtue of the late greedy
190 	 * scheduling -- each real engine takes the next available request
191 	 * upon idling).
192 	 */
193 	struct i915_request *request;
194 
195 	/*
196 	 * We keep a rbtree of available virtual engines inside each physical
197 	 * engine, sorted by priority. Here we preallocate the nodes we need
198 	 * for the virtual engine, indexed by physical_engine->id.
199 	 */
200 	struct ve_node {
201 		struct rb_node rb;
202 		int prio;
203 	} nodes[I915_NUM_ENGINES];
204 
205 	/*
206 	 * Keep track of bonded pairs -- restrictions upon on our selection
207 	 * of physical engines any particular request may be submitted to.
208 	 * If we receive a submit-fence from a master engine, we will only
209 	 * use one of sibling_mask physical engines.
210 	 */
211 	struct ve_bond {
212 		const struct intel_engine_cs *master;
213 		intel_engine_mask_t sibling_mask;
214 	} *bonds;
215 	unsigned int num_bonds;
216 
217 	/* And finally, which physical engines this virtual engine maps onto. */
218 	unsigned int num_siblings;
219 	struct intel_engine_cs *siblings[0];
220 };
221 
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
225 	return container_of(engine, struct virtual_engine, base);
226 }
227 
228 static int __execlists_context_alloc(struct intel_context *ce,
229 				     struct intel_engine_cs *engine);
230 
231 static void execlists_init_reg_state(u32 *reg_state,
232 				     const struct intel_context *ce,
233 				     const struct intel_engine_cs *engine,
234 				     const struct intel_ring *ring,
235 				     bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238 			     const struct intel_engine_cs *engine,
239 			     u32 head);
240 
241 static void mark_eio(struct i915_request *rq)
242 {
243 	if (i915_request_completed(rq))
244 		return;
245 
246 	GEM_BUG_ON(i915_request_signaled(rq));
247 
248 	i915_request_set_error_once(rq, -EIO);
249 	i915_request_mark_complete(rq);
250 }
251 
252 static struct i915_request *
253 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
254 {
255 	struct i915_request *active = rq;
256 
257 	rcu_read_lock();
258 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
259 		if (i915_request_completed(rq))
260 			break;
261 
262 		active = rq;
263 	}
264 	rcu_read_unlock();
265 
266 	return active;
267 }
268 
269 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
270 {
271 	return (i915_ggtt_offset(engine->status_page.vma) +
272 		I915_GEM_HWS_PREEMPT_ADDR);
273 }
274 
275 static inline void
276 ring_set_paused(const struct intel_engine_cs *engine, int state)
277 {
278 	/*
279 	 * We inspect HWS_PREEMPT with a semaphore inside
280 	 * engine->emit_fini_breadcrumb. If the dword is true,
281 	 * the ring is paused as the semaphore will busywait
282 	 * until the dword is false.
283 	 */
284 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
285 	if (state)
286 		wmb();
287 }
288 
289 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
290 {
291 	return rb_entry(rb, struct i915_priolist, node);
292 }
293 
294 static inline int rq_prio(const struct i915_request *rq)
295 {
296 	return READ_ONCE(rq->sched.attr.priority);
297 }
298 
299 static int effective_prio(const struct i915_request *rq)
300 {
301 	int prio = rq_prio(rq);
302 
303 	/*
304 	 * If this request is special and must not be interrupted at any
305 	 * cost, so be it. Note we are only checking the most recent request
306 	 * in the context and so may be masking an earlier vip request. It
307 	 * is hoped that under the conditions where nopreempt is used, this
308 	 * will not matter (i.e. all requests to that context will be
309 	 * nopreempt for as long as desired).
310 	 */
311 	if (i915_request_has_nopreempt(rq))
312 		prio = I915_PRIORITY_UNPREEMPTABLE;
313 
314 	/*
315 	 * On unwinding the active request, we give it a priority bump
316 	 * if it has completed waiting on any semaphore. If we know that
317 	 * the request has already started, we can prevent an unwanted
318 	 * preempt-to-idle cycle by taking that into account now.
319 	 */
320 	if (__i915_request_has_started(rq))
321 		prio |= I915_PRIORITY_NOSEMAPHORE;
322 
323 	/* Restrict mere WAIT boosts from triggering preemption */
324 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
325 	return prio | __NO_PREEMPTION;
326 }
327 
328 static int queue_prio(const struct intel_engine_execlists *execlists)
329 {
330 	struct i915_priolist *p;
331 	struct rb_node *rb;
332 
333 	rb = rb_first_cached(&execlists->queue);
334 	if (!rb)
335 		return INT_MIN;
336 
337 	/*
338 	 * As the priolist[] are inverted, with the highest priority in [0],
339 	 * we have to flip the index value to become priority.
340 	 */
341 	p = to_priolist(rb);
342 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
343 }
344 
345 static inline bool need_preempt(const struct intel_engine_cs *engine,
346 				const struct i915_request *rq,
347 				struct rb_node *rb)
348 {
349 	int last_prio;
350 
351 	if (!intel_engine_has_semaphores(engine))
352 		return false;
353 
354 	/*
355 	 * Check if the current priority hint merits a preemption attempt.
356 	 *
357 	 * We record the highest value priority we saw during rescheduling
358 	 * prior to this dequeue, therefore we know that if it is strictly
359 	 * less than the current tail of ESLP[0], we do not need to force
360 	 * a preempt-to-idle cycle.
361 	 *
362 	 * However, the priority hint is a mere hint that we may need to
363 	 * preempt. If that hint is stale or we may be trying to preempt
364 	 * ourselves, ignore the request.
365 	 *
366 	 * More naturally we would write
367 	 *      prio >= max(0, last);
368 	 * except that we wish to prevent triggering preemption at the same
369 	 * priority level: the task that is running should remain running
370 	 * to preserve FIFO ordering of dependencies.
371 	 */
372 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
373 	if (engine->execlists.queue_priority_hint <= last_prio)
374 		return false;
375 
376 	/*
377 	 * Check against the first request in ELSP[1], it will, thanks to the
378 	 * power of PI, be the highest priority of that context.
379 	 */
380 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
381 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
382 		return true;
383 
384 	if (rb) {
385 		struct virtual_engine *ve =
386 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
387 		bool preempt = false;
388 
389 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
390 			struct i915_request *next;
391 
392 			rcu_read_lock();
393 			next = READ_ONCE(ve->request);
394 			if (next)
395 				preempt = rq_prio(next) > last_prio;
396 			rcu_read_unlock();
397 		}
398 
399 		if (preempt)
400 			return preempt;
401 	}
402 
403 	/*
404 	 * If the inflight context did not trigger the preemption, then maybe
405 	 * it was the set of queued requests? Pick the highest priority in
406 	 * the queue (the first active priolist) and see if it deserves to be
407 	 * running instead of ELSP[0].
408 	 *
409 	 * The highest priority request in the queue can not be either
410 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
411 	 * context, it's priority would not exceed ELSP[0] aka last_prio.
412 	 */
413 	return queue_prio(&engine->execlists) > last_prio;
414 }
415 
416 __maybe_unused static inline bool
417 assert_priority_queue(const struct i915_request *prev,
418 		      const struct i915_request *next)
419 {
420 	/*
421 	 * Without preemption, the prev may refer to the still active element
422 	 * which we refuse to let go.
423 	 *
424 	 * Even with preemption, there are times when we think it is better not
425 	 * to preempt and leave an ostensibly lower priority request in flight.
426 	 */
427 	if (i915_request_is_active(prev))
428 		return true;
429 
430 	return rq_prio(prev) >= rq_prio(next);
431 }
432 
433 /*
434  * The context descriptor encodes various attributes of a context,
435  * including its GTT address and some flags. Because it's fairly
436  * expensive to calculate, we'll just do it once and cache the result,
437  * which remains valid until the context is unpinned.
438  *
439  * This is what a descriptor looks like, from LSB to MSB::
440  *
441  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
442  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
443  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
444  *      bits 53-54:    mbz, reserved for use by hardware
445  *      bits 55-63:    group ID, currently unused and set to 0
446  *
447  * Starting from Gen11, the upper dword of the descriptor has a new format:
448  *
449  *      bits 32-36:    reserved
450  *      bits 37-47:    SW context ID
451  *      bits 48:53:    engine instance
452  *      bit 54:        mbz, reserved for use by hardware
453  *      bits 55-60:    SW counter
454  *      bits 61-63:    engine class
455  *
456  * engine info, SW context ID and SW counter need to form a unique number
457  * (Context ID) per lrc.
458  */
459 static u32
460 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
461 {
462 	u32 desc;
463 
464 	desc = INTEL_LEGACY_32B_CONTEXT;
465 	if (i915_vm_is_4lvl(ce->vm))
466 		desc = INTEL_LEGACY_64B_CONTEXT;
467 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
468 
469 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
470 	if (IS_GEN(engine->i915, 8))
471 		desc |= GEN8_CTX_L3LLC_COHERENT;
472 
473 	return i915_ggtt_offset(ce->state) | desc;
474 }
475 
476 static inline unsigned int dword_in_page(void *addr)
477 {
478 	return offset_in_page(addr) / sizeof(u32);
479 }
480 
481 static void set_offsets(u32 *regs,
482 			const u8 *data,
483 			const struct intel_engine_cs *engine,
484 			bool clear)
485 #define NOP(x) (BIT(7) | (x))
486 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
487 #define POSTED BIT(0)
488 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
489 #define REG16(x) \
490 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
491 	(((x) >> 2) & 0x7f)
492 #define END(x) 0, (x)
493 {
494 	const u32 base = engine->mmio_base;
495 
496 	while (*data) {
497 		u8 count, flags;
498 
499 		if (*data & BIT(7)) { /* skip */
500 			count = *data++ & ~BIT(7);
501 			if (clear)
502 				memset32(regs, MI_NOOP, count);
503 			regs += count;
504 			continue;
505 		}
506 
507 		count = *data & 0x3f;
508 		flags = *data >> 6;
509 		data++;
510 
511 		*regs = MI_LOAD_REGISTER_IMM(count);
512 		if (flags & POSTED)
513 			*regs |= MI_LRI_FORCE_POSTED;
514 		if (INTEL_GEN(engine->i915) >= 11)
515 			*regs |= MI_LRI_CS_MMIO;
516 		regs++;
517 
518 		GEM_BUG_ON(!count);
519 		do {
520 			u32 offset = 0;
521 			u8 v;
522 
523 			do {
524 				v = *data++;
525 				offset <<= 7;
526 				offset |= v & ~BIT(7);
527 			} while (v & BIT(7));
528 
529 			regs[0] = base + (offset << 2);
530 			if (clear)
531 				regs[1] = 0;
532 			regs += 2;
533 		} while (--count);
534 	}
535 
536 	if (clear) {
537 		u8 count = *++data;
538 
539 		/* Clear past the tail for HW access */
540 		GEM_BUG_ON(dword_in_page(regs) > count);
541 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
542 
543 		/* Close the batch; used mainly by live_lrc_layout() */
544 		*regs = MI_BATCH_BUFFER_END;
545 		if (INTEL_GEN(engine->i915) >= 10)
546 			*regs |= BIT(0);
547 	}
548 }
549 
550 static const u8 gen8_xcs_offsets[] = {
551 	NOP(1),
552 	LRI(11, 0),
553 	REG16(0x244),
554 	REG(0x034),
555 	REG(0x030),
556 	REG(0x038),
557 	REG(0x03c),
558 	REG(0x168),
559 	REG(0x140),
560 	REG(0x110),
561 	REG(0x11c),
562 	REG(0x114),
563 	REG(0x118),
564 
565 	NOP(9),
566 	LRI(9, 0),
567 	REG16(0x3a8),
568 	REG16(0x28c),
569 	REG16(0x288),
570 	REG16(0x284),
571 	REG16(0x280),
572 	REG16(0x27c),
573 	REG16(0x278),
574 	REG16(0x274),
575 	REG16(0x270),
576 
577 	NOP(13),
578 	LRI(2, 0),
579 	REG16(0x200),
580 	REG(0x028),
581 
582 	END(80)
583 };
584 
585 static const u8 gen9_xcs_offsets[] = {
586 	NOP(1),
587 	LRI(14, POSTED),
588 	REG16(0x244),
589 	REG(0x034),
590 	REG(0x030),
591 	REG(0x038),
592 	REG(0x03c),
593 	REG(0x168),
594 	REG(0x140),
595 	REG(0x110),
596 	REG(0x11c),
597 	REG(0x114),
598 	REG(0x118),
599 	REG(0x1c0),
600 	REG(0x1c4),
601 	REG(0x1c8),
602 
603 	NOP(3),
604 	LRI(9, POSTED),
605 	REG16(0x3a8),
606 	REG16(0x28c),
607 	REG16(0x288),
608 	REG16(0x284),
609 	REG16(0x280),
610 	REG16(0x27c),
611 	REG16(0x278),
612 	REG16(0x274),
613 	REG16(0x270),
614 
615 	NOP(13),
616 	LRI(1, POSTED),
617 	REG16(0x200),
618 
619 	NOP(13),
620 	LRI(44, POSTED),
621 	REG(0x028),
622 	REG(0x09c),
623 	REG(0x0c0),
624 	REG(0x178),
625 	REG(0x17c),
626 	REG16(0x358),
627 	REG(0x170),
628 	REG(0x150),
629 	REG(0x154),
630 	REG(0x158),
631 	REG16(0x41c),
632 	REG16(0x600),
633 	REG16(0x604),
634 	REG16(0x608),
635 	REG16(0x60c),
636 	REG16(0x610),
637 	REG16(0x614),
638 	REG16(0x618),
639 	REG16(0x61c),
640 	REG16(0x620),
641 	REG16(0x624),
642 	REG16(0x628),
643 	REG16(0x62c),
644 	REG16(0x630),
645 	REG16(0x634),
646 	REG16(0x638),
647 	REG16(0x63c),
648 	REG16(0x640),
649 	REG16(0x644),
650 	REG16(0x648),
651 	REG16(0x64c),
652 	REG16(0x650),
653 	REG16(0x654),
654 	REG16(0x658),
655 	REG16(0x65c),
656 	REG16(0x660),
657 	REG16(0x664),
658 	REG16(0x668),
659 	REG16(0x66c),
660 	REG16(0x670),
661 	REG16(0x674),
662 	REG16(0x678),
663 	REG16(0x67c),
664 	REG(0x068),
665 
666 	END(176)
667 };
668 
669 static const u8 gen12_xcs_offsets[] = {
670 	NOP(1),
671 	LRI(13, POSTED),
672 	REG16(0x244),
673 	REG(0x034),
674 	REG(0x030),
675 	REG(0x038),
676 	REG(0x03c),
677 	REG(0x168),
678 	REG(0x140),
679 	REG(0x110),
680 	REG(0x1c0),
681 	REG(0x1c4),
682 	REG(0x1c8),
683 	REG(0x180),
684 	REG16(0x2b4),
685 
686 	NOP(5),
687 	LRI(9, POSTED),
688 	REG16(0x3a8),
689 	REG16(0x28c),
690 	REG16(0x288),
691 	REG16(0x284),
692 	REG16(0x280),
693 	REG16(0x27c),
694 	REG16(0x278),
695 	REG16(0x274),
696 	REG16(0x270),
697 
698 	END(80)
699 };
700 
701 static const u8 gen8_rcs_offsets[] = {
702 	NOP(1),
703 	LRI(14, POSTED),
704 	REG16(0x244),
705 	REG(0x034),
706 	REG(0x030),
707 	REG(0x038),
708 	REG(0x03c),
709 	REG(0x168),
710 	REG(0x140),
711 	REG(0x110),
712 	REG(0x11c),
713 	REG(0x114),
714 	REG(0x118),
715 	REG(0x1c0),
716 	REG(0x1c4),
717 	REG(0x1c8),
718 
719 	NOP(3),
720 	LRI(9, POSTED),
721 	REG16(0x3a8),
722 	REG16(0x28c),
723 	REG16(0x288),
724 	REG16(0x284),
725 	REG16(0x280),
726 	REG16(0x27c),
727 	REG16(0x278),
728 	REG16(0x274),
729 	REG16(0x270),
730 
731 	NOP(13),
732 	LRI(1, 0),
733 	REG(0x0c8),
734 
735 	END(80)
736 };
737 
738 static const u8 gen9_rcs_offsets[] = {
739 	NOP(1),
740 	LRI(14, POSTED),
741 	REG16(0x244),
742 	REG(0x34),
743 	REG(0x30),
744 	REG(0x38),
745 	REG(0x3c),
746 	REG(0x168),
747 	REG(0x140),
748 	REG(0x110),
749 	REG(0x11c),
750 	REG(0x114),
751 	REG(0x118),
752 	REG(0x1c0),
753 	REG(0x1c4),
754 	REG(0x1c8),
755 
756 	NOP(3),
757 	LRI(9, POSTED),
758 	REG16(0x3a8),
759 	REG16(0x28c),
760 	REG16(0x288),
761 	REG16(0x284),
762 	REG16(0x280),
763 	REG16(0x27c),
764 	REG16(0x278),
765 	REG16(0x274),
766 	REG16(0x270),
767 
768 	NOP(13),
769 	LRI(1, 0),
770 	REG(0xc8),
771 
772 	NOP(13),
773 	LRI(44, POSTED),
774 	REG(0x28),
775 	REG(0x9c),
776 	REG(0xc0),
777 	REG(0x178),
778 	REG(0x17c),
779 	REG16(0x358),
780 	REG(0x170),
781 	REG(0x150),
782 	REG(0x154),
783 	REG(0x158),
784 	REG16(0x41c),
785 	REG16(0x600),
786 	REG16(0x604),
787 	REG16(0x608),
788 	REG16(0x60c),
789 	REG16(0x610),
790 	REG16(0x614),
791 	REG16(0x618),
792 	REG16(0x61c),
793 	REG16(0x620),
794 	REG16(0x624),
795 	REG16(0x628),
796 	REG16(0x62c),
797 	REG16(0x630),
798 	REG16(0x634),
799 	REG16(0x638),
800 	REG16(0x63c),
801 	REG16(0x640),
802 	REG16(0x644),
803 	REG16(0x648),
804 	REG16(0x64c),
805 	REG16(0x650),
806 	REG16(0x654),
807 	REG16(0x658),
808 	REG16(0x65c),
809 	REG16(0x660),
810 	REG16(0x664),
811 	REG16(0x668),
812 	REG16(0x66c),
813 	REG16(0x670),
814 	REG16(0x674),
815 	REG16(0x678),
816 	REG16(0x67c),
817 	REG(0x68),
818 
819 	END(176)
820 };
821 
822 static const u8 gen11_rcs_offsets[] = {
823 	NOP(1),
824 	LRI(15, POSTED),
825 	REG16(0x244),
826 	REG(0x034),
827 	REG(0x030),
828 	REG(0x038),
829 	REG(0x03c),
830 	REG(0x168),
831 	REG(0x140),
832 	REG(0x110),
833 	REG(0x11c),
834 	REG(0x114),
835 	REG(0x118),
836 	REG(0x1c0),
837 	REG(0x1c4),
838 	REG(0x1c8),
839 	REG(0x180),
840 
841 	NOP(1),
842 	LRI(9, POSTED),
843 	REG16(0x3a8),
844 	REG16(0x28c),
845 	REG16(0x288),
846 	REG16(0x284),
847 	REG16(0x280),
848 	REG16(0x27c),
849 	REG16(0x278),
850 	REG16(0x274),
851 	REG16(0x270),
852 
853 	LRI(1, POSTED),
854 	REG(0x1b0),
855 
856 	NOP(10),
857 	LRI(1, 0),
858 	REG(0x0c8),
859 
860 	END(80)
861 };
862 
863 static const u8 gen12_rcs_offsets[] = {
864 	NOP(1),
865 	LRI(13, POSTED),
866 	REG16(0x244),
867 	REG(0x034),
868 	REG(0x030),
869 	REG(0x038),
870 	REG(0x03c),
871 	REG(0x168),
872 	REG(0x140),
873 	REG(0x110),
874 	REG(0x1c0),
875 	REG(0x1c4),
876 	REG(0x1c8),
877 	REG(0x180),
878 	REG16(0x2b4),
879 
880 	NOP(5),
881 	LRI(9, POSTED),
882 	REG16(0x3a8),
883 	REG16(0x28c),
884 	REG16(0x288),
885 	REG16(0x284),
886 	REG16(0x280),
887 	REG16(0x27c),
888 	REG16(0x278),
889 	REG16(0x274),
890 	REG16(0x270),
891 
892 	LRI(3, POSTED),
893 	REG(0x1b0),
894 	REG16(0x5a8),
895 	REG16(0x5ac),
896 
897 	NOP(6),
898 	LRI(1, 0),
899 	REG(0x0c8),
900 
901 	END(80)
902 };
903 
904 #undef END
905 #undef REG16
906 #undef REG
907 #undef LRI
908 #undef NOP
909 
910 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
911 {
912 	/*
913 	 * The gen12+ lists only have the registers we program in the basic
914 	 * default state. We rely on the context image using relative
915 	 * addressing to automatic fixup the register state between the
916 	 * physical engines for virtual engine.
917 	 */
918 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
919 		   !intel_engine_has_relative_mmio(engine));
920 
921 	if (engine->class == RENDER_CLASS) {
922 		if (INTEL_GEN(engine->i915) >= 12)
923 			return gen12_rcs_offsets;
924 		else if (INTEL_GEN(engine->i915) >= 11)
925 			return gen11_rcs_offsets;
926 		else if (INTEL_GEN(engine->i915) >= 9)
927 			return gen9_rcs_offsets;
928 		else
929 			return gen8_rcs_offsets;
930 	} else {
931 		if (INTEL_GEN(engine->i915) >= 12)
932 			return gen12_xcs_offsets;
933 		else if (INTEL_GEN(engine->i915) >= 9)
934 			return gen9_xcs_offsets;
935 		else
936 			return gen8_xcs_offsets;
937 	}
938 }
939 
940 static struct i915_request *
941 __unwind_incomplete_requests(struct intel_engine_cs *engine)
942 {
943 	struct i915_request *rq, *rn, *active = NULL;
944 	struct list_head *uninitialized_var(pl);
945 	int prio = I915_PRIORITY_INVALID;
946 
947 	lockdep_assert_held(&engine->active.lock);
948 
949 	list_for_each_entry_safe_reverse(rq, rn,
950 					 &engine->active.requests,
951 					 sched.link) {
952 		if (i915_request_completed(rq))
953 			continue; /* XXX */
954 
955 		__i915_request_unsubmit(rq);
956 
957 		/*
958 		 * Push the request back into the queue for later resubmission.
959 		 * If this request is not native to this physical engine (i.e.
960 		 * it came from a virtual source), push it back onto the virtual
961 		 * engine so that it can be moved across onto another physical
962 		 * engine as load dictates.
963 		 */
964 		if (likely(rq->execution_mask == engine->mask)) {
965 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
966 			if (rq_prio(rq) != prio) {
967 				prio = rq_prio(rq);
968 				pl = i915_sched_lookup_priolist(engine, prio);
969 			}
970 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
971 
972 			list_move(&rq->sched.link, pl);
973 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
974 
975 			active = rq;
976 		} else {
977 			struct intel_engine_cs *owner = rq->context->engine;
978 
979 			/*
980 			 * Decouple the virtual breadcrumb before moving it
981 			 * back to the virtual engine -- we don't want the
982 			 * request to complete in the background and try
983 			 * and cancel the breadcrumb on the virtual engine
984 			 * (instead of the old engine where it is linked)!
985 			 */
986 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
987 				     &rq->fence.flags)) {
988 				spin_lock_nested(&rq->lock,
989 						 SINGLE_DEPTH_NESTING);
990 				i915_request_cancel_breadcrumb(rq);
991 				spin_unlock(&rq->lock);
992 			}
993 			WRITE_ONCE(rq->engine, owner);
994 			owner->submit_request(rq);
995 			active = NULL;
996 		}
997 	}
998 
999 	return active;
1000 }
1001 
1002 struct i915_request *
1003 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1004 {
1005 	struct intel_engine_cs *engine =
1006 		container_of(execlists, typeof(*engine), execlists);
1007 
1008 	return __unwind_incomplete_requests(engine);
1009 }
1010 
1011 static inline void
1012 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1013 {
1014 	/*
1015 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1016 	 * The compiler should eliminate this function as dead-code.
1017 	 */
1018 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1019 		return;
1020 
1021 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1022 				   status, rq);
1023 }
1024 
1025 static void intel_engine_context_in(struct intel_engine_cs *engine)
1026 {
1027 	unsigned long flags;
1028 
1029 	if (READ_ONCE(engine->stats.enabled) == 0)
1030 		return;
1031 
1032 	write_seqlock_irqsave(&engine->stats.lock, flags);
1033 
1034 	if (engine->stats.enabled > 0) {
1035 		if (engine->stats.active++ == 0)
1036 			engine->stats.start = ktime_get();
1037 		GEM_BUG_ON(engine->stats.active == 0);
1038 	}
1039 
1040 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1041 }
1042 
1043 static void intel_engine_context_out(struct intel_engine_cs *engine)
1044 {
1045 	unsigned long flags;
1046 
1047 	if (READ_ONCE(engine->stats.enabled) == 0)
1048 		return;
1049 
1050 	write_seqlock_irqsave(&engine->stats.lock, flags);
1051 
1052 	if (engine->stats.enabled > 0) {
1053 		ktime_t last;
1054 
1055 		if (engine->stats.active && --engine->stats.active == 0) {
1056 			/*
1057 			 * Decrement the active context count and in case GPU
1058 			 * is now idle add up to the running total.
1059 			 */
1060 			last = ktime_sub(ktime_get(), engine->stats.start);
1061 
1062 			engine->stats.total = ktime_add(engine->stats.total,
1063 							last);
1064 		} else if (engine->stats.active == 0) {
1065 			/*
1066 			 * After turning on engine stats, context out might be
1067 			 * the first event in which case we account from the
1068 			 * time stats gathering was turned on.
1069 			 */
1070 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1071 
1072 			engine->stats.total = ktime_add(engine->stats.total,
1073 							last);
1074 		}
1075 	}
1076 
1077 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1078 }
1079 
1080 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1081 {
1082 	if (INTEL_GEN(engine->i915) >= 12)
1083 		return 0x60;
1084 	else if (INTEL_GEN(engine->i915) >= 9)
1085 		return 0x54;
1086 	else if (engine->class == RENDER_CLASS)
1087 		return 0x58;
1088 	else
1089 		return -1;
1090 }
1091 
1092 static void
1093 execlists_check_context(const struct intel_context *ce,
1094 			const struct intel_engine_cs *engine)
1095 {
1096 	const struct intel_ring *ring = ce->ring;
1097 	u32 *regs = ce->lrc_reg_state;
1098 	bool valid = true;
1099 	int x;
1100 
1101 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1102 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1103 		       engine->name,
1104 		       regs[CTX_RING_START],
1105 		       i915_ggtt_offset(ring->vma));
1106 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1107 		valid = false;
1108 	}
1109 
1110 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1111 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1112 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1113 		       engine->name,
1114 		       regs[CTX_RING_CTL],
1115 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1116 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1117 		valid = false;
1118 	}
1119 
1120 	x = lrc_ring_mi_mode(engine);
1121 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1122 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1123 		       engine->name, regs[x + 1]);
1124 		regs[x + 1] &= ~STOP_RING;
1125 		regs[x + 1] |= STOP_RING << 16;
1126 		valid = false;
1127 	}
1128 
1129 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1130 }
1131 
1132 static void restore_default_state(struct intel_context *ce,
1133 				  struct intel_engine_cs *engine)
1134 {
1135 	u32 *regs = ce->lrc_reg_state;
1136 
1137 	if (engine->pinned_default_state)
1138 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1139 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1140 		       engine->context_size - PAGE_SIZE);
1141 
1142 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1143 }
1144 
1145 static void reset_active(struct i915_request *rq,
1146 			 struct intel_engine_cs *engine)
1147 {
1148 	struct intel_context * const ce = rq->context;
1149 	u32 head;
1150 
1151 	/*
1152 	 * The executing context has been cancelled. We want to prevent
1153 	 * further execution along this context and propagate the error on
1154 	 * to anything depending on its results.
1155 	 *
1156 	 * In __i915_request_submit(), we apply the -EIO and remove the
1157 	 * requests' payloads for any banned requests. But first, we must
1158 	 * rewind the context back to the start of the incomplete request so
1159 	 * that we do not jump back into the middle of the batch.
1160 	 *
1161 	 * We preserve the breadcrumbs and semaphores of the incomplete
1162 	 * requests so that inter-timeline dependencies (i.e other timelines)
1163 	 * remain correctly ordered. And we defer to __i915_request_submit()
1164 	 * so that all asynchronous waits are correctly handled.
1165 	 */
1166 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1167 		     rq->fence.context, rq->fence.seqno);
1168 
1169 	/* On resubmission of the active request, payload will be scrubbed */
1170 	if (i915_request_completed(rq))
1171 		head = rq->tail;
1172 	else
1173 		head = active_request(ce->timeline, rq)->head;
1174 	head = intel_ring_wrap(ce->ring, head);
1175 
1176 	/* Scrub the context image to prevent replaying the previous batch */
1177 	restore_default_state(ce, engine);
1178 	__execlists_update_reg_state(ce, engine, head);
1179 
1180 	/* We've switched away, so this should be a no-op, but intent matters */
1181 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1182 }
1183 
1184 static u32 intel_context_get_runtime(const struct intel_context *ce)
1185 {
1186 	/*
1187 	 * We can use either ppHWSP[16] which is recorded before the context
1188 	 * switch (and so excludes the cost of context switches) or use the
1189 	 * value from the context image itself, which is saved/restored earlier
1190 	 * and so includes the cost of the save.
1191 	 */
1192 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1193 }
1194 
1195 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1196 {
1197 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1198 	ce->runtime.num_underflow += dt < 0;
1199 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1200 #endif
1201 }
1202 
1203 static void intel_context_update_runtime(struct intel_context *ce)
1204 {
1205 	u32 old;
1206 	s32 dt;
1207 
1208 	if (intel_context_is_barrier(ce))
1209 		return;
1210 
1211 	old = ce->runtime.last;
1212 	ce->runtime.last = intel_context_get_runtime(ce);
1213 	dt = ce->runtime.last - old;
1214 
1215 	if (unlikely(dt <= 0)) {
1216 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1217 			 old, ce->runtime.last, dt);
1218 		st_update_runtime_underflow(ce, dt);
1219 		return;
1220 	}
1221 
1222 	ewma_runtime_add(&ce->runtime.avg, dt);
1223 	ce->runtime.total += dt;
1224 }
1225 
1226 static inline struct intel_engine_cs *
1227 __execlists_schedule_in(struct i915_request *rq)
1228 {
1229 	struct intel_engine_cs * const engine = rq->engine;
1230 	struct intel_context * const ce = rq->context;
1231 
1232 	intel_context_get(ce);
1233 
1234 	if (unlikely(intel_context_is_banned(ce)))
1235 		reset_active(rq, engine);
1236 
1237 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1238 		execlists_check_context(ce, engine);
1239 
1240 	if (ce->tag) {
1241 		/* Use a fixed tag for OA and friends */
1242 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1243 		ce->lrc.ccid = ce->tag;
1244 	} else {
1245 		/* We don't need a strict matching tag, just different values */
1246 		unsigned int tag = ffs(engine->context_tag);
1247 
1248 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1249 		clear_bit(tag - 1, &engine->context_tag);
1250 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1251 
1252 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1253 	}
1254 
1255 	ce->lrc.ccid |= engine->execlists.ccid;
1256 
1257 	__intel_gt_pm_get(engine->gt);
1258 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1259 	intel_engine_context_in(engine);
1260 
1261 	return engine;
1262 }
1263 
1264 static inline struct i915_request *
1265 execlists_schedule_in(struct i915_request *rq, int idx)
1266 {
1267 	struct intel_context * const ce = rq->context;
1268 	struct intel_engine_cs *old;
1269 
1270 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1271 	trace_i915_request_in(rq, idx);
1272 
1273 	old = READ_ONCE(ce->inflight);
1274 	do {
1275 		if (!old) {
1276 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1277 			break;
1278 		}
1279 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1280 
1281 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1282 	return i915_request_get(rq);
1283 }
1284 
1285 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1286 {
1287 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1288 	struct i915_request *next = READ_ONCE(ve->request);
1289 
1290 	if (next && next->execution_mask & ~rq->execution_mask)
1291 		tasklet_schedule(&ve->base.execlists.tasklet);
1292 }
1293 
1294 static inline void
1295 __execlists_schedule_out(struct i915_request *rq,
1296 			 struct intel_engine_cs * const engine,
1297 			 unsigned int ccid)
1298 {
1299 	struct intel_context * const ce = rq->context;
1300 
1301 	/*
1302 	 * NB process_csb() is not under the engine->active.lock and hence
1303 	 * schedule_out can race with schedule_in meaning that we should
1304 	 * refrain from doing non-trivial work here.
1305 	 */
1306 
1307 	/*
1308 	 * If we have just completed this context, the engine may now be
1309 	 * idle and we want to re-enter powersaving.
1310 	 */
1311 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1312 	    i915_request_completed(rq))
1313 		intel_engine_add_retire(engine, ce->timeline);
1314 
1315 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1316 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1317 	if (ccid < BITS_PER_LONG) {
1318 		GEM_BUG_ON(ccid == 0);
1319 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1320 		set_bit(ccid - 1, &engine->context_tag);
1321 	}
1322 
1323 	intel_context_update_runtime(ce);
1324 	intel_engine_context_out(engine);
1325 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1326 	intel_gt_pm_put_async(engine->gt);
1327 
1328 	/*
1329 	 * If this is part of a virtual engine, its next request may
1330 	 * have been blocked waiting for access to the active context.
1331 	 * We have to kick all the siblings again in case we need to
1332 	 * switch (e.g. the next request is not runnable on this
1333 	 * engine). Hopefully, we will already have submitted the next
1334 	 * request before the tasklet runs and do not need to rebuild
1335 	 * each virtual tree and kick everyone again.
1336 	 */
1337 	if (ce->engine != engine)
1338 		kick_siblings(rq, ce);
1339 
1340 	intel_context_put(ce);
1341 }
1342 
1343 static inline void
1344 execlists_schedule_out(struct i915_request *rq)
1345 {
1346 	struct intel_context * const ce = rq->context;
1347 	struct intel_engine_cs *cur, *old;
1348 	u32 ccid;
1349 
1350 	trace_i915_request_out(rq);
1351 
1352 	ccid = rq->context->lrc.ccid;
1353 	old = READ_ONCE(ce->inflight);
1354 	do
1355 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1356 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1357 	if (!cur)
1358 		__execlists_schedule_out(rq, old, ccid);
1359 
1360 	i915_request_put(rq);
1361 }
1362 
1363 static u64 execlists_update_context(struct i915_request *rq)
1364 {
1365 	struct intel_context *ce = rq->context;
1366 	u64 desc = ce->lrc.desc;
1367 	u32 tail, prev;
1368 
1369 	/*
1370 	 * WaIdleLiteRestore:bdw,skl
1371 	 *
1372 	 * We should never submit the context with the same RING_TAIL twice
1373 	 * just in case we submit an empty ring, which confuses the HW.
1374 	 *
1375 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1376 	 * the normal request to be able to always advance the RING_TAIL on
1377 	 * subsequent resubmissions (for lite restore). Should that fail us,
1378 	 * and we try and submit the same tail again, force the context
1379 	 * reload.
1380 	 *
1381 	 * If we need to return to a preempted context, we need to skip the
1382 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1383 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1384 	 * an earlier request.
1385 	 */
1386 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1387 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
1388 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1389 		desc |= CTX_DESC_FORCE_RESTORE;
1390 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1391 	rq->tail = rq->wa_tail;
1392 
1393 	/*
1394 	 * Make sure the context image is complete before we submit it to HW.
1395 	 *
1396 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1397 	 * an uncached write such as our mmio register access, the empirical
1398 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1399 	 * may not be visible to the HW prior to the completion of the UC
1400 	 * register write and that we may begin execution from the context
1401 	 * before its image is complete leading to invalid PD chasing.
1402 	 */
1403 	wmb();
1404 
1405 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1406 	return desc;
1407 }
1408 
1409 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1410 {
1411 	if (execlists->ctrl_reg) {
1412 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1413 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1414 	} else {
1415 		writel(upper_32_bits(desc), execlists->submit_reg);
1416 		writel(lower_32_bits(desc), execlists->submit_reg);
1417 	}
1418 }
1419 
1420 static __maybe_unused void
1421 trace_ports(const struct intel_engine_execlists *execlists,
1422 	    const char *msg,
1423 	    struct i915_request * const *ports)
1424 {
1425 	const struct intel_engine_cs *engine =
1426 		container_of(execlists, typeof(*engine), execlists);
1427 
1428 	if (!ports[0])
1429 		return;
1430 
1431 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1432 		     ports[0]->fence.context,
1433 		     ports[0]->fence.seqno,
1434 		     i915_request_completed(ports[0]) ? "!" :
1435 		     i915_request_started(ports[0]) ? "*" :
1436 		     "",
1437 		     ports[1] ? ports[1]->fence.context : 0,
1438 		     ports[1] ? ports[1]->fence.seqno : 0);
1439 }
1440 
1441 static inline bool
1442 reset_in_progress(const struct intel_engine_execlists *execlists)
1443 {
1444 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1445 }
1446 
1447 static __maybe_unused bool
1448 assert_pending_valid(const struct intel_engine_execlists *execlists,
1449 		     const char *msg)
1450 {
1451 	struct i915_request * const *port, *rq;
1452 	struct intel_context *ce = NULL;
1453 	bool sentinel = false;
1454 
1455 	trace_ports(execlists, msg, execlists->pending);
1456 
1457 	/* We may be messing around with the lists during reset, lalala */
1458 	if (reset_in_progress(execlists))
1459 		return true;
1460 
1461 	if (!execlists->pending[0]) {
1462 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1463 		return false;
1464 	}
1465 
1466 	if (execlists->pending[execlists_num_ports(execlists)]) {
1467 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1468 			      execlists_num_ports(execlists));
1469 		return false;
1470 	}
1471 
1472 	for (port = execlists->pending; (rq = *port); port++) {
1473 		unsigned long flags;
1474 		bool ok = true;
1475 
1476 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1477 		GEM_BUG_ON(!i915_request_is_active(rq));
1478 
1479 		if (ce == rq->context) {
1480 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1481 				      ce->timeline->fence_context,
1482 				      port - execlists->pending);
1483 			return false;
1484 		}
1485 		ce = rq->context;
1486 
1487 		/*
1488 		 * Sentinels are supposed to be lonely so they flush the
1489 		 * current exection off the HW. Check that they are the
1490 		 * only request in the pending submission.
1491 		 */
1492 		if (sentinel) {
1493 			GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1494 				      ce->timeline->fence_context,
1495 				      port - execlists->pending);
1496 			return false;
1497 		}
1498 
1499 		sentinel = i915_request_has_sentinel(rq);
1500 		if (sentinel && port != execlists->pending) {
1501 			GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1502 				      ce->timeline->fence_context,
1503 				      port - execlists->pending);
1504 			return false;
1505 		}
1506 
1507 		/* Hold tightly onto the lock to prevent concurrent retires! */
1508 		if (!spin_trylock_irqsave(&rq->lock, flags))
1509 			continue;
1510 
1511 		if (i915_request_completed(rq))
1512 			goto unlock;
1513 
1514 		if (i915_active_is_idle(&ce->active) &&
1515 		    !intel_context_is_barrier(ce)) {
1516 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1517 				      ce->timeline->fence_context,
1518 				      port - execlists->pending);
1519 			ok = false;
1520 			goto unlock;
1521 		}
1522 
1523 		if (!i915_vma_is_pinned(ce->state)) {
1524 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1525 				      ce->timeline->fence_context,
1526 				      port - execlists->pending);
1527 			ok = false;
1528 			goto unlock;
1529 		}
1530 
1531 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1532 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1533 				      ce->timeline->fence_context,
1534 				      port - execlists->pending);
1535 			ok = false;
1536 			goto unlock;
1537 		}
1538 
1539 unlock:
1540 		spin_unlock_irqrestore(&rq->lock, flags);
1541 		if (!ok)
1542 			return false;
1543 	}
1544 
1545 	return ce;
1546 }
1547 
1548 static void execlists_submit_ports(struct intel_engine_cs *engine)
1549 {
1550 	struct intel_engine_execlists *execlists = &engine->execlists;
1551 	unsigned int n;
1552 
1553 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1554 
1555 	/*
1556 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1557 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1558 	 * not be relinquished until the device is idle (see
1559 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1560 	 * that all ELSP are drained i.e. we have processed the CSB,
1561 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1562 	 */
1563 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1564 
1565 	/*
1566 	 * ELSQ note: the submit queue is not cleared after being submitted
1567 	 * to the HW so we need to make sure we always clean it up. This is
1568 	 * currently ensured by the fact that we always write the same number
1569 	 * of elsq entries, keep this in mind before changing the loop below.
1570 	 */
1571 	for (n = execlists_num_ports(execlists); n--; ) {
1572 		struct i915_request *rq = execlists->pending[n];
1573 
1574 		write_desc(execlists,
1575 			   rq ? execlists_update_context(rq) : 0,
1576 			   n);
1577 	}
1578 
1579 	/* we need to manually load the submit queue */
1580 	if (execlists->ctrl_reg)
1581 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1582 }
1583 
1584 static bool ctx_single_port_submission(const struct intel_context *ce)
1585 {
1586 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1587 		intel_context_force_single_submission(ce));
1588 }
1589 
1590 static bool can_merge_ctx(const struct intel_context *prev,
1591 			  const struct intel_context *next)
1592 {
1593 	if (prev != next)
1594 		return false;
1595 
1596 	if (ctx_single_port_submission(prev))
1597 		return false;
1598 
1599 	return true;
1600 }
1601 
1602 static unsigned long i915_request_flags(const struct i915_request *rq)
1603 {
1604 	return READ_ONCE(rq->fence.flags);
1605 }
1606 
1607 static bool can_merge_rq(const struct i915_request *prev,
1608 			 const struct i915_request *next)
1609 {
1610 	GEM_BUG_ON(prev == next);
1611 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1612 
1613 	/*
1614 	 * We do not submit known completed requests. Therefore if the next
1615 	 * request is already completed, we can pretend to merge it in
1616 	 * with the previous context (and we will skip updating the ELSP
1617 	 * and tracking). Thus hopefully keeping the ELSP full with active
1618 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1619 	 * us.
1620 	 */
1621 	if (i915_request_completed(next))
1622 		return true;
1623 
1624 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1625 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1626 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1627 		return false;
1628 
1629 	if (!can_merge_ctx(prev->context, next->context))
1630 		return false;
1631 
1632 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1633 	return true;
1634 }
1635 
1636 static void virtual_update_register_offsets(u32 *regs,
1637 					    struct intel_engine_cs *engine)
1638 {
1639 	set_offsets(regs, reg_offsets(engine), engine, false);
1640 }
1641 
1642 static bool virtual_matches(const struct virtual_engine *ve,
1643 			    const struct i915_request *rq,
1644 			    const struct intel_engine_cs *engine)
1645 {
1646 	const struct intel_engine_cs *inflight;
1647 
1648 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1649 		return false;
1650 
1651 	/*
1652 	 * We track when the HW has completed saving the context image
1653 	 * (i.e. when we have seen the final CS event switching out of
1654 	 * the context) and must not overwrite the context image before
1655 	 * then. This restricts us to only using the active engine
1656 	 * while the previous virtualized request is inflight (so
1657 	 * we reuse the register offsets). This is a very small
1658 	 * hystersis on the greedy seelction algorithm.
1659 	 */
1660 	inflight = intel_context_inflight(&ve->context);
1661 	if (inflight && inflight != engine)
1662 		return false;
1663 
1664 	return true;
1665 }
1666 
1667 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1668 				     struct i915_request *rq)
1669 {
1670 	struct intel_engine_cs *old = ve->siblings[0];
1671 
1672 	/* All unattached (rq->engine == old) must already be completed */
1673 
1674 	spin_lock(&old->breadcrumbs.irq_lock);
1675 	if (!list_empty(&ve->context.signal_link)) {
1676 		list_del_init(&ve->context.signal_link);
1677 
1678 		/*
1679 		 * We cannot acquire the new engine->breadcrumbs.irq_lock
1680 		 * (as we are holding a breadcrumbs.irq_lock already),
1681 		 * so attach this request to the signaler on submission.
1682 		 * The queued irq_work will occur when we finally drop
1683 		 * the engine->active.lock after dequeue.
1684 		 */
1685 		set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1686 
1687 		/* Also transfer the pending irq_work for the old breadcrumb. */
1688 		intel_engine_signal_breadcrumbs(rq->engine);
1689 	}
1690 	spin_unlock(&old->breadcrumbs.irq_lock);
1691 }
1692 
1693 #define for_each_waiter(p__, rq__) \
1694 	list_for_each_entry_lockless(p__, \
1695 				     &(rq__)->sched.waiters_list, \
1696 				     wait_link)
1697 
1698 #define for_each_signaler(p__, rq__) \
1699 	list_for_each_entry_rcu(p__, \
1700 				&(rq__)->sched.signalers_list, \
1701 				signal_link)
1702 
1703 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1704 {
1705 	LIST_HEAD(list);
1706 
1707 	/*
1708 	 * We want to move the interrupted request to the back of
1709 	 * the round-robin list (i.e. its priority level), but
1710 	 * in doing so, we must then move all requests that were in
1711 	 * flight and were waiting for the interrupted request to
1712 	 * be run after it again.
1713 	 */
1714 	do {
1715 		struct i915_dependency *p;
1716 
1717 		GEM_BUG_ON(i915_request_is_active(rq));
1718 		list_move_tail(&rq->sched.link, pl);
1719 
1720 		for_each_waiter(p, rq) {
1721 			struct i915_request *w =
1722 				container_of(p->waiter, typeof(*w), sched);
1723 
1724 			if (p->flags & I915_DEPENDENCY_WEAK)
1725 				continue;
1726 
1727 			/* Leave semaphores spinning on the other engines */
1728 			if (w->engine != rq->engine)
1729 				continue;
1730 
1731 			/* No waiter should start before its signaler */
1732 			GEM_BUG_ON(i915_request_started(w) &&
1733 				   !i915_request_completed(rq));
1734 
1735 			GEM_BUG_ON(i915_request_is_active(w));
1736 			if (!i915_request_is_ready(w))
1737 				continue;
1738 
1739 			if (rq_prio(w) < rq_prio(rq))
1740 				continue;
1741 
1742 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1743 			list_move_tail(&w->sched.link, &list);
1744 		}
1745 
1746 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1747 	} while (rq);
1748 }
1749 
1750 static void defer_active(struct intel_engine_cs *engine)
1751 {
1752 	struct i915_request *rq;
1753 
1754 	rq = __unwind_incomplete_requests(engine);
1755 	if (!rq)
1756 		return;
1757 
1758 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1759 }
1760 
1761 static bool
1762 need_timeslice(const struct intel_engine_cs *engine,
1763 	       const struct i915_request *rq)
1764 {
1765 	int hint;
1766 
1767 	if (!intel_engine_has_timeslices(engine))
1768 		return false;
1769 
1770 	hint = engine->execlists.queue_priority_hint;
1771 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1772 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1773 
1774 	return hint >= effective_prio(rq);
1775 }
1776 
1777 static bool
1778 timeslice_yield(const struct intel_engine_execlists *el,
1779 		const struct i915_request *rq)
1780 {
1781 	/*
1782 	 * Once bitten, forever smitten!
1783 	 *
1784 	 * If the active context ever busy-waited on a semaphore,
1785 	 * it will be treated as a hog until the end of its timeslice (i.e.
1786 	 * until it is scheduled out and replaced by a new submission,
1787 	 * possibly even its own lite-restore). The HW only sends an interrupt
1788 	 * on the first miss, and we do know if that semaphore has been
1789 	 * signaled, or even if it is now stuck on another semaphore. Play
1790 	 * safe, yield if it might be stuck -- it will be given a fresh
1791 	 * timeslice in the near future.
1792 	 */
1793 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1794 }
1795 
1796 static bool
1797 timeslice_expired(const struct intel_engine_execlists *el,
1798 		  const struct i915_request *rq)
1799 {
1800 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1801 }
1802 
1803 static int
1804 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1805 {
1806 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1807 		return INT_MIN;
1808 
1809 	return rq_prio(list_next_entry(rq, sched.link));
1810 }
1811 
1812 static inline unsigned long
1813 timeslice(const struct intel_engine_cs *engine)
1814 {
1815 	return READ_ONCE(engine->props.timeslice_duration_ms);
1816 }
1817 
1818 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1819 {
1820 	const struct intel_engine_execlists *execlists = &engine->execlists;
1821 	const struct i915_request *rq = *execlists->active;
1822 
1823 	if (!rq || i915_request_completed(rq))
1824 		return 0;
1825 
1826 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1827 		return 0;
1828 
1829 	return timeslice(engine);
1830 }
1831 
1832 static void set_timeslice(struct intel_engine_cs *engine)
1833 {
1834 	if (!intel_engine_has_timeslices(engine))
1835 		return;
1836 
1837 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1838 }
1839 
1840 static void start_timeslice(struct intel_engine_cs *engine)
1841 {
1842 	struct intel_engine_execlists *execlists = &engine->execlists;
1843 	int prio = queue_prio(execlists);
1844 
1845 	WRITE_ONCE(execlists->switch_priority_hint, prio);
1846 	if (prio == INT_MIN)
1847 		return;
1848 
1849 	if (timer_pending(&execlists->timer))
1850 		return;
1851 
1852 	set_timer_ms(&execlists->timer, timeslice(engine));
1853 }
1854 
1855 static void record_preemption(struct intel_engine_execlists *execlists)
1856 {
1857 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1858 }
1859 
1860 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1861 					    const struct i915_request *rq)
1862 {
1863 	if (!rq)
1864 		return 0;
1865 
1866 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1867 	if (unlikely(intel_context_is_banned(rq->context)))
1868 		return 1;
1869 
1870 	return READ_ONCE(engine->props.preempt_timeout_ms);
1871 }
1872 
1873 static void set_preempt_timeout(struct intel_engine_cs *engine,
1874 				const struct i915_request *rq)
1875 {
1876 	if (!intel_engine_has_preempt_reset(engine))
1877 		return;
1878 
1879 	set_timer_ms(&engine->execlists.preempt,
1880 		     active_preempt_timeout(engine, rq));
1881 }
1882 
1883 static inline void clear_ports(struct i915_request **ports, int count)
1884 {
1885 	memset_p((void **)ports, NULL, count);
1886 }
1887 
1888 static void execlists_dequeue(struct intel_engine_cs *engine)
1889 {
1890 	struct intel_engine_execlists * const execlists = &engine->execlists;
1891 	struct i915_request **port = execlists->pending;
1892 	struct i915_request ** const last_port = port + execlists->port_mask;
1893 	struct i915_request * const *active;
1894 	struct i915_request *last;
1895 	struct rb_node *rb;
1896 	bool submit = false;
1897 
1898 	/*
1899 	 * Hardware submission is through 2 ports. Conceptually each port
1900 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1901 	 * static for a context, and unique to each, so we only execute
1902 	 * requests belonging to a single context from each ring. RING_HEAD
1903 	 * is maintained by the CS in the context image, it marks the place
1904 	 * where it got up to last time, and through RING_TAIL we tell the CS
1905 	 * where we want to execute up to this time.
1906 	 *
1907 	 * In this list the requests are in order of execution. Consecutive
1908 	 * requests from the same context are adjacent in the ringbuffer. We
1909 	 * can combine these requests into a single RING_TAIL update:
1910 	 *
1911 	 *              RING_HEAD...req1...req2
1912 	 *                                    ^- RING_TAIL
1913 	 * since to execute req2 the CS must first execute req1.
1914 	 *
1915 	 * Our goal then is to point each port to the end of a consecutive
1916 	 * sequence of requests as being the most optimal (fewest wake ups
1917 	 * and context switches) submission.
1918 	 */
1919 
1920 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1921 		struct virtual_engine *ve =
1922 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1923 		struct i915_request *rq = READ_ONCE(ve->request);
1924 
1925 		if (!rq) { /* lazily cleanup after another engine handled rq */
1926 			rb_erase_cached(rb, &execlists->virtual);
1927 			RB_CLEAR_NODE(rb);
1928 			rb = rb_first_cached(&execlists->virtual);
1929 			continue;
1930 		}
1931 
1932 		if (!virtual_matches(ve, rq, engine)) {
1933 			rb = rb_next(rb);
1934 			continue;
1935 		}
1936 
1937 		break;
1938 	}
1939 
1940 	/*
1941 	 * If the queue is higher priority than the last
1942 	 * request in the currently active context, submit afresh.
1943 	 * We will resubmit again afterwards in case we need to split
1944 	 * the active context to interject the preemption request,
1945 	 * i.e. we will retrigger preemption following the ack in case
1946 	 * of trouble.
1947 	 */
1948 	active = READ_ONCE(execlists->active);
1949 	while ((last = *active) && i915_request_completed(last))
1950 		active++;
1951 
1952 	if (last) {
1953 		if (need_preempt(engine, last, rb)) {
1954 			ENGINE_TRACE(engine,
1955 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1956 				     last->fence.context,
1957 				     last->fence.seqno,
1958 				     last->sched.attr.priority,
1959 				     execlists->queue_priority_hint);
1960 			record_preemption(execlists);
1961 
1962 			/*
1963 			 * Don't let the RING_HEAD advance past the breadcrumb
1964 			 * as we unwind (and until we resubmit) so that we do
1965 			 * not accidentally tell it to go backwards.
1966 			 */
1967 			ring_set_paused(engine, 1);
1968 
1969 			/*
1970 			 * Note that we have not stopped the GPU at this point,
1971 			 * so we are unwinding the incomplete requests as they
1972 			 * remain inflight and so by the time we do complete
1973 			 * the preemption, some of the unwound requests may
1974 			 * complete!
1975 			 */
1976 			__unwind_incomplete_requests(engine);
1977 
1978 			last = NULL;
1979 		} else if (need_timeslice(engine, last) &&
1980 			   timeslice_expired(execlists, last)) {
1981 			ENGINE_TRACE(engine,
1982 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
1983 				     last->fence.context,
1984 				     last->fence.seqno,
1985 				     last->sched.attr.priority,
1986 				     execlists->queue_priority_hint,
1987 				     yesno(timeslice_yield(execlists, last)));
1988 
1989 			ring_set_paused(engine, 1);
1990 			defer_active(engine);
1991 
1992 			/*
1993 			 * Unlike for preemption, if we rewind and continue
1994 			 * executing the same context as previously active,
1995 			 * the order of execution will remain the same and
1996 			 * the tail will only advance. We do not need to
1997 			 * force a full context restore, as a lite-restore
1998 			 * is sufficient to resample the monotonic TAIL.
1999 			 *
2000 			 * If we switch to any other context, similarly we
2001 			 * will not rewind TAIL of current context, and
2002 			 * normal save/restore will preserve state and allow
2003 			 * us to later continue executing the same request.
2004 			 */
2005 			last = NULL;
2006 		} else {
2007 			/*
2008 			 * Otherwise if we already have a request pending
2009 			 * for execution after the current one, we can
2010 			 * just wait until the next CS event before
2011 			 * queuing more. In either case we will force a
2012 			 * lite-restore preemption event, but if we wait
2013 			 * we hopefully coalesce several updates into a single
2014 			 * submission.
2015 			 */
2016 			if (!list_is_last(&last->sched.link,
2017 					  &engine->active.requests)) {
2018 				/*
2019 				 * Even if ELSP[1] is occupied and not worthy
2020 				 * of timeslices, our queue might be.
2021 				 */
2022 				start_timeslice(engine);
2023 				return;
2024 			}
2025 		}
2026 	}
2027 
2028 	while (rb) { /* XXX virtual is always taking precedence */
2029 		struct virtual_engine *ve =
2030 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2031 		struct i915_request *rq;
2032 
2033 		spin_lock(&ve->base.active.lock);
2034 
2035 		rq = ve->request;
2036 		if (unlikely(!rq)) { /* lost the race to a sibling */
2037 			spin_unlock(&ve->base.active.lock);
2038 			rb_erase_cached(rb, &execlists->virtual);
2039 			RB_CLEAR_NODE(rb);
2040 			rb = rb_first_cached(&execlists->virtual);
2041 			continue;
2042 		}
2043 
2044 		GEM_BUG_ON(rq != ve->request);
2045 		GEM_BUG_ON(rq->engine != &ve->base);
2046 		GEM_BUG_ON(rq->context != &ve->context);
2047 
2048 		if (rq_prio(rq) >= queue_prio(execlists)) {
2049 			if (!virtual_matches(ve, rq, engine)) {
2050 				spin_unlock(&ve->base.active.lock);
2051 				rb = rb_next(rb);
2052 				continue;
2053 			}
2054 
2055 			if (last && !can_merge_rq(last, rq)) {
2056 				spin_unlock(&ve->base.active.lock);
2057 				start_timeslice(engine);
2058 				return; /* leave this for another sibling */
2059 			}
2060 
2061 			ENGINE_TRACE(engine,
2062 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2063 				     rq->fence.context,
2064 				     rq->fence.seqno,
2065 				     i915_request_completed(rq) ? "!" :
2066 				     i915_request_started(rq) ? "*" :
2067 				     "",
2068 				     yesno(engine != ve->siblings[0]));
2069 
2070 			WRITE_ONCE(ve->request, NULL);
2071 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2072 				   INT_MIN);
2073 			rb_erase_cached(rb, &execlists->virtual);
2074 			RB_CLEAR_NODE(rb);
2075 
2076 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2077 			WRITE_ONCE(rq->engine, engine);
2078 
2079 			if (engine != ve->siblings[0]) {
2080 				u32 *regs = ve->context.lrc_reg_state;
2081 				unsigned int n;
2082 
2083 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2084 
2085 				if (!intel_engine_has_relative_mmio(engine))
2086 					virtual_update_register_offsets(regs,
2087 									engine);
2088 
2089 				if (!list_empty(&ve->context.signals))
2090 					virtual_xfer_breadcrumbs(ve, rq);
2091 
2092 				/*
2093 				 * Move the bound engine to the top of the list
2094 				 * for future execution. We then kick this
2095 				 * tasklet first before checking others, so that
2096 				 * we preferentially reuse this set of bound
2097 				 * registers.
2098 				 */
2099 				for (n = 1; n < ve->num_siblings; n++) {
2100 					if (ve->siblings[n] == engine) {
2101 						swap(ve->siblings[n],
2102 						     ve->siblings[0]);
2103 						break;
2104 					}
2105 				}
2106 
2107 				GEM_BUG_ON(ve->siblings[0] != engine);
2108 			}
2109 
2110 			if (__i915_request_submit(rq)) {
2111 				submit = true;
2112 				last = rq;
2113 			}
2114 			i915_request_put(rq);
2115 
2116 			/*
2117 			 * Hmm, we have a bunch of virtual engine requests,
2118 			 * but the first one was already completed (thanks
2119 			 * preempt-to-busy!). Keep looking at the veng queue
2120 			 * until we have no more relevant requests (i.e.
2121 			 * the normal submit queue has higher priority).
2122 			 */
2123 			if (!submit) {
2124 				spin_unlock(&ve->base.active.lock);
2125 				rb = rb_first_cached(&execlists->virtual);
2126 				continue;
2127 			}
2128 		}
2129 
2130 		spin_unlock(&ve->base.active.lock);
2131 		break;
2132 	}
2133 
2134 	while ((rb = rb_first_cached(&execlists->queue))) {
2135 		struct i915_priolist *p = to_priolist(rb);
2136 		struct i915_request *rq, *rn;
2137 		int i;
2138 
2139 		priolist_for_each_request_consume(rq, rn, p, i) {
2140 			bool merge = true;
2141 
2142 			/*
2143 			 * Can we combine this request with the current port?
2144 			 * It has to be the same context/ringbuffer and not
2145 			 * have any exceptions (e.g. GVT saying never to
2146 			 * combine contexts).
2147 			 *
2148 			 * If we can combine the requests, we can execute both
2149 			 * by updating the RING_TAIL to point to the end of the
2150 			 * second request, and so we never need to tell the
2151 			 * hardware about the first.
2152 			 */
2153 			if (last && !can_merge_rq(last, rq)) {
2154 				/*
2155 				 * If we are on the second port and cannot
2156 				 * combine this request with the last, then we
2157 				 * are done.
2158 				 */
2159 				if (port == last_port)
2160 					goto done;
2161 
2162 				/*
2163 				 * We must not populate both ELSP[] with the
2164 				 * same LRCA, i.e. we must submit 2 different
2165 				 * contexts if we submit 2 ELSP.
2166 				 */
2167 				if (last->context == rq->context)
2168 					goto done;
2169 
2170 				if (i915_request_has_sentinel(last))
2171 					goto done;
2172 
2173 				/*
2174 				 * If GVT overrides us we only ever submit
2175 				 * port[0], leaving port[1] empty. Note that we
2176 				 * also have to be careful that we don't queue
2177 				 * the same context (even though a different
2178 				 * request) to the second port.
2179 				 */
2180 				if (ctx_single_port_submission(last->context) ||
2181 				    ctx_single_port_submission(rq->context))
2182 					goto done;
2183 
2184 				merge = false;
2185 			}
2186 
2187 			if (__i915_request_submit(rq)) {
2188 				if (!merge) {
2189 					*port = execlists_schedule_in(last, port - execlists->pending);
2190 					port++;
2191 					last = NULL;
2192 				}
2193 
2194 				GEM_BUG_ON(last &&
2195 					   !can_merge_ctx(last->context,
2196 							  rq->context));
2197 				GEM_BUG_ON(last &&
2198 					   i915_seqno_passed(last->fence.seqno,
2199 							     rq->fence.seqno));
2200 
2201 				submit = true;
2202 				last = rq;
2203 			}
2204 		}
2205 
2206 		rb_erase_cached(&p->node, &execlists->queue);
2207 		i915_priolist_free(p);
2208 	}
2209 
2210 done:
2211 	/*
2212 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2213 	 *
2214 	 * We choose the priority hint such that if we add a request of greater
2215 	 * priority than this, we kick the submission tasklet to decide on
2216 	 * the right order of submitting the requests to hardware. We must
2217 	 * also be prepared to reorder requests as they are in-flight on the
2218 	 * HW. We derive the priority hint then as the first "hole" in
2219 	 * the HW submission ports and if there are no available slots,
2220 	 * the priority of the lowest executing request, i.e. last.
2221 	 *
2222 	 * When we do receive a higher priority request ready to run from the
2223 	 * user, see queue_request(), the priority hint is bumped to that
2224 	 * request triggering preemption on the next dequeue (or subsequent
2225 	 * interrupt for secondary ports).
2226 	 */
2227 	execlists->queue_priority_hint = queue_prio(execlists);
2228 
2229 	if (submit) {
2230 		*port = execlists_schedule_in(last, port - execlists->pending);
2231 		execlists->switch_priority_hint =
2232 			switch_prio(engine, *execlists->pending);
2233 
2234 		/*
2235 		 * Skip if we ended up with exactly the same set of requests,
2236 		 * e.g. trying to timeslice a pair of ordered contexts
2237 		 */
2238 		if (!memcmp(active, execlists->pending,
2239 			    (port - execlists->pending + 1) * sizeof(*port))) {
2240 			do
2241 				execlists_schedule_out(fetch_and_zero(port));
2242 			while (port-- != execlists->pending);
2243 
2244 			goto skip_submit;
2245 		}
2246 		clear_ports(port + 1, last_port - port);
2247 
2248 		WRITE_ONCE(execlists->yield, -1);
2249 		execlists_submit_ports(engine);
2250 		set_preempt_timeout(engine, *active);
2251 	} else {
2252 skip_submit:
2253 		ring_set_paused(engine, 0);
2254 	}
2255 }
2256 
2257 static void
2258 cancel_port_requests(struct intel_engine_execlists * const execlists)
2259 {
2260 	struct i915_request * const *port;
2261 
2262 	for (port = execlists->pending; *port; port++)
2263 		execlists_schedule_out(*port);
2264 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2265 
2266 	/* Mark the end of active before we overwrite *active */
2267 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2268 		execlists_schedule_out(*port);
2269 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2270 
2271 	smp_wmb(); /* complete the seqlock for execlists_active() */
2272 	WRITE_ONCE(execlists->active, execlists->inflight);
2273 }
2274 
2275 static inline void
2276 invalidate_csb_entries(const u32 *first, const u32 *last)
2277 {
2278 	clflush((void *)first);
2279 	clflush((void *)last);
2280 }
2281 
2282 /*
2283  * Starting with Gen12, the status has a new format:
2284  *
2285  *     bit  0:     switched to new queue
2286  *     bit  1:     reserved
2287  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2288  *                 switch detail is set to "wait on semaphore"
2289  *     bits 3-5:   engine class
2290  *     bits 6-11:  engine instance
2291  *     bits 12-14: reserved
2292  *     bits 15-25: sw context id of the lrc the GT switched to
2293  *     bits 26-31: sw counter of the lrc the GT switched to
2294  *     bits 32-35: context switch detail
2295  *                  - 0: ctx complete
2296  *                  - 1: wait on sync flip
2297  *                  - 2: wait on vblank
2298  *                  - 3: wait on scanline
2299  *                  - 4: wait on semaphore
2300  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2301  *                       WAIT_FOR_EVENT)
2302  *     bit  36:    reserved
2303  *     bits 37-43: wait detail (for switch detail 1 to 4)
2304  *     bits 44-46: reserved
2305  *     bits 47-57: sw context id of the lrc the GT switched away from
2306  *     bits 58-63: sw counter of the lrc the GT switched away from
2307  */
2308 static inline bool
2309 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2310 {
2311 	u32 lower_dw = csb[0];
2312 	u32 upper_dw = csb[1];
2313 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2314 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2315 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2316 
2317 	/*
2318 	 * The context switch detail is not guaranteed to be 5 when a preemption
2319 	 * occurs, so we can't just check for that. The check below works for
2320 	 * all the cases we care about, including preemptions of WAIT
2321 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2322 	 * would require some extra handling, but we don't support that.
2323 	 */
2324 	if (!ctx_away_valid || new_queue) {
2325 		GEM_BUG_ON(!ctx_to_valid);
2326 		return true;
2327 	}
2328 
2329 	/*
2330 	 * switch detail = 5 is covered by the case above and we do not expect a
2331 	 * context switch on an unsuccessful wait instruction since we always
2332 	 * use polling mode.
2333 	 */
2334 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2335 	return false;
2336 }
2337 
2338 static inline bool
2339 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2340 {
2341 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2342 }
2343 
2344 static void process_csb(struct intel_engine_cs *engine)
2345 {
2346 	struct intel_engine_execlists * const execlists = &engine->execlists;
2347 	const u32 * const buf = execlists->csb_status;
2348 	const u8 num_entries = execlists->csb_size;
2349 	u8 head, tail;
2350 
2351 	/*
2352 	 * As we modify our execlists state tracking we require exclusive
2353 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2354 	 * and we assume that is only inside the reset paths and so serialised.
2355 	 */
2356 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2357 		   !reset_in_progress(execlists));
2358 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2359 
2360 	/*
2361 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2362 	 * When reading from the csb_write mmio register, we have to be
2363 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2364 	 * the low 4bits. As it happens we know the next 4bits are always
2365 	 * zero and so we can simply masked off the low u8 of the register
2366 	 * and treat it identically to reading from the HWSP (without having
2367 	 * to use explicit shifting and masking, and probably bifurcating
2368 	 * the code to handle the legacy mmio read).
2369 	 */
2370 	head = execlists->csb_head;
2371 	tail = READ_ONCE(*execlists->csb_write);
2372 	if (unlikely(head == tail))
2373 		return;
2374 
2375 	/*
2376 	 * Hopefully paired with a wmb() in HW!
2377 	 *
2378 	 * We must complete the read of the write pointer before any reads
2379 	 * from the CSB, so that we do not see stale values. Without an rmb
2380 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2381 	 * we perform the READ_ONCE(*csb_write).
2382 	 */
2383 	rmb();
2384 
2385 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2386 	do {
2387 		bool promote;
2388 
2389 		if (++head == num_entries)
2390 			head = 0;
2391 
2392 		/*
2393 		 * We are flying near dragons again.
2394 		 *
2395 		 * We hold a reference to the request in execlist_port[]
2396 		 * but no more than that. We are operating in softirq
2397 		 * context and so cannot hold any mutex or sleep. That
2398 		 * prevents us stopping the requests we are processing
2399 		 * in port[] from being retired simultaneously (the
2400 		 * breadcrumb will be complete before we see the
2401 		 * context-switch). As we only hold the reference to the
2402 		 * request, any pointer chasing underneath the request
2403 		 * is subject to a potential use-after-free. Thus we
2404 		 * store all of the bookkeeping within port[] as
2405 		 * required, and avoid using unguarded pointers beneath
2406 		 * request itself. The same applies to the atomic
2407 		 * status notifier.
2408 		 */
2409 
2410 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2411 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2412 
2413 		if (INTEL_GEN(engine->i915) >= 12)
2414 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2415 		else
2416 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2417 		if (promote) {
2418 			struct i915_request * const *old = execlists->active;
2419 
2420 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2421 
2422 			ring_set_paused(engine, 0);
2423 
2424 			/* Point active to the new ELSP; prevent overwriting */
2425 			WRITE_ONCE(execlists->active, execlists->pending);
2426 			smp_wmb(); /* notify execlists_active() */
2427 
2428 			/* cancel old inflight, prepare for switch */
2429 			trace_ports(execlists, "preempted", old);
2430 			while (*old)
2431 				execlists_schedule_out(*old++);
2432 
2433 			/* switch pending to inflight */
2434 			memcpy(execlists->inflight,
2435 			       execlists->pending,
2436 			       execlists_num_ports(execlists) *
2437 			       sizeof(*execlists->pending));
2438 			smp_wmb(); /* complete the seqlock */
2439 			WRITE_ONCE(execlists->active, execlists->inflight);
2440 
2441 			WRITE_ONCE(execlists->pending[0], NULL);
2442 		} else {
2443 			GEM_BUG_ON(!*execlists->active);
2444 
2445 			/* port0 completed, advanced to port1 */
2446 			trace_ports(execlists, "completed", execlists->active);
2447 
2448 			/*
2449 			 * We rely on the hardware being strongly
2450 			 * ordered, that the breadcrumb write is
2451 			 * coherent (visible from the CPU) before the
2452 			 * user interrupt and CSB is processed.
2453 			 */
2454 			if (GEM_SHOW_DEBUG() &&
2455 			    !i915_request_completed(*execlists->active) &&
2456 			    !reset_in_progress(execlists)) {
2457 				struct i915_request *rq __maybe_unused =
2458 					*execlists->active;
2459 				const u32 *regs __maybe_unused =
2460 					rq->context->lrc_reg_state;
2461 
2462 				ENGINE_TRACE(engine,
2463 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2464 					     ENGINE_READ(engine, RING_START),
2465 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2466 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2467 					     ENGINE_READ(engine, RING_CTL),
2468 					     ENGINE_READ(engine, RING_MI_MODE));
2469 				ENGINE_TRACE(engine,
2470 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2471 					     i915_ggtt_offset(rq->ring->vma),
2472 					     rq->head, rq->tail,
2473 					     rq->fence.context,
2474 					     lower_32_bits(rq->fence.seqno),
2475 					     hwsp_seqno(rq));
2476 				ENGINE_TRACE(engine,
2477 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2478 					     regs[CTX_RING_START],
2479 					     regs[CTX_RING_HEAD],
2480 					     regs[CTX_RING_TAIL]);
2481 
2482 				GEM_BUG_ON("context completed before request");
2483 			}
2484 
2485 			execlists_schedule_out(*execlists->active++);
2486 
2487 			GEM_BUG_ON(execlists->active - execlists->inflight >
2488 				   execlists_num_ports(execlists));
2489 		}
2490 	} while (head != tail);
2491 
2492 	execlists->csb_head = head;
2493 	set_timeslice(engine);
2494 
2495 	/*
2496 	 * Gen11 has proven to fail wrt global observation point between
2497 	 * entry and tail update, failing on the ordering and thus
2498 	 * we see an old entry in the context status buffer.
2499 	 *
2500 	 * Forcibly evict out entries for the next gpu csb update,
2501 	 * to increase the odds that we get a fresh entries with non
2502 	 * working hardware. The cost for doing so comes out mostly with
2503 	 * the wash as hardware, working or not, will need to do the
2504 	 * invalidation before.
2505 	 */
2506 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2507 }
2508 
2509 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2510 {
2511 	lockdep_assert_held(&engine->active.lock);
2512 	if (!READ_ONCE(engine->execlists.pending[0])) {
2513 		rcu_read_lock(); /* protect peeking at execlists->active */
2514 		execlists_dequeue(engine);
2515 		rcu_read_unlock();
2516 	}
2517 }
2518 
2519 static void __execlists_hold(struct i915_request *rq)
2520 {
2521 	LIST_HEAD(list);
2522 
2523 	do {
2524 		struct i915_dependency *p;
2525 
2526 		if (i915_request_is_active(rq))
2527 			__i915_request_unsubmit(rq);
2528 
2529 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2530 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2531 		i915_request_set_hold(rq);
2532 		RQ_TRACE(rq, "on hold\n");
2533 
2534 		for_each_waiter(p, rq) {
2535 			struct i915_request *w =
2536 				container_of(p->waiter, typeof(*w), sched);
2537 
2538 			/* Leave semaphores spinning on the other engines */
2539 			if (w->engine != rq->engine)
2540 				continue;
2541 
2542 			if (!i915_request_is_ready(w))
2543 				continue;
2544 
2545 			if (i915_request_completed(w))
2546 				continue;
2547 
2548 			if (i915_request_on_hold(w))
2549 				continue;
2550 
2551 			list_move_tail(&w->sched.link, &list);
2552 		}
2553 
2554 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2555 	} while (rq);
2556 }
2557 
2558 static bool execlists_hold(struct intel_engine_cs *engine,
2559 			   struct i915_request *rq)
2560 {
2561 	spin_lock_irq(&engine->active.lock);
2562 
2563 	if (i915_request_completed(rq)) { /* too late! */
2564 		rq = NULL;
2565 		goto unlock;
2566 	}
2567 
2568 	if (rq->engine != engine) { /* preempted virtual engine */
2569 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2570 
2571 		/*
2572 		 * intel_context_inflight() is only protected by virtue
2573 		 * of process_csb() being called only by the tasklet (or
2574 		 * directly from inside reset while the tasklet is suspended).
2575 		 * Assert that neither of those are allowed to run while we
2576 		 * poke at the request queues.
2577 		 */
2578 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2579 
2580 		/*
2581 		 * An unsubmitted request along a virtual engine will
2582 		 * remain on the active (this) engine until we are able
2583 		 * to process the context switch away (and so mark the
2584 		 * context as no longer in flight). That cannot have happened
2585 		 * yet, otherwise we would not be hanging!
2586 		 */
2587 		spin_lock(&ve->base.active.lock);
2588 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2589 		GEM_BUG_ON(ve->request != rq);
2590 		ve->request = NULL;
2591 		spin_unlock(&ve->base.active.lock);
2592 		i915_request_put(rq);
2593 
2594 		rq->engine = engine;
2595 	}
2596 
2597 	/*
2598 	 * Transfer this request onto the hold queue to prevent it
2599 	 * being resumbitted to HW (and potentially completed) before we have
2600 	 * released it. Since we may have already submitted following
2601 	 * requests, we need to remove those as well.
2602 	 */
2603 	GEM_BUG_ON(i915_request_on_hold(rq));
2604 	GEM_BUG_ON(rq->engine != engine);
2605 	__execlists_hold(rq);
2606 	GEM_BUG_ON(list_empty(&engine->active.hold));
2607 
2608 unlock:
2609 	spin_unlock_irq(&engine->active.lock);
2610 	return rq;
2611 }
2612 
2613 static bool hold_request(const struct i915_request *rq)
2614 {
2615 	struct i915_dependency *p;
2616 	bool result = false;
2617 
2618 	/*
2619 	 * If one of our ancestors is on hold, we must also be on hold,
2620 	 * otherwise we will bypass it and execute before it.
2621 	 */
2622 	rcu_read_lock();
2623 	for_each_signaler(p, rq) {
2624 		const struct i915_request *s =
2625 			container_of(p->signaler, typeof(*s), sched);
2626 
2627 		if (s->engine != rq->engine)
2628 			continue;
2629 
2630 		result = i915_request_on_hold(s);
2631 		if (result)
2632 			break;
2633 	}
2634 	rcu_read_unlock();
2635 
2636 	return result;
2637 }
2638 
2639 static void __execlists_unhold(struct i915_request *rq)
2640 {
2641 	LIST_HEAD(list);
2642 
2643 	do {
2644 		struct i915_dependency *p;
2645 
2646 		RQ_TRACE(rq, "hold release\n");
2647 
2648 		GEM_BUG_ON(!i915_request_on_hold(rq));
2649 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2650 
2651 		i915_request_clear_hold(rq);
2652 		list_move_tail(&rq->sched.link,
2653 			       i915_sched_lookup_priolist(rq->engine,
2654 							  rq_prio(rq)));
2655 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2656 
2657 		/* Also release any children on this engine that are ready */
2658 		for_each_waiter(p, rq) {
2659 			struct i915_request *w =
2660 				container_of(p->waiter, typeof(*w), sched);
2661 
2662 			/* Propagate any change in error status */
2663 			if (rq->fence.error)
2664 				i915_request_set_error_once(w, rq->fence.error);
2665 
2666 			if (w->engine != rq->engine)
2667 				continue;
2668 
2669 			if (!i915_request_on_hold(w))
2670 				continue;
2671 
2672 			/* Check that no other parents are also on hold */
2673 			if (hold_request(w))
2674 				continue;
2675 
2676 			list_move_tail(&w->sched.link, &list);
2677 		}
2678 
2679 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2680 	} while (rq);
2681 }
2682 
2683 static void execlists_unhold(struct intel_engine_cs *engine,
2684 			     struct i915_request *rq)
2685 {
2686 	spin_lock_irq(&engine->active.lock);
2687 
2688 	/*
2689 	 * Move this request back to the priority queue, and all of its
2690 	 * children and grandchildren that were suspended along with it.
2691 	 */
2692 	__execlists_unhold(rq);
2693 
2694 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2695 		engine->execlists.queue_priority_hint = rq_prio(rq);
2696 		tasklet_hi_schedule(&engine->execlists.tasklet);
2697 	}
2698 
2699 	spin_unlock_irq(&engine->active.lock);
2700 }
2701 
2702 struct execlists_capture {
2703 	struct work_struct work;
2704 	struct i915_request *rq;
2705 	struct i915_gpu_coredump *error;
2706 };
2707 
2708 static void execlists_capture_work(struct work_struct *work)
2709 {
2710 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2711 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2712 	struct intel_engine_cs *engine = cap->rq->engine;
2713 	struct intel_gt_coredump *gt = cap->error->gt;
2714 	struct intel_engine_capture_vma *vma;
2715 
2716 	/* Compress all the objects attached to the request, slow! */
2717 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2718 	if (vma) {
2719 		struct i915_vma_compress *compress =
2720 			i915_vma_capture_prepare(gt);
2721 
2722 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2723 		i915_vma_capture_finish(gt, compress);
2724 	}
2725 
2726 	gt->simulated = gt->engine->simulated;
2727 	cap->error->simulated = gt->simulated;
2728 
2729 	/* Publish the error state, and announce it to the world */
2730 	i915_error_state_store(cap->error);
2731 	i915_gpu_coredump_put(cap->error);
2732 
2733 	/* Return this request and all that depend upon it for signaling */
2734 	execlists_unhold(engine, cap->rq);
2735 	i915_request_put(cap->rq);
2736 
2737 	kfree(cap);
2738 }
2739 
2740 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2741 {
2742 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2743 	struct execlists_capture *cap;
2744 
2745 	cap = kmalloc(sizeof(*cap), gfp);
2746 	if (!cap)
2747 		return NULL;
2748 
2749 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2750 	if (!cap->error)
2751 		goto err_cap;
2752 
2753 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2754 	if (!cap->error->gt)
2755 		goto err_gpu;
2756 
2757 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2758 	if (!cap->error->gt->engine)
2759 		goto err_gt;
2760 
2761 	return cap;
2762 
2763 err_gt:
2764 	kfree(cap->error->gt);
2765 err_gpu:
2766 	kfree(cap->error);
2767 err_cap:
2768 	kfree(cap);
2769 	return NULL;
2770 }
2771 
2772 static bool execlists_capture(struct intel_engine_cs *engine)
2773 {
2774 	struct execlists_capture *cap;
2775 
2776 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2777 		return true;
2778 
2779 	/*
2780 	 * We need to _quickly_ capture the engine state before we reset.
2781 	 * We are inside an atomic section (softirq) here and we are delaying
2782 	 * the forced preemption event.
2783 	 */
2784 	cap = capture_regs(engine);
2785 	if (!cap)
2786 		return true;
2787 
2788 	spin_lock_irq(&engine->active.lock);
2789 	cap->rq = execlists_active(&engine->execlists);
2790 	if (cap->rq) {
2791 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2792 		cap->rq = i915_request_get_rcu(cap->rq);
2793 	}
2794 	spin_unlock_irq(&engine->active.lock);
2795 	if (!cap->rq)
2796 		goto err_free;
2797 
2798 	/*
2799 	 * Remove the request from the execlists queue, and take ownership
2800 	 * of the request. We pass it to our worker who will _slowly_ compress
2801 	 * all the pages the _user_ requested for debugging their batch, after
2802 	 * which we return it to the queue for signaling.
2803 	 *
2804 	 * By removing them from the execlists queue, we also remove the
2805 	 * requests from being processed by __unwind_incomplete_requests()
2806 	 * during the intel_engine_reset(), and so they will *not* be replayed
2807 	 * afterwards.
2808 	 *
2809 	 * Note that because we have not yet reset the engine at this point,
2810 	 * it is possible for the request that we have identified as being
2811 	 * guilty, did in fact complete and we will then hit an arbitration
2812 	 * point allowing the outstanding preemption to succeed. The likelihood
2813 	 * of that is very low (as capturing of the engine registers should be
2814 	 * fast enough to run inside an irq-off atomic section!), so we will
2815 	 * simply hold that request accountable for being non-preemptible
2816 	 * long enough to force the reset.
2817 	 */
2818 	if (!execlists_hold(engine, cap->rq))
2819 		goto err_rq;
2820 
2821 	INIT_WORK(&cap->work, execlists_capture_work);
2822 	schedule_work(&cap->work);
2823 	return true;
2824 
2825 err_rq:
2826 	i915_request_put(cap->rq);
2827 err_free:
2828 	i915_gpu_coredump_put(cap->error);
2829 	kfree(cap);
2830 	return false;
2831 }
2832 
2833 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2834 {
2835 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2836 	unsigned long *lock = &engine->gt->reset.flags;
2837 
2838 	if (!intel_has_reset_engine(engine->gt))
2839 		return;
2840 
2841 	if (test_and_set_bit(bit, lock))
2842 		return;
2843 
2844 	ENGINE_TRACE(engine, "reset for %s\n", msg);
2845 
2846 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2847 	tasklet_disable_nosync(&engine->execlists.tasklet);
2848 
2849 	ring_set_paused(engine, 1); /* Freeze the current request in place */
2850 	if (execlists_capture(engine))
2851 		intel_engine_reset(engine, msg);
2852 	else
2853 		ring_set_paused(engine, 0);
2854 
2855 	tasklet_enable(&engine->execlists.tasklet);
2856 	clear_and_wake_up_bit(bit, lock);
2857 }
2858 
2859 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2860 {
2861 	const struct timer_list *t = &engine->execlists.preempt;
2862 
2863 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2864 		return false;
2865 
2866 	if (!timer_expired(t))
2867 		return false;
2868 
2869 	return READ_ONCE(engine->execlists.pending[0]);
2870 }
2871 
2872 /*
2873  * Check the unread Context Status Buffers and manage the submission of new
2874  * contexts to the ELSP accordingly.
2875  */
2876 static void execlists_submission_tasklet(unsigned long data)
2877 {
2878 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2879 	bool timeout = preempt_timeout(engine);
2880 
2881 	process_csb(engine);
2882 
2883 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2884 		engine->execlists.error_interrupt = 0;
2885 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2886 			execlists_reset(engine, "CS error");
2887 	}
2888 
2889 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2890 		unsigned long flags;
2891 
2892 		spin_lock_irqsave(&engine->active.lock, flags);
2893 		__execlists_submission_tasklet(engine);
2894 		spin_unlock_irqrestore(&engine->active.lock, flags);
2895 
2896 		/* Recheck after serialising with direct-submission */
2897 		if (unlikely(timeout && preempt_timeout(engine)))
2898 			execlists_reset(engine, "preemption time out");
2899 	}
2900 }
2901 
2902 static void __execlists_kick(struct intel_engine_execlists *execlists)
2903 {
2904 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2905 	tasklet_hi_schedule(&execlists->tasklet);
2906 }
2907 
2908 #define execlists_kick(t, member) \
2909 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2910 
2911 static void execlists_timeslice(struct timer_list *timer)
2912 {
2913 	execlists_kick(timer, timer);
2914 }
2915 
2916 static void execlists_preempt(struct timer_list *timer)
2917 {
2918 	execlists_kick(timer, preempt);
2919 }
2920 
2921 static void queue_request(struct intel_engine_cs *engine,
2922 			  struct i915_request *rq)
2923 {
2924 	GEM_BUG_ON(!list_empty(&rq->sched.link));
2925 	list_add_tail(&rq->sched.link,
2926 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2927 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2928 }
2929 
2930 static void __submit_queue_imm(struct intel_engine_cs *engine)
2931 {
2932 	struct intel_engine_execlists * const execlists = &engine->execlists;
2933 
2934 	if (reset_in_progress(execlists))
2935 		return; /* defer until we restart the engine following reset */
2936 
2937 	if (execlists->tasklet.func == execlists_submission_tasklet)
2938 		__execlists_submission_tasklet(engine);
2939 	else
2940 		tasklet_hi_schedule(&execlists->tasklet);
2941 }
2942 
2943 static void submit_queue(struct intel_engine_cs *engine,
2944 			 const struct i915_request *rq)
2945 {
2946 	struct intel_engine_execlists *execlists = &engine->execlists;
2947 
2948 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2949 		return;
2950 
2951 	execlists->queue_priority_hint = rq_prio(rq);
2952 	__submit_queue_imm(engine);
2953 }
2954 
2955 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2956 			     const struct i915_request *rq)
2957 {
2958 	GEM_BUG_ON(i915_request_on_hold(rq));
2959 	return !list_empty(&engine->active.hold) && hold_request(rq);
2960 }
2961 
2962 static void execlists_submit_request(struct i915_request *request)
2963 {
2964 	struct intel_engine_cs *engine = request->engine;
2965 	unsigned long flags;
2966 
2967 	/* Will be called from irq-context when using foreign fences. */
2968 	spin_lock_irqsave(&engine->active.lock, flags);
2969 
2970 	if (unlikely(ancestor_on_hold(engine, request))) {
2971 		RQ_TRACE(request, "ancestor on hold\n");
2972 		list_add_tail(&request->sched.link, &engine->active.hold);
2973 		i915_request_set_hold(request);
2974 	} else {
2975 		queue_request(engine, request);
2976 
2977 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2978 		GEM_BUG_ON(list_empty(&request->sched.link));
2979 
2980 		submit_queue(engine, request);
2981 	}
2982 
2983 	spin_unlock_irqrestore(&engine->active.lock, flags);
2984 }
2985 
2986 static void __execlists_context_fini(struct intel_context *ce)
2987 {
2988 	intel_ring_put(ce->ring);
2989 	i915_vma_put(ce->state);
2990 }
2991 
2992 static void execlists_context_destroy(struct kref *kref)
2993 {
2994 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2995 
2996 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2997 	GEM_BUG_ON(intel_context_is_pinned(ce));
2998 
2999 	if (ce->state)
3000 		__execlists_context_fini(ce);
3001 
3002 	intel_context_fini(ce);
3003 	intel_context_free(ce);
3004 }
3005 
3006 static void
3007 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3008 {
3009 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3010 		return;
3011 
3012 	vaddr += engine->context_size;
3013 
3014 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3015 }
3016 
3017 static void
3018 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3019 {
3020 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3021 		return;
3022 
3023 	vaddr += engine->context_size;
3024 
3025 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3026 		dev_err_once(engine->i915->drm.dev,
3027 			     "%s context redzone overwritten!\n",
3028 			     engine->name);
3029 }
3030 
3031 static void execlists_context_unpin(struct intel_context *ce)
3032 {
3033 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3034 		      ce->engine);
3035 
3036 	i915_gem_object_unpin_map(ce->state->obj);
3037 }
3038 
3039 static void
3040 __execlists_update_reg_state(const struct intel_context *ce,
3041 			     const struct intel_engine_cs *engine,
3042 			     u32 head)
3043 {
3044 	struct intel_ring *ring = ce->ring;
3045 	u32 *regs = ce->lrc_reg_state;
3046 
3047 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3048 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3049 
3050 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3051 	regs[CTX_RING_HEAD] = head;
3052 	regs[CTX_RING_TAIL] = ring->tail;
3053 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3054 
3055 	/* RPCS */
3056 	if (engine->class == RENDER_CLASS) {
3057 		regs[CTX_R_PWR_CLK_STATE] =
3058 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3059 
3060 		i915_oa_init_reg_state(ce, engine);
3061 	}
3062 }
3063 
3064 static int
3065 __execlists_context_pin(struct intel_context *ce,
3066 			struct intel_engine_cs *engine)
3067 {
3068 	void *vaddr;
3069 
3070 	GEM_BUG_ON(!ce->state);
3071 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3072 
3073 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3074 					i915_coherent_map_type(engine->i915) |
3075 					I915_MAP_OVERRIDE);
3076 	if (IS_ERR(vaddr))
3077 		return PTR_ERR(vaddr);
3078 
3079 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3080 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3081 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3082 
3083 	return 0;
3084 }
3085 
3086 static int execlists_context_pin(struct intel_context *ce)
3087 {
3088 	return __execlists_context_pin(ce, ce->engine);
3089 }
3090 
3091 static int execlists_context_alloc(struct intel_context *ce)
3092 {
3093 	return __execlists_context_alloc(ce, ce->engine);
3094 }
3095 
3096 static void execlists_context_reset(struct intel_context *ce)
3097 {
3098 	CE_TRACE(ce, "reset\n");
3099 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3100 
3101 	intel_ring_reset(ce->ring, ce->ring->emit);
3102 
3103 	/* Scrub away the garbage */
3104 	execlists_init_reg_state(ce->lrc_reg_state,
3105 				 ce, ce->engine, ce->ring, true);
3106 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3107 
3108 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3109 }
3110 
3111 static const struct intel_context_ops execlists_context_ops = {
3112 	.alloc = execlists_context_alloc,
3113 
3114 	.pin = execlists_context_pin,
3115 	.unpin = execlists_context_unpin,
3116 
3117 	.enter = intel_context_enter_engine,
3118 	.exit = intel_context_exit_engine,
3119 
3120 	.reset = execlists_context_reset,
3121 	.destroy = execlists_context_destroy,
3122 };
3123 
3124 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3125 {
3126 	u32 *cs;
3127 
3128 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3129 		return 0;
3130 
3131 	cs = intel_ring_begin(rq, 6);
3132 	if (IS_ERR(cs))
3133 		return PTR_ERR(cs);
3134 
3135 	/*
3136 	 * Check if we have been preempted before we even get started.
3137 	 *
3138 	 * After this point i915_request_started() reports true, even if
3139 	 * we get preempted and so are no longer running.
3140 	 */
3141 	*cs++ = MI_ARB_CHECK;
3142 	*cs++ = MI_NOOP;
3143 
3144 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3145 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3146 	*cs++ = 0;
3147 	*cs++ = rq->fence.seqno - 1;
3148 
3149 	intel_ring_advance(rq, cs);
3150 
3151 	/* Record the updated position of the request's payload */
3152 	rq->infix = intel_ring_offset(rq, cs);
3153 
3154 	return 0;
3155 }
3156 
3157 static int execlists_request_alloc(struct i915_request *request)
3158 {
3159 	int ret;
3160 
3161 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3162 
3163 	/*
3164 	 * Flush enough space to reduce the likelihood of waiting after
3165 	 * we start building the request - in which case we will just
3166 	 * have to repeat work.
3167 	 */
3168 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3169 
3170 	/*
3171 	 * Note that after this point, we have committed to using
3172 	 * this request as it is being used to both track the
3173 	 * state of engine initialisation and liveness of the
3174 	 * golden renderstate above. Think twice before you try
3175 	 * to cancel/unwind this request now.
3176 	 */
3177 
3178 	/* Unconditionally invalidate GPU caches and TLBs. */
3179 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3180 	if (ret)
3181 		return ret;
3182 
3183 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3184 	return 0;
3185 }
3186 
3187 /*
3188  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3189  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3190  * but there is a slight complication as this is applied in WA batch where the
3191  * values are only initialized once so we cannot take register value at the
3192  * beginning and reuse it further; hence we save its value to memory, upload a
3193  * constant value with bit21 set and then we restore it back with the saved value.
3194  * To simplify the WA, a constant value is formed by using the default value
3195  * of this register. This shouldn't be a problem because we are only modifying
3196  * it for a short period and this batch in non-premptible. We can ofcourse
3197  * use additional instructions that read the actual value of the register
3198  * at that time and set our bit of interest but it makes the WA complicated.
3199  *
3200  * This WA is also required for Gen9 so extracting as a function avoids
3201  * code duplication.
3202  */
3203 static u32 *
3204 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3205 {
3206 	/* NB no one else is allowed to scribble over scratch + 256! */
3207 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3208 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3209 	*batch++ = intel_gt_scratch_offset(engine->gt,
3210 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3211 	*batch++ = 0;
3212 
3213 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3214 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3215 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3216 
3217 	batch = gen8_emit_pipe_control(batch,
3218 				       PIPE_CONTROL_CS_STALL |
3219 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3220 				       0);
3221 
3222 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3223 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3224 	*batch++ = intel_gt_scratch_offset(engine->gt,
3225 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3226 	*batch++ = 0;
3227 
3228 	return batch;
3229 }
3230 
3231 /*
3232  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3233  * initialized at the beginning and shared across all contexts but this field
3234  * helps us to have multiple batches at different offsets and select them based
3235  * on a criteria. At the moment this batch always start at the beginning of the page
3236  * and at this point we don't have multiple wa_ctx batch buffers.
3237  *
3238  * The number of WA applied are not known at the beginning; we use this field
3239  * to return the no of DWORDS written.
3240  *
3241  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3242  * so it adds NOOPs as padding to make it cacheline aligned.
3243  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3244  * makes a complete batch buffer.
3245  */
3246 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3247 {
3248 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3249 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3250 
3251 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3252 	if (IS_BROADWELL(engine->i915))
3253 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3254 
3255 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3256 	/* Actual scratch location is at 128 bytes offset */
3257 	batch = gen8_emit_pipe_control(batch,
3258 				       PIPE_CONTROL_FLUSH_L3 |
3259 				       PIPE_CONTROL_STORE_DATA_INDEX |
3260 				       PIPE_CONTROL_CS_STALL |
3261 				       PIPE_CONTROL_QW_WRITE,
3262 				       LRC_PPHWSP_SCRATCH_ADDR);
3263 
3264 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3265 
3266 	/* Pad to end of cacheline */
3267 	while ((unsigned long)batch % CACHELINE_BYTES)
3268 		*batch++ = MI_NOOP;
3269 
3270 	/*
3271 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3272 	 * execution depends on the length specified in terms of cache lines
3273 	 * in the register CTX_RCS_INDIRECT_CTX
3274 	 */
3275 
3276 	return batch;
3277 }
3278 
3279 struct lri {
3280 	i915_reg_t reg;
3281 	u32 value;
3282 };
3283 
3284 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3285 {
3286 	GEM_BUG_ON(!count || count > 63);
3287 
3288 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3289 	do {
3290 		*batch++ = i915_mmio_reg_offset(lri->reg);
3291 		*batch++ = lri->value;
3292 	} while (lri++, --count);
3293 	*batch++ = MI_NOOP;
3294 
3295 	return batch;
3296 }
3297 
3298 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3299 {
3300 	static const struct lri lri[] = {
3301 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3302 		{
3303 			COMMON_SLICE_CHICKEN2,
3304 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3305 				       0),
3306 		},
3307 
3308 		/* BSpec: 11391 */
3309 		{
3310 			FF_SLICE_CHICKEN,
3311 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3312 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3313 		},
3314 
3315 		/* BSpec: 11299 */
3316 		{
3317 			_3D_CHICKEN3,
3318 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3319 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3320 		}
3321 	};
3322 
3323 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3324 
3325 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3326 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3327 
3328 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3329 	batch = gen8_emit_pipe_control(batch,
3330 				       PIPE_CONTROL_FLUSH_L3 |
3331 				       PIPE_CONTROL_STORE_DATA_INDEX |
3332 				       PIPE_CONTROL_CS_STALL |
3333 				       PIPE_CONTROL_QW_WRITE,
3334 				       LRC_PPHWSP_SCRATCH_ADDR);
3335 
3336 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3337 
3338 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3339 	if (HAS_POOLED_EU(engine->i915)) {
3340 		/*
3341 		 * EU pool configuration is setup along with golden context
3342 		 * during context initialization. This value depends on
3343 		 * device type (2x6 or 3x6) and needs to be updated based
3344 		 * on which subslice is disabled especially for 2x6
3345 		 * devices, however it is safe to load default
3346 		 * configuration of 3x6 device instead of masking off
3347 		 * corresponding bits because HW ignores bits of a disabled
3348 		 * subslice and drops down to appropriate config. Please
3349 		 * see render_state_setup() in i915_gem_render_state.c for
3350 		 * possible configurations, to avoid duplication they are
3351 		 * not shown here again.
3352 		 */
3353 		*batch++ = GEN9_MEDIA_POOL_STATE;
3354 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3355 		*batch++ = 0x00777000;
3356 		*batch++ = 0;
3357 		*batch++ = 0;
3358 		*batch++ = 0;
3359 	}
3360 
3361 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3362 
3363 	/* Pad to end of cacheline */
3364 	while ((unsigned long)batch % CACHELINE_BYTES)
3365 		*batch++ = MI_NOOP;
3366 
3367 	return batch;
3368 }
3369 
3370 static u32 *
3371 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3372 {
3373 	int i;
3374 
3375 	/*
3376 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3377 	 *
3378 	 * Ensure the engine is idle prior to programming a
3379 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3380 	 */
3381 	batch = gen8_emit_pipe_control(batch,
3382 				       PIPE_CONTROL_CS_STALL,
3383 				       0);
3384 	/*
3385 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3386 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3387 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3388 	 * confusing. Since gen8_emit_pipe_control() already advances the
3389 	 * batch by 6 dwords, we advance the other 10 here, completing a
3390 	 * cacheline. It's not clear if the workaround requires this padding
3391 	 * before other commands, or if it's just the regular padding we would
3392 	 * already have for the workaround bb, so leave it here for now.
3393 	 */
3394 	for (i = 0; i < 10; i++)
3395 		*batch++ = MI_NOOP;
3396 
3397 	/* Pad to end of cacheline */
3398 	while ((unsigned long)batch % CACHELINE_BYTES)
3399 		*batch++ = MI_NOOP;
3400 
3401 	return batch;
3402 }
3403 
3404 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3405 
3406 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3407 {
3408 	struct drm_i915_gem_object *obj;
3409 	struct i915_vma *vma;
3410 	int err;
3411 
3412 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3413 	if (IS_ERR(obj))
3414 		return PTR_ERR(obj);
3415 
3416 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3417 	if (IS_ERR(vma)) {
3418 		err = PTR_ERR(vma);
3419 		goto err;
3420 	}
3421 
3422 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3423 	if (err)
3424 		goto err;
3425 
3426 	engine->wa_ctx.vma = vma;
3427 	return 0;
3428 
3429 err:
3430 	i915_gem_object_put(obj);
3431 	return err;
3432 }
3433 
3434 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3435 {
3436 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3437 }
3438 
3439 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3440 
3441 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3442 {
3443 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3444 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3445 					    &wa_ctx->per_ctx };
3446 	wa_bb_func_t wa_bb_fn[2];
3447 	struct page *page;
3448 	void *batch, *batch_ptr;
3449 	unsigned int i;
3450 	int ret;
3451 
3452 	if (engine->class != RENDER_CLASS)
3453 		return 0;
3454 
3455 	switch (INTEL_GEN(engine->i915)) {
3456 	case 12:
3457 	case 11:
3458 		return 0;
3459 	case 10:
3460 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3461 		wa_bb_fn[1] = NULL;
3462 		break;
3463 	case 9:
3464 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3465 		wa_bb_fn[1] = NULL;
3466 		break;
3467 	case 8:
3468 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3469 		wa_bb_fn[1] = NULL;
3470 		break;
3471 	default:
3472 		MISSING_CASE(INTEL_GEN(engine->i915));
3473 		return 0;
3474 	}
3475 
3476 	ret = lrc_setup_wa_ctx(engine);
3477 	if (ret) {
3478 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3479 		return ret;
3480 	}
3481 
3482 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3483 	batch = batch_ptr = kmap_atomic(page);
3484 
3485 	/*
3486 	 * Emit the two workaround batch buffers, recording the offset from the
3487 	 * start of the workaround batch buffer object for each and their
3488 	 * respective sizes.
3489 	 */
3490 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3491 		wa_bb[i]->offset = batch_ptr - batch;
3492 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3493 						  CACHELINE_BYTES))) {
3494 			ret = -EINVAL;
3495 			break;
3496 		}
3497 		if (wa_bb_fn[i])
3498 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3499 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3500 	}
3501 
3502 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3503 
3504 	kunmap_atomic(batch);
3505 	if (ret)
3506 		lrc_destroy_wa_ctx(engine);
3507 
3508 	return ret;
3509 }
3510 
3511 static void enable_error_interrupt(struct intel_engine_cs *engine)
3512 {
3513 	u32 status;
3514 
3515 	engine->execlists.error_interrupt = 0;
3516 	ENGINE_WRITE(engine, RING_EMR, ~0u);
3517 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3518 
3519 	status = ENGINE_READ(engine, RING_ESR);
3520 	if (unlikely(status)) {
3521 		dev_err(engine->i915->drm.dev,
3522 			"engine '%s' resumed still in error: %08x\n",
3523 			engine->name, status);
3524 		__intel_gt_reset(engine->gt, engine->mask);
3525 	}
3526 
3527 	/*
3528 	 * On current gen8+, we have 2 signals to play with
3529 	 *
3530 	 * - I915_ERROR_INSTUCTION (bit 0)
3531 	 *
3532 	 *    Generate an error if the command parser encounters an invalid
3533 	 *    instruction
3534 	 *
3535 	 *    This is a fatal error.
3536 	 *
3537 	 * - CP_PRIV (bit 2)
3538 	 *
3539 	 *    Generate an error on privilege violation (where the CP replaces
3540 	 *    the instruction with a no-op). This also fires for writes into
3541 	 *    read-only scratch pages.
3542 	 *
3543 	 *    This is a non-fatal error, parsing continues.
3544 	 *
3545 	 * * there are a few others defined for odd HW that we do not use
3546 	 *
3547 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
3548 	 * error (as the HW is validating and suppressing the mistakes), we
3549 	 * only unmask the instruction error bit.
3550 	 */
3551 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3552 }
3553 
3554 static void enable_execlists(struct intel_engine_cs *engine)
3555 {
3556 	u32 mode;
3557 
3558 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3559 
3560 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3561 
3562 	if (INTEL_GEN(engine->i915) >= 11)
3563 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3564 	else
3565 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3566 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3567 
3568 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3569 
3570 	ENGINE_WRITE_FW(engine,
3571 			RING_HWS_PGA,
3572 			i915_ggtt_offset(engine->status_page.vma));
3573 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3574 
3575 	enable_error_interrupt(engine);
3576 
3577 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
3578 }
3579 
3580 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3581 {
3582 	bool unexpected = false;
3583 
3584 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3585 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3586 		unexpected = true;
3587 	}
3588 
3589 	return unexpected;
3590 }
3591 
3592 static int execlists_resume(struct intel_engine_cs *engine)
3593 {
3594 	intel_mocs_init_engine(engine);
3595 
3596 	intel_engine_reset_breadcrumbs(engine);
3597 
3598 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3599 		struct drm_printer p = drm_debug_printer(__func__);
3600 
3601 		intel_engine_dump(engine, &p, NULL);
3602 	}
3603 
3604 	enable_execlists(engine);
3605 
3606 	return 0;
3607 }
3608 
3609 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3610 {
3611 	struct intel_engine_execlists * const execlists = &engine->execlists;
3612 	unsigned long flags;
3613 
3614 	ENGINE_TRACE(engine, "depth<-%d\n",
3615 		     atomic_read(&execlists->tasklet.count));
3616 
3617 	/*
3618 	 * Prevent request submission to the hardware until we have
3619 	 * completed the reset in i915_gem_reset_finish(). If a request
3620 	 * is completed by one engine, it may then queue a request
3621 	 * to a second via its execlists->tasklet *just* as we are
3622 	 * calling engine->resume() and also writing the ELSP.
3623 	 * Turning off the execlists->tasklet until the reset is over
3624 	 * prevents the race.
3625 	 */
3626 	__tasklet_disable_sync_once(&execlists->tasklet);
3627 	GEM_BUG_ON(!reset_in_progress(execlists));
3628 
3629 	/* And flush any current direct submission. */
3630 	spin_lock_irqsave(&engine->active.lock, flags);
3631 	spin_unlock_irqrestore(&engine->active.lock, flags);
3632 
3633 	/*
3634 	 * We stop engines, otherwise we might get failed reset and a
3635 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
3636 	 * from system hang if batchbuffer is progressing when
3637 	 * the reset is issued, regardless of READY_TO_RESET ack.
3638 	 * Thus assume it is best to stop engines on all gens
3639 	 * where we have a gpu reset.
3640 	 *
3641 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3642 	 *
3643 	 * FIXME: Wa for more modern gens needs to be validated
3644 	 */
3645 	intel_engine_stop_cs(engine);
3646 }
3647 
3648 static void reset_csb_pointers(struct intel_engine_cs *engine)
3649 {
3650 	struct intel_engine_execlists * const execlists = &engine->execlists;
3651 	const unsigned int reset_value = execlists->csb_size - 1;
3652 
3653 	ring_set_paused(engine, 0);
3654 
3655 	/*
3656 	 * After a reset, the HW starts writing into CSB entry [0]. We
3657 	 * therefore have to set our HEAD pointer back one entry so that
3658 	 * the *first* entry we check is entry 0. To complicate this further,
3659 	 * as we don't wait for the first interrupt after reset, we have to
3660 	 * fake the HW write to point back to the last entry so that our
3661 	 * inline comparison of our cached head position against the last HW
3662 	 * write works even before the first interrupt.
3663 	 */
3664 	execlists->csb_head = reset_value;
3665 	WRITE_ONCE(*execlists->csb_write, reset_value);
3666 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3667 
3668 	/*
3669 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3670 	 * Bludgeon them with a mmio update to be sure.
3671 	 */
3672 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3673 		     reset_value << 8 | reset_value);
3674 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3675 
3676 	invalidate_csb_entries(&execlists->csb_status[0],
3677 			       &execlists->csb_status[reset_value]);
3678 }
3679 
3680 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3681 {
3682 	int x;
3683 
3684 	x = lrc_ring_mi_mode(engine);
3685 	if (x != -1) {
3686 		regs[x + 1] &= ~STOP_RING;
3687 		regs[x + 1] |= STOP_RING << 16;
3688 	}
3689 }
3690 
3691 static void __execlists_reset_reg_state(const struct intel_context *ce,
3692 					const struct intel_engine_cs *engine)
3693 {
3694 	u32 *regs = ce->lrc_reg_state;
3695 
3696 	__reset_stop_ring(regs, engine);
3697 }
3698 
3699 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3700 {
3701 	struct intel_engine_execlists * const execlists = &engine->execlists;
3702 	struct intel_context *ce;
3703 	struct i915_request *rq;
3704 	u32 head;
3705 
3706 	mb(); /* paranoia: read the CSB pointers from after the reset */
3707 	clflush(execlists->csb_write);
3708 	mb();
3709 
3710 	process_csb(engine); /* drain preemption events */
3711 
3712 	/* Following the reset, we need to reload the CSB read/write pointers */
3713 	reset_csb_pointers(engine);
3714 
3715 	/*
3716 	 * Save the currently executing context, even if we completed
3717 	 * its request, it was still running at the time of the
3718 	 * reset and will have been clobbered.
3719 	 */
3720 	rq = execlists_active(execlists);
3721 	if (!rq)
3722 		goto unwind;
3723 
3724 	ce = rq->context;
3725 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3726 
3727 	if (i915_request_completed(rq)) {
3728 		/* Idle context; tidy up the ring so we can restart afresh */
3729 		head = intel_ring_wrap(ce->ring, rq->tail);
3730 		goto out_replay;
3731 	}
3732 
3733 	/* We still have requests in-flight; the engine should be active */
3734 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3735 
3736 	/* Context has requests still in-flight; it should not be idle! */
3737 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3738 
3739 	rq = active_request(ce->timeline, rq);
3740 	head = intel_ring_wrap(ce->ring, rq->head);
3741 	GEM_BUG_ON(head == ce->ring->tail);
3742 
3743 	/*
3744 	 * If this request hasn't started yet, e.g. it is waiting on a
3745 	 * semaphore, we need to avoid skipping the request or else we
3746 	 * break the signaling chain. However, if the context is corrupt
3747 	 * the request will not restart and we will be stuck with a wedged
3748 	 * device. It is quite often the case that if we issue a reset
3749 	 * while the GPU is loading the context image, that the context
3750 	 * image becomes corrupt.
3751 	 *
3752 	 * Otherwise, if we have not started yet, the request should replay
3753 	 * perfectly and we do not need to flag the result as being erroneous.
3754 	 */
3755 	if (!i915_request_started(rq))
3756 		goto out_replay;
3757 
3758 	/*
3759 	 * If the request was innocent, we leave the request in the ELSP
3760 	 * and will try to replay it on restarting. The context image may
3761 	 * have been corrupted by the reset, in which case we may have
3762 	 * to service a new GPU hang, but more likely we can continue on
3763 	 * without impact.
3764 	 *
3765 	 * If the request was guilty, we presume the context is corrupt
3766 	 * and have to at least restore the RING register in the context
3767 	 * image back to the expected values to skip over the guilty request.
3768 	 */
3769 	__i915_request_reset(rq, stalled);
3770 	if (!stalled)
3771 		goto out_replay;
3772 
3773 	/*
3774 	 * We want a simple context + ring to execute the breadcrumb update.
3775 	 * We cannot rely on the context being intact across the GPU hang,
3776 	 * so clear it and rebuild just what we need for the breadcrumb.
3777 	 * All pending requests for this context will be zapped, and any
3778 	 * future request will be after userspace has had the opportunity
3779 	 * to recreate its own state.
3780 	 */
3781 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3782 	restore_default_state(ce, engine);
3783 
3784 out_replay:
3785 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3786 		     head, ce->ring->tail);
3787 	__execlists_reset_reg_state(ce, engine);
3788 	__execlists_update_reg_state(ce, engine, head);
3789 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3790 
3791 unwind:
3792 	/* Push back any incomplete requests for replay after the reset. */
3793 	cancel_port_requests(execlists);
3794 	__unwind_incomplete_requests(engine);
3795 }
3796 
3797 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3798 {
3799 	unsigned long flags;
3800 
3801 	ENGINE_TRACE(engine, "\n");
3802 
3803 	spin_lock_irqsave(&engine->active.lock, flags);
3804 
3805 	__execlists_reset(engine, stalled);
3806 
3807 	spin_unlock_irqrestore(&engine->active.lock, flags);
3808 }
3809 
3810 static void nop_submission_tasklet(unsigned long data)
3811 {
3812 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3813 
3814 	/* The driver is wedged; don't process any more events. */
3815 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3816 }
3817 
3818 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3819 {
3820 	struct intel_engine_execlists * const execlists = &engine->execlists;
3821 	struct i915_request *rq, *rn;
3822 	struct rb_node *rb;
3823 	unsigned long flags;
3824 
3825 	ENGINE_TRACE(engine, "\n");
3826 
3827 	/*
3828 	 * Before we call engine->cancel_requests(), we should have exclusive
3829 	 * access to the submission state. This is arranged for us by the
3830 	 * caller disabling the interrupt generation, the tasklet and other
3831 	 * threads that may then access the same state, giving us a free hand
3832 	 * to reset state. However, we still need to let lockdep be aware that
3833 	 * we know this state may be accessed in hardirq context, so we
3834 	 * disable the irq around this manipulation and we want to keep
3835 	 * the spinlock focused on its duties and not accidentally conflate
3836 	 * coverage to the submission's irq state. (Similarly, although we
3837 	 * shouldn't need to disable irq around the manipulation of the
3838 	 * submission's irq state, we also wish to remind ourselves that
3839 	 * it is irq state.)
3840 	 */
3841 	spin_lock_irqsave(&engine->active.lock, flags);
3842 
3843 	__execlists_reset(engine, true);
3844 
3845 	/* Mark all executing requests as skipped. */
3846 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3847 		mark_eio(rq);
3848 
3849 	/* Flush the queued requests to the timeline list (for retiring). */
3850 	while ((rb = rb_first_cached(&execlists->queue))) {
3851 		struct i915_priolist *p = to_priolist(rb);
3852 		int i;
3853 
3854 		priolist_for_each_request_consume(rq, rn, p, i) {
3855 			mark_eio(rq);
3856 			__i915_request_submit(rq);
3857 		}
3858 
3859 		rb_erase_cached(&p->node, &execlists->queue);
3860 		i915_priolist_free(p);
3861 	}
3862 
3863 	/* On-hold requests will be flushed to timeline upon their release */
3864 	list_for_each_entry(rq, &engine->active.hold, sched.link)
3865 		mark_eio(rq);
3866 
3867 	/* Cancel all attached virtual engines */
3868 	while ((rb = rb_first_cached(&execlists->virtual))) {
3869 		struct virtual_engine *ve =
3870 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3871 
3872 		rb_erase_cached(rb, &execlists->virtual);
3873 		RB_CLEAR_NODE(rb);
3874 
3875 		spin_lock(&ve->base.active.lock);
3876 		rq = fetch_and_zero(&ve->request);
3877 		if (rq) {
3878 			mark_eio(rq);
3879 
3880 			rq->engine = engine;
3881 			__i915_request_submit(rq);
3882 			i915_request_put(rq);
3883 
3884 			ve->base.execlists.queue_priority_hint = INT_MIN;
3885 		}
3886 		spin_unlock(&ve->base.active.lock);
3887 	}
3888 
3889 	/* Remaining _unready_ requests will be nop'ed when submitted */
3890 
3891 	execlists->queue_priority_hint = INT_MIN;
3892 	execlists->queue = RB_ROOT_CACHED;
3893 
3894 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3895 	execlists->tasklet.func = nop_submission_tasklet;
3896 
3897 	spin_unlock_irqrestore(&engine->active.lock, flags);
3898 }
3899 
3900 static void execlists_reset_finish(struct intel_engine_cs *engine)
3901 {
3902 	struct intel_engine_execlists * const execlists = &engine->execlists;
3903 
3904 	/*
3905 	 * After a GPU reset, we may have requests to replay. Do so now while
3906 	 * we still have the forcewake to be sure that the GPU is not allowed
3907 	 * to sleep before we restart and reload a context.
3908 	 */
3909 	GEM_BUG_ON(!reset_in_progress(execlists));
3910 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3911 		execlists->tasklet.func(execlists->tasklet.data);
3912 
3913 	if (__tasklet_enable(&execlists->tasklet))
3914 		/* And kick in case we missed a new request submission. */
3915 		tasklet_hi_schedule(&execlists->tasklet);
3916 	ENGINE_TRACE(engine, "depth->%d\n",
3917 		     atomic_read(&execlists->tasklet.count));
3918 }
3919 
3920 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3921 				    u64 offset, u32 len,
3922 				    const unsigned int flags)
3923 {
3924 	u32 *cs;
3925 
3926 	cs = intel_ring_begin(rq, 4);
3927 	if (IS_ERR(cs))
3928 		return PTR_ERR(cs);
3929 
3930 	/*
3931 	 * WaDisableCtxRestoreArbitration:bdw,chv
3932 	 *
3933 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3934 	 * particular all the gen that do not need the w/a at all!), if we
3935 	 * took care to make sure that on every switch into this context
3936 	 * (both ordinary and for preemption) that arbitrartion was enabled
3937 	 * we would be fine.  However, for gen8 there is another w/a that
3938 	 * requires us to not preempt inside GPGPU execution, so we keep
3939 	 * arbitration disabled for gen8 batches. Arbitration will be
3940 	 * re-enabled before we close the request
3941 	 * (engine->emit_fini_breadcrumb).
3942 	 */
3943 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3944 
3945 	/* FIXME(BDW+): Address space and security selectors. */
3946 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3947 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3948 	*cs++ = lower_32_bits(offset);
3949 	*cs++ = upper_32_bits(offset);
3950 
3951 	intel_ring_advance(rq, cs);
3952 
3953 	return 0;
3954 }
3955 
3956 static int gen8_emit_bb_start(struct i915_request *rq,
3957 			      u64 offset, u32 len,
3958 			      const unsigned int flags)
3959 {
3960 	u32 *cs;
3961 
3962 	cs = intel_ring_begin(rq, 6);
3963 	if (IS_ERR(cs))
3964 		return PTR_ERR(cs);
3965 
3966 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3967 
3968 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3969 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3970 	*cs++ = lower_32_bits(offset);
3971 	*cs++ = upper_32_bits(offset);
3972 
3973 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3974 	*cs++ = MI_NOOP;
3975 
3976 	intel_ring_advance(rq, cs);
3977 
3978 	return 0;
3979 }
3980 
3981 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3982 {
3983 	ENGINE_WRITE(engine, RING_IMR,
3984 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3985 	ENGINE_POSTING_READ(engine, RING_IMR);
3986 }
3987 
3988 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3989 {
3990 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3991 }
3992 
3993 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3994 {
3995 	u32 cmd, *cs;
3996 
3997 	cs = intel_ring_begin(request, 4);
3998 	if (IS_ERR(cs))
3999 		return PTR_ERR(cs);
4000 
4001 	cmd = MI_FLUSH_DW + 1;
4002 
4003 	/* We always require a command barrier so that subsequent
4004 	 * commands, such as breadcrumb interrupts, are strictly ordered
4005 	 * wrt the contents of the write cache being flushed to memory
4006 	 * (and thus being coherent from the CPU).
4007 	 */
4008 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4009 
4010 	if (mode & EMIT_INVALIDATE) {
4011 		cmd |= MI_INVALIDATE_TLB;
4012 		if (request->engine->class == VIDEO_DECODE_CLASS)
4013 			cmd |= MI_INVALIDATE_BSD;
4014 	}
4015 
4016 	*cs++ = cmd;
4017 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4018 	*cs++ = 0; /* upper addr */
4019 	*cs++ = 0; /* value */
4020 	intel_ring_advance(request, cs);
4021 
4022 	return 0;
4023 }
4024 
4025 static int gen8_emit_flush_render(struct i915_request *request,
4026 				  u32 mode)
4027 {
4028 	bool vf_flush_wa = false, dc_flush_wa = false;
4029 	u32 *cs, flags = 0;
4030 	int len;
4031 
4032 	flags |= PIPE_CONTROL_CS_STALL;
4033 
4034 	if (mode & EMIT_FLUSH) {
4035 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4036 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4037 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4038 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4039 	}
4040 
4041 	if (mode & EMIT_INVALIDATE) {
4042 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4043 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4044 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4045 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4046 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4047 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4048 		flags |= PIPE_CONTROL_QW_WRITE;
4049 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4050 
4051 		/*
4052 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4053 		 * pipe control.
4054 		 */
4055 		if (IS_GEN(request->i915, 9))
4056 			vf_flush_wa = true;
4057 
4058 		/* WaForGAMHang:kbl */
4059 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4060 			dc_flush_wa = true;
4061 	}
4062 
4063 	len = 6;
4064 
4065 	if (vf_flush_wa)
4066 		len += 6;
4067 
4068 	if (dc_flush_wa)
4069 		len += 12;
4070 
4071 	cs = intel_ring_begin(request, len);
4072 	if (IS_ERR(cs))
4073 		return PTR_ERR(cs);
4074 
4075 	if (vf_flush_wa)
4076 		cs = gen8_emit_pipe_control(cs, 0, 0);
4077 
4078 	if (dc_flush_wa)
4079 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4080 					    0);
4081 
4082 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4083 
4084 	if (dc_flush_wa)
4085 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4086 
4087 	intel_ring_advance(request, cs);
4088 
4089 	return 0;
4090 }
4091 
4092 static int gen11_emit_flush_render(struct i915_request *request,
4093 				   u32 mode)
4094 {
4095 	if (mode & EMIT_FLUSH) {
4096 		u32 *cs;
4097 		u32 flags = 0;
4098 
4099 		flags |= PIPE_CONTROL_CS_STALL;
4100 
4101 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4102 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4103 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4104 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4105 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4106 		flags |= PIPE_CONTROL_QW_WRITE;
4107 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4108 
4109 		cs = intel_ring_begin(request, 6);
4110 		if (IS_ERR(cs))
4111 			return PTR_ERR(cs);
4112 
4113 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4114 		intel_ring_advance(request, cs);
4115 	}
4116 
4117 	if (mode & EMIT_INVALIDATE) {
4118 		u32 *cs;
4119 		u32 flags = 0;
4120 
4121 		flags |= PIPE_CONTROL_CS_STALL;
4122 
4123 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4124 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4125 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4126 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4127 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4128 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4129 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4130 		flags |= PIPE_CONTROL_QW_WRITE;
4131 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4132 
4133 		cs = intel_ring_begin(request, 6);
4134 		if (IS_ERR(cs))
4135 			return PTR_ERR(cs);
4136 
4137 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4138 		intel_ring_advance(request, cs);
4139 	}
4140 
4141 	return 0;
4142 }
4143 
4144 static u32 preparser_disable(bool state)
4145 {
4146 	return MI_ARB_CHECK | 1 << 8 | state;
4147 }
4148 
4149 static int gen12_emit_flush_render(struct i915_request *request,
4150 				   u32 mode)
4151 {
4152 	if (mode & EMIT_FLUSH) {
4153 		u32 flags = 0;
4154 		u32 *cs;
4155 
4156 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4157 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4158 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4159 		/* Wa_1409600907:tgl */
4160 		flags |= PIPE_CONTROL_DEPTH_STALL;
4161 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4162 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4163 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4164 
4165 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4166 		flags |= PIPE_CONTROL_QW_WRITE;
4167 
4168 		flags |= PIPE_CONTROL_CS_STALL;
4169 
4170 		cs = intel_ring_begin(request, 6);
4171 		if (IS_ERR(cs))
4172 			return PTR_ERR(cs);
4173 
4174 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4175 		intel_ring_advance(request, cs);
4176 	}
4177 
4178 	if (mode & EMIT_INVALIDATE) {
4179 		u32 flags = 0;
4180 		u32 *cs;
4181 
4182 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4183 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4184 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4185 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4186 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4187 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4188 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4189 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4190 
4191 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4192 		flags |= PIPE_CONTROL_QW_WRITE;
4193 
4194 		flags |= PIPE_CONTROL_CS_STALL;
4195 
4196 		cs = intel_ring_begin(request, 8);
4197 		if (IS_ERR(cs))
4198 			return PTR_ERR(cs);
4199 
4200 		/*
4201 		 * Prevent the pre-parser from skipping past the TLB
4202 		 * invalidate and loading a stale page for the batch
4203 		 * buffer / request payload.
4204 		 */
4205 		*cs++ = preparser_disable(true);
4206 
4207 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4208 
4209 		*cs++ = preparser_disable(false);
4210 		intel_ring_advance(request, cs);
4211 	}
4212 
4213 	return 0;
4214 }
4215 
4216 /*
4217  * Reserve space for 2 NOOPs at the end of each request to be
4218  * used as a workaround for not being allowed to do lite
4219  * restore with HEAD==TAIL (WaIdleLiteRestore).
4220  */
4221 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4222 {
4223 	/* Ensure there's always at least one preemption point per-request. */
4224 	*cs++ = MI_ARB_CHECK;
4225 	*cs++ = MI_NOOP;
4226 	request->wa_tail = intel_ring_offset(request, cs);
4227 
4228 	return cs;
4229 }
4230 
4231 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4232 {
4233 	*cs++ = MI_SEMAPHORE_WAIT |
4234 		MI_SEMAPHORE_GLOBAL_GTT |
4235 		MI_SEMAPHORE_POLL |
4236 		MI_SEMAPHORE_SAD_EQ_SDD;
4237 	*cs++ = 0;
4238 	*cs++ = intel_hws_preempt_address(request->engine);
4239 	*cs++ = 0;
4240 
4241 	return cs;
4242 }
4243 
4244 static __always_inline u32*
4245 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4246 				 u32 *cs)
4247 {
4248 	*cs++ = MI_USER_INTERRUPT;
4249 
4250 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4251 	if (intel_engine_has_semaphores(request->engine))
4252 		cs = emit_preempt_busywait(request, cs);
4253 
4254 	request->tail = intel_ring_offset(request, cs);
4255 	assert_ring_tail_valid(request->ring, request->tail);
4256 
4257 	return gen8_emit_wa_tail(request, cs);
4258 }
4259 
4260 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4261 {
4262 	cs = gen8_emit_ggtt_write(cs,
4263 				  request->fence.seqno,
4264 				  i915_request_active_timeline(request)->hwsp_offset,
4265 				  0);
4266 
4267 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4268 }
4269 
4270 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4271 {
4272 	cs = gen8_emit_pipe_control(cs,
4273 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4274 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4275 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4276 				    0);
4277 
4278 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4279 	cs = gen8_emit_ggtt_write_rcs(cs,
4280 				      request->fence.seqno,
4281 				      i915_request_active_timeline(request)->hwsp_offset,
4282 				      PIPE_CONTROL_FLUSH_ENABLE |
4283 				      PIPE_CONTROL_CS_STALL);
4284 
4285 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4286 }
4287 
4288 static u32 *
4289 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4290 {
4291 	cs = gen8_emit_ggtt_write_rcs(cs,
4292 				      request->fence.seqno,
4293 				      i915_request_active_timeline(request)->hwsp_offset,
4294 				      PIPE_CONTROL_CS_STALL |
4295 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4296 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4297 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4298 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4299 				      PIPE_CONTROL_FLUSH_ENABLE);
4300 
4301 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4302 }
4303 
4304 /*
4305  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4306  * flush and will continue pre-fetching the instructions after it before the
4307  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4308  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4309  * of the next request before the memory has been flushed, we're guaranteed that
4310  * we won't access the batch itself too early.
4311  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4312  * so, if the current request is modifying an instruction in the next request on
4313  * the same intel_context, we might pre-fetch and then execute the pre-update
4314  * instruction. To avoid this, the users of self-modifying code should either
4315  * disable the parser around the code emitting the memory writes, via a new flag
4316  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4317  * the in-kernel use-cases we've opted to use a separate context, see
4318  * reloc_gpu() as an example.
4319  * All the above applies only to the instructions themselves. Non-inline data
4320  * used by the instructions is not pre-fetched.
4321  */
4322 
4323 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4324 {
4325 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4326 		MI_SEMAPHORE_GLOBAL_GTT |
4327 		MI_SEMAPHORE_POLL |
4328 		MI_SEMAPHORE_SAD_EQ_SDD;
4329 	*cs++ = 0;
4330 	*cs++ = intel_hws_preempt_address(request->engine);
4331 	*cs++ = 0;
4332 	*cs++ = 0;
4333 	*cs++ = MI_NOOP;
4334 
4335 	return cs;
4336 }
4337 
4338 static __always_inline u32*
4339 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4340 {
4341 	*cs++ = MI_USER_INTERRUPT;
4342 
4343 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4344 	if (intel_engine_has_semaphores(request->engine))
4345 		cs = gen12_emit_preempt_busywait(request, cs);
4346 
4347 	request->tail = intel_ring_offset(request, cs);
4348 	assert_ring_tail_valid(request->ring, request->tail);
4349 
4350 	return gen8_emit_wa_tail(request, cs);
4351 }
4352 
4353 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4354 {
4355 	cs = gen8_emit_ggtt_write(cs,
4356 				  request->fence.seqno,
4357 				  i915_request_active_timeline(request)->hwsp_offset,
4358 				  0);
4359 
4360 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4361 }
4362 
4363 static u32 *
4364 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4365 {
4366 	cs = gen8_emit_ggtt_write_rcs(cs,
4367 				      request->fence.seqno,
4368 				      i915_request_active_timeline(request)->hwsp_offset,
4369 				      PIPE_CONTROL_CS_STALL |
4370 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4371 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4372 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4373 				      /* Wa_1409600907:tgl */
4374 				      PIPE_CONTROL_DEPTH_STALL |
4375 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4376 				      PIPE_CONTROL_FLUSH_ENABLE |
4377 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4378 
4379 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4380 }
4381 
4382 static void execlists_park(struct intel_engine_cs *engine)
4383 {
4384 	cancel_timer(&engine->execlists.timer);
4385 	cancel_timer(&engine->execlists.preempt);
4386 }
4387 
4388 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4389 {
4390 	engine->submit_request = execlists_submit_request;
4391 	engine->schedule = i915_schedule;
4392 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4393 
4394 	engine->reset.prepare = execlists_reset_prepare;
4395 	engine->reset.rewind = execlists_reset_rewind;
4396 	engine->reset.cancel = execlists_reset_cancel;
4397 	engine->reset.finish = execlists_reset_finish;
4398 
4399 	engine->park = execlists_park;
4400 	engine->unpark = NULL;
4401 
4402 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4403 	if (!intel_vgpu_active(engine->i915)) {
4404 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4405 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4406 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4407 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4408 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4409 		}
4410 	}
4411 
4412 	if (INTEL_GEN(engine->i915) >= 12)
4413 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4414 
4415 	if (intel_engine_has_preemption(engine))
4416 		engine->emit_bb_start = gen8_emit_bb_start;
4417 	else
4418 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4419 }
4420 
4421 static void execlists_shutdown(struct intel_engine_cs *engine)
4422 {
4423 	/* Synchronise with residual timers and any softirq they raise */
4424 	del_timer_sync(&engine->execlists.timer);
4425 	del_timer_sync(&engine->execlists.preempt);
4426 	tasklet_kill(&engine->execlists.tasklet);
4427 }
4428 
4429 static void execlists_release(struct intel_engine_cs *engine)
4430 {
4431 	execlists_shutdown(engine);
4432 
4433 	intel_engine_cleanup_common(engine);
4434 	lrc_destroy_wa_ctx(engine);
4435 }
4436 
4437 static void
4438 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4439 {
4440 	/* Default vfuncs which can be overriden by each engine. */
4441 
4442 	engine->resume = execlists_resume;
4443 
4444 	engine->cops = &execlists_context_ops;
4445 	engine->request_alloc = execlists_request_alloc;
4446 
4447 	engine->emit_flush = gen8_emit_flush;
4448 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4449 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4450 	if (INTEL_GEN(engine->i915) >= 12)
4451 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4452 
4453 	engine->set_default_submission = intel_execlists_set_default_submission;
4454 
4455 	if (INTEL_GEN(engine->i915) < 11) {
4456 		engine->irq_enable = gen8_logical_ring_enable_irq;
4457 		engine->irq_disable = gen8_logical_ring_disable_irq;
4458 	} else {
4459 		/*
4460 		 * TODO: On Gen11 interrupt masks need to be clear
4461 		 * to allow C6 entry. Keep interrupts enabled at
4462 		 * and take the hit of generating extra interrupts
4463 		 * until a more refined solution exists.
4464 		 */
4465 	}
4466 }
4467 
4468 static inline void
4469 logical_ring_default_irqs(struct intel_engine_cs *engine)
4470 {
4471 	unsigned int shift = 0;
4472 
4473 	if (INTEL_GEN(engine->i915) < 11) {
4474 		const u8 irq_shifts[] = {
4475 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4476 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4477 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4478 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4479 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4480 		};
4481 
4482 		shift = irq_shifts[engine->id];
4483 	}
4484 
4485 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4486 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4487 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4488 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
4489 }
4490 
4491 static void rcs_submission_override(struct intel_engine_cs *engine)
4492 {
4493 	switch (INTEL_GEN(engine->i915)) {
4494 	case 12:
4495 		engine->emit_flush = gen12_emit_flush_render;
4496 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4497 		break;
4498 	case 11:
4499 		engine->emit_flush = gen11_emit_flush_render;
4500 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4501 		break;
4502 	default:
4503 		engine->emit_flush = gen8_emit_flush_render;
4504 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4505 		break;
4506 	}
4507 }
4508 
4509 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4510 {
4511 	struct intel_engine_execlists * const execlists = &engine->execlists;
4512 	struct drm_i915_private *i915 = engine->i915;
4513 	struct intel_uncore *uncore = engine->uncore;
4514 	u32 base = engine->mmio_base;
4515 
4516 	tasklet_init(&engine->execlists.tasklet,
4517 		     execlists_submission_tasklet, (unsigned long)engine);
4518 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4519 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4520 
4521 	logical_ring_default_vfuncs(engine);
4522 	logical_ring_default_irqs(engine);
4523 
4524 	if (engine->class == RENDER_CLASS)
4525 		rcs_submission_override(engine);
4526 
4527 	if (intel_init_workaround_bb(engine))
4528 		/*
4529 		 * We continue even if we fail to initialize WA batch
4530 		 * because we only expect rare glitches but nothing
4531 		 * critical to prevent us from using GPU
4532 		 */
4533 		DRM_ERROR("WA batch buffer initialization failed\n");
4534 
4535 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4536 		execlists->submit_reg = uncore->regs +
4537 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4538 		execlists->ctrl_reg = uncore->regs +
4539 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4540 	} else {
4541 		execlists->submit_reg = uncore->regs +
4542 			i915_mmio_reg_offset(RING_ELSP(base));
4543 	}
4544 
4545 	execlists->csb_status =
4546 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4547 
4548 	execlists->csb_write =
4549 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4550 
4551 	if (INTEL_GEN(i915) < 11)
4552 		execlists->csb_size = GEN8_CSB_ENTRIES;
4553 	else
4554 		execlists->csb_size = GEN11_CSB_ENTRIES;
4555 
4556 	if (INTEL_GEN(engine->i915) >= 11) {
4557 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
4558 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
4559 	}
4560 
4561 	reset_csb_pointers(engine);
4562 
4563 	/* Finally, take ownership and responsibility for cleanup! */
4564 	engine->release = execlists_release;
4565 
4566 	return 0;
4567 }
4568 
4569 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4570 {
4571 	u32 indirect_ctx_offset;
4572 
4573 	switch (INTEL_GEN(engine->i915)) {
4574 	default:
4575 		MISSING_CASE(INTEL_GEN(engine->i915));
4576 		/* fall through */
4577 	case 12:
4578 		indirect_ctx_offset =
4579 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4580 		break;
4581 	case 11:
4582 		indirect_ctx_offset =
4583 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4584 		break;
4585 	case 10:
4586 		indirect_ctx_offset =
4587 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4588 		break;
4589 	case 9:
4590 		indirect_ctx_offset =
4591 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4592 		break;
4593 	case 8:
4594 		indirect_ctx_offset =
4595 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4596 		break;
4597 	}
4598 
4599 	return indirect_ctx_offset;
4600 }
4601 
4602 
4603 static void init_common_reg_state(u32 * const regs,
4604 				  const struct intel_engine_cs *engine,
4605 				  const struct intel_ring *ring,
4606 				  bool inhibit)
4607 {
4608 	u32 ctl;
4609 
4610 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4611 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4612 	if (inhibit)
4613 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4614 	if (INTEL_GEN(engine->i915) < 11)
4615 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4616 					   CTX_CTRL_RS_CTX_ENABLE);
4617 	regs[CTX_CONTEXT_CONTROL] = ctl;
4618 
4619 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4620 }
4621 
4622 static void init_wa_bb_reg_state(u32 * const regs,
4623 				 const struct intel_engine_cs *engine,
4624 				 u32 pos_bb_per_ctx)
4625 {
4626 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4627 
4628 	if (wa_ctx->per_ctx.size) {
4629 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4630 
4631 		regs[pos_bb_per_ctx] =
4632 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4633 	}
4634 
4635 	if (wa_ctx->indirect_ctx.size) {
4636 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4637 
4638 		regs[pos_bb_per_ctx + 2] =
4639 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4640 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4641 
4642 		regs[pos_bb_per_ctx + 4] =
4643 			intel_lr_indirect_ctx_offset(engine) << 6;
4644 	}
4645 }
4646 
4647 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4648 {
4649 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4650 		/* 64b PPGTT (48bit canonical)
4651 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4652 		 * other PDP Descriptors are ignored.
4653 		 */
4654 		ASSIGN_CTX_PML4(ppgtt, regs);
4655 	} else {
4656 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4657 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4658 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4659 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4660 	}
4661 }
4662 
4663 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4664 {
4665 	if (i915_is_ggtt(vm))
4666 		return i915_vm_to_ggtt(vm)->alias;
4667 	else
4668 		return i915_vm_to_ppgtt(vm);
4669 }
4670 
4671 static void execlists_init_reg_state(u32 *regs,
4672 				     const struct intel_context *ce,
4673 				     const struct intel_engine_cs *engine,
4674 				     const struct intel_ring *ring,
4675 				     bool inhibit)
4676 {
4677 	/*
4678 	 * A context is actually a big batch buffer with several
4679 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4680 	 * values we are setting here are only for the first context restore:
4681 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4682 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4683 	 * we are not initializing here).
4684 	 *
4685 	 * Must keep consistent with virtual_update_register_offsets().
4686 	 */
4687 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4688 
4689 	init_common_reg_state(regs, engine, ring, inhibit);
4690 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4691 
4692 	init_wa_bb_reg_state(regs, engine,
4693 			     INTEL_GEN(engine->i915) >= 12 ?
4694 			     GEN12_CTX_BB_PER_CTX_PTR :
4695 			     CTX_BB_PER_CTX_PTR);
4696 
4697 	__reset_stop_ring(regs, engine);
4698 }
4699 
4700 static int
4701 populate_lr_context(struct intel_context *ce,
4702 		    struct drm_i915_gem_object *ctx_obj,
4703 		    struct intel_engine_cs *engine,
4704 		    struct intel_ring *ring)
4705 {
4706 	bool inhibit = true;
4707 	void *vaddr;
4708 	int ret;
4709 
4710 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4711 	if (IS_ERR(vaddr)) {
4712 		ret = PTR_ERR(vaddr);
4713 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4714 		return ret;
4715 	}
4716 
4717 	set_redzone(vaddr, engine);
4718 
4719 	if (engine->default_state) {
4720 		void *defaults;
4721 
4722 		defaults = i915_gem_object_pin_map(engine->default_state,
4723 						   I915_MAP_WB);
4724 		if (IS_ERR(defaults)) {
4725 			ret = PTR_ERR(defaults);
4726 			goto err_unpin_ctx;
4727 		}
4728 
4729 		memcpy(vaddr, defaults, engine->context_size);
4730 		i915_gem_object_unpin_map(engine->default_state);
4731 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4732 		inhibit = false;
4733 	}
4734 
4735 	/* Clear the ppHWSP (inc. per-context counters) */
4736 	memset(vaddr, 0, PAGE_SIZE);
4737 
4738 	/*
4739 	 * The second page of the context object contains some registers which
4740 	 * must be set up prior to the first execution.
4741 	 */
4742 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4743 				 ce, engine, ring, inhibit);
4744 
4745 	ret = 0;
4746 err_unpin_ctx:
4747 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4748 	i915_gem_object_unpin_map(ctx_obj);
4749 	return ret;
4750 }
4751 
4752 static int __execlists_context_alloc(struct intel_context *ce,
4753 				     struct intel_engine_cs *engine)
4754 {
4755 	struct drm_i915_gem_object *ctx_obj;
4756 	struct intel_ring *ring;
4757 	struct i915_vma *vma;
4758 	u32 context_size;
4759 	int ret;
4760 
4761 	GEM_BUG_ON(ce->state);
4762 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4763 
4764 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4765 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4766 
4767 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4768 	if (IS_ERR(ctx_obj))
4769 		return PTR_ERR(ctx_obj);
4770 
4771 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4772 	if (IS_ERR(vma)) {
4773 		ret = PTR_ERR(vma);
4774 		goto error_deref_obj;
4775 	}
4776 
4777 	if (!ce->timeline) {
4778 		struct intel_timeline *tl;
4779 		struct i915_vma *hwsp;
4780 
4781 		/*
4782 		 * Use the static global HWSP for the kernel context, and
4783 		 * a dynamically allocated cacheline for everyone else.
4784 		 */
4785 		hwsp = NULL;
4786 		if (unlikely(intel_context_is_barrier(ce)))
4787 			hwsp = engine->status_page.vma;
4788 
4789 		tl = intel_timeline_create(engine->gt, hwsp);
4790 		if (IS_ERR(tl)) {
4791 			ret = PTR_ERR(tl);
4792 			goto error_deref_obj;
4793 		}
4794 
4795 		ce->timeline = tl;
4796 	}
4797 
4798 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4799 	if (IS_ERR(ring)) {
4800 		ret = PTR_ERR(ring);
4801 		goto error_deref_obj;
4802 	}
4803 
4804 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4805 	if (ret) {
4806 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4807 		goto error_ring_free;
4808 	}
4809 
4810 	ce->ring = ring;
4811 	ce->state = vma;
4812 
4813 	return 0;
4814 
4815 error_ring_free:
4816 	intel_ring_put(ring);
4817 error_deref_obj:
4818 	i915_gem_object_put(ctx_obj);
4819 	return ret;
4820 }
4821 
4822 static struct list_head *virtual_queue(struct virtual_engine *ve)
4823 {
4824 	return &ve->base.execlists.default_priolist.requests[0];
4825 }
4826 
4827 static void virtual_context_destroy(struct kref *kref)
4828 {
4829 	struct virtual_engine *ve =
4830 		container_of(kref, typeof(*ve), context.ref);
4831 	unsigned int n;
4832 
4833 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4834 	GEM_BUG_ON(ve->request);
4835 	GEM_BUG_ON(ve->context.inflight);
4836 
4837 	for (n = 0; n < ve->num_siblings; n++) {
4838 		struct intel_engine_cs *sibling = ve->siblings[n];
4839 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4840 		unsigned long flags;
4841 
4842 		if (RB_EMPTY_NODE(node))
4843 			continue;
4844 
4845 		spin_lock_irqsave(&sibling->active.lock, flags);
4846 
4847 		/* Detachment is lazily performed in the execlists tasklet */
4848 		if (!RB_EMPTY_NODE(node))
4849 			rb_erase_cached(node, &sibling->execlists.virtual);
4850 
4851 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4852 	}
4853 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4854 
4855 	if (ve->context.state)
4856 		__execlists_context_fini(&ve->context);
4857 	intel_context_fini(&ve->context);
4858 
4859 	kfree(ve->bonds);
4860 	kfree(ve);
4861 }
4862 
4863 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4864 {
4865 	int swp;
4866 
4867 	/*
4868 	 * Pick a random sibling on starting to help spread the load around.
4869 	 *
4870 	 * New contexts are typically created with exactly the same order
4871 	 * of siblings, and often started in batches. Due to the way we iterate
4872 	 * the array of sibling when submitting requests, sibling[0] is
4873 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4874 	 * randomised across the system, we also help spread the load by the
4875 	 * first engine we inspect being different each time.
4876 	 *
4877 	 * NB This does not force us to execute on this engine, it will just
4878 	 * typically be the first we inspect for submission.
4879 	 */
4880 	swp = prandom_u32_max(ve->num_siblings);
4881 	if (!swp)
4882 		return;
4883 
4884 	swap(ve->siblings[swp], ve->siblings[0]);
4885 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4886 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4887 						ve->siblings[0]);
4888 }
4889 
4890 static int virtual_context_alloc(struct intel_context *ce)
4891 {
4892 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4893 
4894 	return __execlists_context_alloc(ce, ve->siblings[0]);
4895 }
4896 
4897 static int virtual_context_pin(struct intel_context *ce)
4898 {
4899 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4900 	int err;
4901 
4902 	/* Note: we must use a real engine class for setting up reg state */
4903 	err = __execlists_context_pin(ce, ve->siblings[0]);
4904 	if (err)
4905 		return err;
4906 
4907 	virtual_engine_initial_hint(ve);
4908 	return 0;
4909 }
4910 
4911 static void virtual_context_enter(struct intel_context *ce)
4912 {
4913 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4914 	unsigned int n;
4915 
4916 	for (n = 0; n < ve->num_siblings; n++)
4917 		intel_engine_pm_get(ve->siblings[n]);
4918 
4919 	intel_timeline_enter(ce->timeline);
4920 }
4921 
4922 static void virtual_context_exit(struct intel_context *ce)
4923 {
4924 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4925 	unsigned int n;
4926 
4927 	intel_timeline_exit(ce->timeline);
4928 
4929 	for (n = 0; n < ve->num_siblings; n++)
4930 		intel_engine_pm_put(ve->siblings[n]);
4931 }
4932 
4933 static const struct intel_context_ops virtual_context_ops = {
4934 	.alloc = virtual_context_alloc,
4935 
4936 	.pin = virtual_context_pin,
4937 	.unpin = execlists_context_unpin,
4938 
4939 	.enter = virtual_context_enter,
4940 	.exit = virtual_context_exit,
4941 
4942 	.destroy = virtual_context_destroy,
4943 };
4944 
4945 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4946 {
4947 	struct i915_request *rq;
4948 	intel_engine_mask_t mask;
4949 
4950 	rq = READ_ONCE(ve->request);
4951 	if (!rq)
4952 		return 0;
4953 
4954 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4955 	mask = rq->execution_mask;
4956 	if (unlikely(!mask)) {
4957 		/* Invalid selection, submit to a random engine in error */
4958 		i915_request_set_error_once(rq, -ENODEV);
4959 		mask = ve->siblings[0]->mask;
4960 	}
4961 
4962 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4963 		     rq->fence.context, rq->fence.seqno,
4964 		     mask, ve->base.execlists.queue_priority_hint);
4965 
4966 	return mask;
4967 }
4968 
4969 static void virtual_submission_tasklet(unsigned long data)
4970 {
4971 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4972 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
4973 	intel_engine_mask_t mask;
4974 	unsigned int n;
4975 
4976 	rcu_read_lock();
4977 	mask = virtual_submission_mask(ve);
4978 	rcu_read_unlock();
4979 	if (unlikely(!mask))
4980 		return;
4981 
4982 	local_irq_disable();
4983 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4984 		struct intel_engine_cs *sibling = ve->siblings[n];
4985 		struct ve_node * const node = &ve->nodes[sibling->id];
4986 		struct rb_node **parent, *rb;
4987 		bool first;
4988 
4989 		if (unlikely(!(mask & sibling->mask))) {
4990 			if (!RB_EMPTY_NODE(&node->rb)) {
4991 				spin_lock(&sibling->active.lock);
4992 				rb_erase_cached(&node->rb,
4993 						&sibling->execlists.virtual);
4994 				RB_CLEAR_NODE(&node->rb);
4995 				spin_unlock(&sibling->active.lock);
4996 			}
4997 			continue;
4998 		}
4999 
5000 		spin_lock(&sibling->active.lock);
5001 
5002 		if (!RB_EMPTY_NODE(&node->rb)) {
5003 			/*
5004 			 * Cheat and avoid rebalancing the tree if we can
5005 			 * reuse this node in situ.
5006 			 */
5007 			first = rb_first_cached(&sibling->execlists.virtual) ==
5008 				&node->rb;
5009 			if (prio == node->prio || (prio > node->prio && first))
5010 				goto submit_engine;
5011 
5012 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5013 		}
5014 
5015 		rb = NULL;
5016 		first = true;
5017 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5018 		while (*parent) {
5019 			struct ve_node *other;
5020 
5021 			rb = *parent;
5022 			other = rb_entry(rb, typeof(*other), rb);
5023 			if (prio > other->prio) {
5024 				parent = &rb->rb_left;
5025 			} else {
5026 				parent = &rb->rb_right;
5027 				first = false;
5028 			}
5029 		}
5030 
5031 		rb_link_node(&node->rb, rb, parent);
5032 		rb_insert_color_cached(&node->rb,
5033 				       &sibling->execlists.virtual,
5034 				       first);
5035 
5036 submit_engine:
5037 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5038 		node->prio = prio;
5039 		if (first && prio > sibling->execlists.queue_priority_hint) {
5040 			sibling->execlists.queue_priority_hint = prio;
5041 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5042 		}
5043 
5044 		spin_unlock(&sibling->active.lock);
5045 	}
5046 	local_irq_enable();
5047 }
5048 
5049 static void virtual_submit_request(struct i915_request *rq)
5050 {
5051 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5052 	struct i915_request *old;
5053 	unsigned long flags;
5054 
5055 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5056 		     rq->fence.context,
5057 		     rq->fence.seqno);
5058 
5059 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5060 
5061 	spin_lock_irqsave(&ve->base.active.lock, flags);
5062 
5063 	old = ve->request;
5064 	if (old) { /* background completion event from preempt-to-busy */
5065 		GEM_BUG_ON(!i915_request_completed(old));
5066 		__i915_request_submit(old);
5067 		i915_request_put(old);
5068 	}
5069 
5070 	if (i915_request_completed(rq)) {
5071 		__i915_request_submit(rq);
5072 
5073 		ve->base.execlists.queue_priority_hint = INT_MIN;
5074 		ve->request = NULL;
5075 	} else {
5076 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5077 		ve->request = i915_request_get(rq);
5078 
5079 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5080 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5081 
5082 		tasklet_schedule(&ve->base.execlists.tasklet);
5083 	}
5084 
5085 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5086 }
5087 
5088 static struct ve_bond *
5089 virtual_find_bond(struct virtual_engine *ve,
5090 		  const struct intel_engine_cs *master)
5091 {
5092 	int i;
5093 
5094 	for (i = 0; i < ve->num_bonds; i++) {
5095 		if (ve->bonds[i].master == master)
5096 			return &ve->bonds[i];
5097 	}
5098 
5099 	return NULL;
5100 }
5101 
5102 static void
5103 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5104 {
5105 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5106 	intel_engine_mask_t allowed, exec;
5107 	struct ve_bond *bond;
5108 
5109 	allowed = ~to_request(signal)->engine->mask;
5110 
5111 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5112 	if (bond)
5113 		allowed &= bond->sibling_mask;
5114 
5115 	/* Restrict the bonded request to run on only the available engines */
5116 	exec = READ_ONCE(rq->execution_mask);
5117 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5118 		;
5119 
5120 	/* Prevent the master from being re-run on the bonded engines */
5121 	to_request(signal)->execution_mask &= ~allowed;
5122 }
5123 
5124 struct intel_context *
5125 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5126 			       unsigned int count)
5127 {
5128 	struct virtual_engine *ve;
5129 	unsigned int n;
5130 	int err;
5131 
5132 	if (count == 0)
5133 		return ERR_PTR(-EINVAL);
5134 
5135 	if (count == 1)
5136 		return intel_context_create(siblings[0]);
5137 
5138 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5139 	if (!ve)
5140 		return ERR_PTR(-ENOMEM);
5141 
5142 	ve->base.i915 = siblings[0]->i915;
5143 	ve->base.gt = siblings[0]->gt;
5144 	ve->base.uncore = siblings[0]->uncore;
5145 	ve->base.id = -1;
5146 
5147 	ve->base.class = OTHER_CLASS;
5148 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5149 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5150 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5151 
5152 	/*
5153 	 * The decision on whether to submit a request using semaphores
5154 	 * depends on the saturated state of the engine. We only compute
5155 	 * this during HW submission of the request, and we need for this
5156 	 * state to be globally applied to all requests being submitted
5157 	 * to this engine. Virtual engines encompass more than one physical
5158 	 * engine and so we cannot accurately tell in advance if one of those
5159 	 * engines is already saturated and so cannot afford to use a semaphore
5160 	 * and be pessimized in priority for doing so -- if we are the only
5161 	 * context using semaphores after all other clients have stopped, we
5162 	 * will be starved on the saturated system. Such a global switch for
5163 	 * semaphores is less than ideal, but alas is the current compromise.
5164 	 */
5165 	ve->base.saturated = ALL_ENGINES;
5166 
5167 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5168 
5169 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5170 	intel_engine_init_breadcrumbs(&ve->base);
5171 	intel_engine_init_execlists(&ve->base);
5172 
5173 	ve->base.cops = &virtual_context_ops;
5174 	ve->base.request_alloc = execlists_request_alloc;
5175 
5176 	ve->base.schedule = i915_schedule;
5177 	ve->base.submit_request = virtual_submit_request;
5178 	ve->base.bond_execute = virtual_bond_execute;
5179 
5180 	INIT_LIST_HEAD(virtual_queue(ve));
5181 	ve->base.execlists.queue_priority_hint = INT_MIN;
5182 	tasklet_init(&ve->base.execlists.tasklet,
5183 		     virtual_submission_tasklet,
5184 		     (unsigned long)ve);
5185 
5186 	intel_context_init(&ve->context, &ve->base);
5187 
5188 	for (n = 0; n < count; n++) {
5189 		struct intel_engine_cs *sibling = siblings[n];
5190 
5191 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5192 		if (sibling->mask & ve->base.mask) {
5193 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5194 				  sibling->name);
5195 			err = -EINVAL;
5196 			goto err_put;
5197 		}
5198 
5199 		/*
5200 		 * The virtual engine implementation is tightly coupled to
5201 		 * the execlists backend -- we push out request directly
5202 		 * into a tree inside each physical engine. We could support
5203 		 * layering if we handle cloning of the requests and
5204 		 * submitting a copy into each backend.
5205 		 */
5206 		if (sibling->execlists.tasklet.func !=
5207 		    execlists_submission_tasklet) {
5208 			err = -ENODEV;
5209 			goto err_put;
5210 		}
5211 
5212 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5213 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5214 
5215 		ve->siblings[ve->num_siblings++] = sibling;
5216 		ve->base.mask |= sibling->mask;
5217 
5218 		/*
5219 		 * All physical engines must be compatible for their emission
5220 		 * functions (as we build the instructions during request
5221 		 * construction and do not alter them before submission
5222 		 * on the physical engine). We use the engine class as a guide
5223 		 * here, although that could be refined.
5224 		 */
5225 		if (ve->base.class != OTHER_CLASS) {
5226 			if (ve->base.class != sibling->class) {
5227 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5228 					  sibling->class, ve->base.class);
5229 				err = -EINVAL;
5230 				goto err_put;
5231 			}
5232 			continue;
5233 		}
5234 
5235 		ve->base.class = sibling->class;
5236 		ve->base.uabi_class = sibling->uabi_class;
5237 		snprintf(ve->base.name, sizeof(ve->base.name),
5238 			 "v%dx%d", ve->base.class, count);
5239 		ve->base.context_size = sibling->context_size;
5240 
5241 		ve->base.emit_bb_start = sibling->emit_bb_start;
5242 		ve->base.emit_flush = sibling->emit_flush;
5243 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5244 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5245 		ve->base.emit_fini_breadcrumb_dw =
5246 			sibling->emit_fini_breadcrumb_dw;
5247 
5248 		ve->base.flags = sibling->flags;
5249 	}
5250 
5251 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5252 
5253 	return &ve->context;
5254 
5255 err_put:
5256 	intel_context_put(&ve->context);
5257 	return ERR_PTR(err);
5258 }
5259 
5260 struct intel_context *
5261 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5262 {
5263 	struct virtual_engine *se = to_virtual_engine(src);
5264 	struct intel_context *dst;
5265 
5266 	dst = intel_execlists_create_virtual(se->siblings,
5267 					     se->num_siblings);
5268 	if (IS_ERR(dst))
5269 		return dst;
5270 
5271 	if (se->num_bonds) {
5272 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5273 
5274 		de->bonds = kmemdup(se->bonds,
5275 				    sizeof(*se->bonds) * se->num_bonds,
5276 				    GFP_KERNEL);
5277 		if (!de->bonds) {
5278 			intel_context_put(dst);
5279 			return ERR_PTR(-ENOMEM);
5280 		}
5281 
5282 		de->num_bonds = se->num_bonds;
5283 	}
5284 
5285 	return dst;
5286 }
5287 
5288 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5289 				     const struct intel_engine_cs *master,
5290 				     const struct intel_engine_cs *sibling)
5291 {
5292 	struct virtual_engine *ve = to_virtual_engine(engine);
5293 	struct ve_bond *bond;
5294 	int n;
5295 
5296 	/* Sanity check the sibling is part of the virtual engine */
5297 	for (n = 0; n < ve->num_siblings; n++)
5298 		if (sibling == ve->siblings[n])
5299 			break;
5300 	if (n == ve->num_siblings)
5301 		return -EINVAL;
5302 
5303 	bond = virtual_find_bond(ve, master);
5304 	if (bond) {
5305 		bond->sibling_mask |= sibling->mask;
5306 		return 0;
5307 	}
5308 
5309 	bond = krealloc(ve->bonds,
5310 			sizeof(*bond) * (ve->num_bonds + 1),
5311 			GFP_KERNEL);
5312 	if (!bond)
5313 		return -ENOMEM;
5314 
5315 	bond[ve->num_bonds].master = master;
5316 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5317 
5318 	ve->bonds = bond;
5319 	ve->num_bonds++;
5320 
5321 	return 0;
5322 }
5323 
5324 struct intel_engine_cs *
5325 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5326 				 unsigned int sibling)
5327 {
5328 	struct virtual_engine *ve = to_virtual_engine(engine);
5329 
5330 	if (sibling >= ve->num_siblings)
5331 		return NULL;
5332 
5333 	return ve->siblings[sibling];
5334 }
5335 
5336 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5337 				   struct drm_printer *m,
5338 				   void (*show_request)(struct drm_printer *m,
5339 							struct i915_request *rq,
5340 							const char *prefix),
5341 				   unsigned int max)
5342 {
5343 	const struct intel_engine_execlists *execlists = &engine->execlists;
5344 	struct i915_request *rq, *last;
5345 	unsigned long flags;
5346 	unsigned int count;
5347 	struct rb_node *rb;
5348 
5349 	spin_lock_irqsave(&engine->active.lock, flags);
5350 
5351 	last = NULL;
5352 	count = 0;
5353 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5354 		if (count++ < max - 1)
5355 			show_request(m, rq, "\t\tE ");
5356 		else
5357 			last = rq;
5358 	}
5359 	if (last) {
5360 		if (count > max) {
5361 			drm_printf(m,
5362 				   "\t\t...skipping %d executing requests...\n",
5363 				   count - max);
5364 		}
5365 		show_request(m, last, "\t\tE ");
5366 	}
5367 
5368 	if (execlists->switch_priority_hint != INT_MIN)
5369 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5370 			   READ_ONCE(execlists->switch_priority_hint));
5371 	if (execlists->queue_priority_hint != INT_MIN)
5372 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5373 			   READ_ONCE(execlists->queue_priority_hint));
5374 
5375 	last = NULL;
5376 	count = 0;
5377 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5378 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5379 		int i;
5380 
5381 		priolist_for_each_request(rq, p, i) {
5382 			if (count++ < max - 1)
5383 				show_request(m, rq, "\t\tQ ");
5384 			else
5385 				last = rq;
5386 		}
5387 	}
5388 	if (last) {
5389 		if (count > max) {
5390 			drm_printf(m,
5391 				   "\t\t...skipping %d queued requests...\n",
5392 				   count - max);
5393 		}
5394 		show_request(m, last, "\t\tQ ");
5395 	}
5396 
5397 	last = NULL;
5398 	count = 0;
5399 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5400 		struct virtual_engine *ve =
5401 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5402 		struct i915_request *rq = READ_ONCE(ve->request);
5403 
5404 		if (rq) {
5405 			if (count++ < max - 1)
5406 				show_request(m, rq, "\t\tV ");
5407 			else
5408 				last = rq;
5409 		}
5410 	}
5411 	if (last) {
5412 		if (count > max) {
5413 			drm_printf(m,
5414 				   "\t\t...skipping %d virtual requests...\n",
5415 				   count - max);
5416 		}
5417 		show_request(m, last, "\t\tV ");
5418 	}
5419 
5420 	spin_unlock_irqrestore(&engine->active.lock, flags);
5421 }
5422 
5423 void intel_lr_context_reset(struct intel_engine_cs *engine,
5424 			    struct intel_context *ce,
5425 			    u32 head,
5426 			    bool scrub)
5427 {
5428 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5429 
5430 	/*
5431 	 * We want a simple context + ring to execute the breadcrumb update.
5432 	 * We cannot rely on the context being intact across the GPU hang,
5433 	 * so clear it and rebuild just what we need for the breadcrumb.
5434 	 * All pending requests for this context will be zapped, and any
5435 	 * future request will be after userspace has had the opportunity
5436 	 * to recreate its own state.
5437 	 */
5438 	if (scrub)
5439 		restore_default_state(ce, engine);
5440 
5441 	/* Rerun the request; its payload has been neutered (if guilty). */
5442 	__execlists_update_reg_state(ce, engine, head);
5443 }
5444 
5445 bool
5446 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5447 {
5448 	return engine->set_default_submission ==
5449 	       intel_execlists_set_default_submission;
5450 }
5451 
5452 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5453 #include "selftest_lrc.c"
5454 #endif
5455