xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision e82c878d)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences from the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, such
41  * as PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use it. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
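 *
 * As an illustrative sketch only (this is not the driver's actual dequeue
 * code, see execlists_dequeue() further below), the pairing rule can be
 * pictured as::
 *
 *	port[0] = first request in the queue;
 *	while (next request exists && it belongs to the same context)
 *		fold it into port[0] (only the RING_TAIL advances);
 *	port[1] = the following request, or NULL if none remains;
 *	write both context descriptors to the ELSP;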
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
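
/*
 * Worked example (illustrative only): a CSB dword of 0x03ff8000 has
 * FIELD_GET(GENMASK(25, 15), dw) == 0x7ff == GEN12_IDLE_CTX_ID, so
 * GEN12_CSB_CTX_VALID() evaluates to false -- the dword names no context,
 * only the idle context ID.
 */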
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 
180 struct virtual_engine {
181 	struct intel_engine_cs base;
182 	struct intel_context context;
183 
184 	/*
185 	 * We allow only a single request through the virtual engine at a time
186 	 * (each request in the timeline waits for the completion fence of
187 	 * the previous before being submitted). By restricting ourselves to
188 	 * only submitting a single request, each request is placed onto a
189 	 * physical engine to maximise load spreading (by virtue of the late greedy
190 	 * scheduling -- each real engine takes the next available request
191 	 * upon idling).
192 	 */
193 	struct i915_request *request;
194 
195 	/*
196 	 * We keep a rbtree of available virtual engines inside each physical
197 	 * engine, sorted by priority. Here we preallocate the nodes we need
198 	 * for the virtual engine, indexed by physical_engine->id.
199 	 */
200 	struct ve_node {
201 		struct rb_node rb;
202 		int prio;
203 	} nodes[I915_NUM_ENGINES];
204 
205 	/*
206 	 * Keep track of bonded pairs -- restrictions upon our selection
207 	 * of physical engines any particular request may be submitted to.
208 	 * If we receive a submit-fence from a master engine, we will only
209 	 * use one of the physical engines in sibling_mask.
210 	 */
211 	struct ve_bond {
212 		const struct intel_engine_cs *master;
213 		intel_engine_mask_t sibling_mask;
214 	} *bonds;
215 	unsigned int num_bonds;
216 
217 	/* And finally, which physical engines this virtual engine maps onto. */
218 	unsigned int num_siblings;
219 	struct intel_engine_cs *siblings[0];
220 };
221 
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
225 	return container_of(engine, struct virtual_engine, base);
226 }
227 
228 static int __execlists_context_alloc(struct intel_context *ce,
229 				     struct intel_engine_cs *engine);
230 
231 static void execlists_init_reg_state(u32 *reg_state,
232 				     const struct intel_context *ce,
233 				     const struct intel_engine_cs *engine,
234 				     const struct intel_ring *ring,
235 				     bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238 			     const struct intel_engine_cs *engine,
239 			     u32 head);
240 
241 static void mark_eio(struct i915_request *rq)
242 {
243 	if (i915_request_completed(rq))
244 		return;
245 
246 	GEM_BUG_ON(i915_request_signaled(rq));
247 
248 	i915_request_set_error_once(rq, -EIO);
249 	i915_request_mark_complete(rq);
250 }
251 
252 static struct i915_request *
253 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
254 {
255 	struct i915_request *active = rq;
256 
257 	rcu_read_lock();
258 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
259 		if (i915_request_completed(rq))
260 			break;
261 
262 		active = rq;
263 	}
264 	rcu_read_unlock();
265 
266 	return active;
267 }
268 
269 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
270 {
271 	return (i915_ggtt_offset(engine->status_page.vma) +
272 		I915_GEM_HWS_PREEMPT_ADDR);
273 }
274 
275 static inline void
276 ring_set_paused(const struct intel_engine_cs *engine, int state)
277 {
278 	/*
279 	 * We inspect HWS_PREEMPT with a semaphore inside
280 	 * engine->emit_fini_breadcrumb. If the dword is true,
281 	 * the ring is paused as the semaphore will busywait
282 	 * until the dword is false.
283 	 */
284 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
285 	if (state)
286 		wmb();
287 }
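
/*
 * For reference, a minimal sketch of the busywait that pairs with
 * ring_set_paused() -- the real emitter lives in the fini-breadcrumb code
 * elsewhere in this file, and the exact flag combination here is an
 * assumption rather than a quote:
 *
 *	*cs++ = MI_SEMAPHORE_WAIT |
 *		MI_SEMAPHORE_GLOBAL_GTT |
 *		MI_SEMAPHORE_POLL |
 *		MI_SEMAPHORE_SAD_EQ_SDD;
 *	*cs++ = 0; (wait until HWS_PREEMPT reads back as 0, i.e. not paused)
 *	*cs++ = intel_hws_preempt_address(engine);
 *	*cs++ = 0;
 */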
288 
289 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
290 {
291 	return rb_entry(rb, struct i915_priolist, node);
292 }
293 
294 static inline int rq_prio(const struct i915_request *rq)
295 {
296 	return READ_ONCE(rq->sched.attr.priority);
297 }
298 
299 static int effective_prio(const struct i915_request *rq)
300 {
301 	int prio = rq_prio(rq);
302 
303 	/*
304 	 * If this request is special and must not be interrupted at any
305 	 * cost, so be it. Note we are only checking the most recent request
306 	 * in the context and so may be masking an earlier vip request. It
307 	 * is hoped that under the conditions where nopreempt is used, this
308 	 * will not matter (i.e. all requests to that context will be
309 	 * nopreempt for as long as desired).
310 	 */
311 	if (i915_request_has_nopreempt(rq))
312 		prio = I915_PRIORITY_UNPREEMPTABLE;
313 
314 	/*
315 	 * On unwinding the active request, we give it a priority bump
316 	 * if it has completed waiting on any semaphore. If we know that
317 	 * the request has already started, we can prevent an unwanted
318 	 * preempt-to-idle cycle by taking that into account now.
319 	 */
320 	if (__i915_request_has_started(rq))
321 		prio |= I915_PRIORITY_NOSEMAPHORE;
322 
323 	/* Restrict mere WAIT boosts from triggering preemption */
324 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
325 	return prio | __NO_PREEMPTION;
326 }
327 
328 static int queue_prio(const struct intel_engine_execlists *execlists)
329 {
330 	struct i915_priolist *p;
331 	struct rb_node *rb;
332 
333 	rb = rb_first_cached(&execlists->queue);
334 	if (!rb)
335 		return INT_MIN;
336 
337 	/*
338 	 * As the priolist[] are inverted, with the highest priority in [0],
339 	 * we have to flip the index value back into a priority.
340 	 */
341 	p = to_priolist(rb);
342 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
343 }
344 
345 static inline bool need_preempt(const struct intel_engine_cs *engine,
346 				const struct i915_request *rq,
347 				struct rb_node *rb)
348 {
349 	int last_prio;
350 
351 	if (!intel_engine_has_semaphores(engine))
352 		return false;
353 
354 	/*
355 	 * Check if the current priority hint merits a preemption attempt.
356 	 *
357 	 * We record the highest value priority we saw during rescheduling
358 	 * prior to this dequeue, therefore we know that if it is strictly
359 	 * less than the current tail of ELSP[0], we do not need to force
360 	 * a preempt-to-idle cycle.
361 	 *
362 	 * However, the priority hint is a mere hint that we may need to
363 	 * preempt. If that hint is stale or we may be trying to preempt
364 	 * ourselves, ignore the request.
365 	 *
366 	 * More naturally we would write
367 	 *      prio >= max(0, last);
368 	 * except that we wish to prevent triggering preemption at the same
369 	 * priority level: the task that is running should remain running
370 	 * to preserve FIFO ordering of dependencies.
371 	 */
372 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
373 	if (engine->execlists.queue_priority_hint <= last_prio)
374 		return false;
375 
376 	/*
377 	 * Check against the first request in ELSP[1], it will, thanks to the
378 	 * power of PI, be the highest priority of that context.
379 	 */
380 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
381 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
382 		return true;
383 
384 	if (rb) {
385 		struct virtual_engine *ve =
386 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
387 		bool preempt = false;
388 
389 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
390 			struct i915_request *next;
391 
392 			rcu_read_lock();
393 			next = READ_ONCE(ve->request);
394 			if (next)
395 				preempt = rq_prio(next) > last_prio;
396 			rcu_read_unlock();
397 		}
398 
399 		if (preempt)
400 			return preempt;
401 	}
402 
403 	/*
404 	 * If the inflight context did not trigger the preemption, then maybe
405 	 * it was the set of queued requests? Pick the highest priority in
406 	 * the queue (the first active priolist) and see if it deserves to be
407 	 * running instead of ELSP[0].
408 	 *
409 	 * The highest priority request in the queue cannot be in either
410 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it were the same
411 	 * context, its priority would not exceed ELSP[0] aka last_prio.
412 	 */
413 	return queue_prio(&engine->execlists) > last_prio;
414 }
415 
416 __maybe_unused static inline bool
417 assert_priority_queue(const struct i915_request *prev,
418 		      const struct i915_request *next)
419 {
420 	/*
421 	 * Without preemption, the prev may refer to the still active element
422 	 * which we refuse to let go.
423 	 *
424 	 * Even with preemption, there are times when we think it is better not
425 	 * to preempt and leave an ostensibly lower priority request in flight.
426 	 */
427 	if (i915_request_is_active(prev))
428 		return true;
429 
430 	return rq_prio(prev) >= rq_prio(next);
431 }
432 
433 /*
434  * The context descriptor encodes various attributes of a context,
435  * including its GTT address and some flags. Because it's fairly
436  * expensive to calculate, we'll just do it once and cache the result,
437  * which remains valid until the context is unpinned.
438  *
439  * This is what a descriptor looks like, from LSB to MSB::
440  *
441  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
442  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
443  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
444  *      bits 53-54:    mbz, reserved for use by hardware
445  *      bits 55-63:    group ID, currently unused and set to 0
446  *
447  * Starting from Gen11, the upper dword of the descriptor has a new format:
448  *
449  *      bits 32-36:    reserved
450  *      bits 37-47:    SW context ID
451  *      bits 48-53:    engine instance
452  *      bit 54:        mbz, reserved for use by hardware
453  *      bits 55-60:    SW counter
454  *      bits 61-63:    engine class
455  *
456  * engine info, SW context ID and SW counter need to form a unique number
457  * (Context ID) per lrc.
458  */
459 static u64
460 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
461 {
462 	u64 desc;
463 
464 	desc = INTEL_LEGACY_32B_CONTEXT;
465 	if (i915_vm_is_4lvl(ce->vm))
466 		desc = INTEL_LEGACY_64B_CONTEXT;
467 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
468 
469 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
470 	if (IS_GEN(engine->i915, 8))
471 		desc |= GEN8_CTX_L3LLC_COHERENT;
472 
473 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
474 	/*
475 	 * The following 32 bits are copied into the OA reports (dword 2).
476 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
477 	 * anything below.
478 	 */
479 	if (INTEL_GEN(engine->i915) >= 11) {
480 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
481 								/* bits 48-53 */
482 
483 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
484 								/* bits 61-63 */
485 	}
486 
487 	return desc;
488 }
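
/*
 * Note that the descriptor built here deliberately leaves the SW context ID
 * field (bits 37-47 in the Gen11+ layout above) zero: it is stamped at
 * submission time in __execlists_schedule_in(), which clears
 * GENMASK_ULL(47, 37) and then ORs in a tag, either the fixed ce->tag used
 * by OA and friends or a rolling per-engine value.
 */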
489 
490 static inline unsigned int dword_in_page(void *addr)
491 {
492 	return offset_in_page(addr) / sizeof(u32);
493 }
494 
495 static void set_offsets(u32 *regs,
496 			const u8 *data,
497 			const struct intel_engine_cs *engine,
498 			bool clear)
499 #define NOP(x) (BIT(7) | (x))
500 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
501 #define POSTED BIT(0)
502 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
503 #define REG16(x) \
504 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
505 	(((x) >> 2) & 0x7f)
506 #define END(x) 0, (x)
507 {
508 	const u32 base = engine->mmio_base;
509 
510 	while (*data) {
511 		u8 count, flags;
512 
513 		if (*data & BIT(7)) { /* skip */
514 			count = *data++ & ~BIT(7);
515 			if (clear)
516 				memset32(regs, MI_NOOP, count);
517 			regs += count;
518 			continue;
519 		}
520 
521 		count = *data & 0x3f;
522 		flags = *data >> 6;
523 		data++;
524 
525 		*regs = MI_LOAD_REGISTER_IMM(count);
526 		if (flags & POSTED)
527 			*regs |= MI_LRI_FORCE_POSTED;
528 		if (INTEL_GEN(engine->i915) >= 11)
529 			*regs |= MI_LRI_CS_MMIO;
530 		regs++;
531 
532 		GEM_BUG_ON(!count);
533 		do {
534 			u32 offset = 0;
535 			u8 v;
536 
537 			do {
538 				v = *data++;
539 				offset <<= 7;
540 				offset |= v & ~BIT(7);
541 			} while (v & BIT(7));
542 
543 			regs[0] = base + (offset << 2);
544 			if (clear)
545 				regs[1] = 0;
546 			regs += 2;
547 		} while (--count);
548 	}
549 
550 	if (clear) {
551 		u8 count = *++data;
552 
553 		/* Clear past the tail for HW access */
554 		GEM_BUG_ON(dword_in_page(regs) > count);
555 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
556 
557 		/* Close the batch; used mainly by live_lrc_layout() */
558 		*regs = MI_BATCH_BUFFER_END;
559 		if (INTEL_GEN(engine->i915) >= 10)
560 			*regs |= BIT(0);
561 	}
562 }
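
/*
 * Worked example (illustrative only) of the table encoding consumed above:
 * gen8_xcs_offsets[] below begins with
 *
 *	NOP(1), LRI(11, 0), REG16(0x244), REG(0x034), ...
 *
 * which assembles to the bytes 0x81, 0x0b, { 0x81, 0x11 }, 0x0d, ...: skip
 * one dword, emit MI_LOAD_REGISTER_IMM(11), then the register offsets. The
 * two-byte REG16(0x244) decodes as offset = (0x01 << 7) | 0x11 = 0x91, so
 * regs[0] = mmio_base + (0x91 << 2) = mmio_base + 0x244.
 */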
563 
564 static const u8 gen8_xcs_offsets[] = {
565 	NOP(1),
566 	LRI(11, 0),
567 	REG16(0x244),
568 	REG(0x034),
569 	REG(0x030),
570 	REG(0x038),
571 	REG(0x03c),
572 	REG(0x168),
573 	REG(0x140),
574 	REG(0x110),
575 	REG(0x11c),
576 	REG(0x114),
577 	REG(0x118),
578 
579 	NOP(9),
580 	LRI(9, 0),
581 	REG16(0x3a8),
582 	REG16(0x28c),
583 	REG16(0x288),
584 	REG16(0x284),
585 	REG16(0x280),
586 	REG16(0x27c),
587 	REG16(0x278),
588 	REG16(0x274),
589 	REG16(0x270),
590 
591 	NOP(13),
592 	LRI(2, 0),
593 	REG16(0x200),
594 	REG(0x028),
595 
596 	END(80)
597 };
598 
599 static const u8 gen9_xcs_offsets[] = {
600 	NOP(1),
601 	LRI(14, POSTED),
602 	REG16(0x244),
603 	REG(0x034),
604 	REG(0x030),
605 	REG(0x038),
606 	REG(0x03c),
607 	REG(0x168),
608 	REG(0x140),
609 	REG(0x110),
610 	REG(0x11c),
611 	REG(0x114),
612 	REG(0x118),
613 	REG(0x1c0),
614 	REG(0x1c4),
615 	REG(0x1c8),
616 
617 	NOP(3),
618 	LRI(9, POSTED),
619 	REG16(0x3a8),
620 	REG16(0x28c),
621 	REG16(0x288),
622 	REG16(0x284),
623 	REG16(0x280),
624 	REG16(0x27c),
625 	REG16(0x278),
626 	REG16(0x274),
627 	REG16(0x270),
628 
629 	NOP(13),
630 	LRI(1, POSTED),
631 	REG16(0x200),
632 
633 	NOP(13),
634 	LRI(44, POSTED),
635 	REG(0x028),
636 	REG(0x09c),
637 	REG(0x0c0),
638 	REG(0x178),
639 	REG(0x17c),
640 	REG16(0x358),
641 	REG(0x170),
642 	REG(0x150),
643 	REG(0x154),
644 	REG(0x158),
645 	REG16(0x41c),
646 	REG16(0x600),
647 	REG16(0x604),
648 	REG16(0x608),
649 	REG16(0x60c),
650 	REG16(0x610),
651 	REG16(0x614),
652 	REG16(0x618),
653 	REG16(0x61c),
654 	REG16(0x620),
655 	REG16(0x624),
656 	REG16(0x628),
657 	REG16(0x62c),
658 	REG16(0x630),
659 	REG16(0x634),
660 	REG16(0x638),
661 	REG16(0x63c),
662 	REG16(0x640),
663 	REG16(0x644),
664 	REG16(0x648),
665 	REG16(0x64c),
666 	REG16(0x650),
667 	REG16(0x654),
668 	REG16(0x658),
669 	REG16(0x65c),
670 	REG16(0x660),
671 	REG16(0x664),
672 	REG16(0x668),
673 	REG16(0x66c),
674 	REG16(0x670),
675 	REG16(0x674),
676 	REG16(0x678),
677 	REG16(0x67c),
678 	REG(0x068),
679 
680 	END(176)
681 };
682 
683 static const u8 gen12_xcs_offsets[] = {
684 	NOP(1),
685 	LRI(13, POSTED),
686 	REG16(0x244),
687 	REG(0x034),
688 	REG(0x030),
689 	REG(0x038),
690 	REG(0x03c),
691 	REG(0x168),
692 	REG(0x140),
693 	REG(0x110),
694 	REG(0x1c0),
695 	REG(0x1c4),
696 	REG(0x1c8),
697 	REG(0x180),
698 	REG16(0x2b4),
699 
700 	NOP(5),
701 	LRI(9, POSTED),
702 	REG16(0x3a8),
703 	REG16(0x28c),
704 	REG16(0x288),
705 	REG16(0x284),
706 	REG16(0x280),
707 	REG16(0x27c),
708 	REG16(0x278),
709 	REG16(0x274),
710 	REG16(0x270),
711 
712 	END(80)
713 };
714 
715 static const u8 gen8_rcs_offsets[] = {
716 	NOP(1),
717 	LRI(14, POSTED),
718 	REG16(0x244),
719 	REG(0x034),
720 	REG(0x030),
721 	REG(0x038),
722 	REG(0x03c),
723 	REG(0x168),
724 	REG(0x140),
725 	REG(0x110),
726 	REG(0x11c),
727 	REG(0x114),
728 	REG(0x118),
729 	REG(0x1c0),
730 	REG(0x1c4),
731 	REG(0x1c8),
732 
733 	NOP(3),
734 	LRI(9, POSTED),
735 	REG16(0x3a8),
736 	REG16(0x28c),
737 	REG16(0x288),
738 	REG16(0x284),
739 	REG16(0x280),
740 	REG16(0x27c),
741 	REG16(0x278),
742 	REG16(0x274),
743 	REG16(0x270),
744 
745 	NOP(13),
746 	LRI(1, 0),
747 	REG(0x0c8),
748 
749 	END(80)
750 };
751 
752 static const u8 gen9_rcs_offsets[] = {
753 	NOP(1),
754 	LRI(14, POSTED),
755 	REG16(0x244),
756 	REG(0x34),
757 	REG(0x30),
758 	REG(0x38),
759 	REG(0x3c),
760 	REG(0x168),
761 	REG(0x140),
762 	REG(0x110),
763 	REG(0x11c),
764 	REG(0x114),
765 	REG(0x118),
766 	REG(0x1c0),
767 	REG(0x1c4),
768 	REG(0x1c8),
769 
770 	NOP(3),
771 	LRI(9, POSTED),
772 	REG16(0x3a8),
773 	REG16(0x28c),
774 	REG16(0x288),
775 	REG16(0x284),
776 	REG16(0x280),
777 	REG16(0x27c),
778 	REG16(0x278),
779 	REG16(0x274),
780 	REG16(0x270),
781 
782 	NOP(13),
783 	LRI(1, 0),
784 	REG(0xc8),
785 
786 	NOP(13),
787 	LRI(44, POSTED),
788 	REG(0x28),
789 	REG(0x9c),
790 	REG(0xc0),
791 	REG(0x178),
792 	REG(0x17c),
793 	REG16(0x358),
794 	REG(0x170),
795 	REG(0x150),
796 	REG(0x154),
797 	REG(0x158),
798 	REG16(0x41c),
799 	REG16(0x600),
800 	REG16(0x604),
801 	REG16(0x608),
802 	REG16(0x60c),
803 	REG16(0x610),
804 	REG16(0x614),
805 	REG16(0x618),
806 	REG16(0x61c),
807 	REG16(0x620),
808 	REG16(0x624),
809 	REG16(0x628),
810 	REG16(0x62c),
811 	REG16(0x630),
812 	REG16(0x634),
813 	REG16(0x638),
814 	REG16(0x63c),
815 	REG16(0x640),
816 	REG16(0x644),
817 	REG16(0x648),
818 	REG16(0x64c),
819 	REG16(0x650),
820 	REG16(0x654),
821 	REG16(0x658),
822 	REG16(0x65c),
823 	REG16(0x660),
824 	REG16(0x664),
825 	REG16(0x668),
826 	REG16(0x66c),
827 	REG16(0x670),
828 	REG16(0x674),
829 	REG16(0x678),
830 	REG16(0x67c),
831 	REG(0x68),
832 
833 	END(176)
834 };
835 
836 static const u8 gen11_rcs_offsets[] = {
837 	NOP(1),
838 	LRI(15, POSTED),
839 	REG16(0x244),
840 	REG(0x034),
841 	REG(0x030),
842 	REG(0x038),
843 	REG(0x03c),
844 	REG(0x168),
845 	REG(0x140),
846 	REG(0x110),
847 	REG(0x11c),
848 	REG(0x114),
849 	REG(0x118),
850 	REG(0x1c0),
851 	REG(0x1c4),
852 	REG(0x1c8),
853 	REG(0x180),
854 
855 	NOP(1),
856 	LRI(9, POSTED),
857 	REG16(0x3a8),
858 	REG16(0x28c),
859 	REG16(0x288),
860 	REG16(0x284),
861 	REG16(0x280),
862 	REG16(0x27c),
863 	REG16(0x278),
864 	REG16(0x274),
865 	REG16(0x270),
866 
867 	LRI(1, POSTED),
868 	REG(0x1b0),
869 
870 	NOP(10),
871 	LRI(1, 0),
872 	REG(0x0c8),
873 
874 	END(80)
875 };
876 
877 static const u8 gen12_rcs_offsets[] = {
878 	NOP(1),
879 	LRI(13, POSTED),
880 	REG16(0x244),
881 	REG(0x034),
882 	REG(0x030),
883 	REG(0x038),
884 	REG(0x03c),
885 	REG(0x168),
886 	REG(0x140),
887 	REG(0x110),
888 	REG(0x1c0),
889 	REG(0x1c4),
890 	REG(0x1c8),
891 	REG(0x180),
892 	REG16(0x2b4),
893 
894 	NOP(5),
895 	LRI(9, POSTED),
896 	REG16(0x3a8),
897 	REG16(0x28c),
898 	REG16(0x288),
899 	REG16(0x284),
900 	REG16(0x280),
901 	REG16(0x27c),
902 	REG16(0x278),
903 	REG16(0x274),
904 	REG16(0x270),
905 
906 	LRI(3, POSTED),
907 	REG(0x1b0),
908 	REG16(0x5a8),
909 	REG16(0x5ac),
910 
911 	NOP(6),
912 	LRI(1, 0),
913 	REG(0x0c8),
914 
915 	END(80)
916 };
917 
918 #undef END
919 #undef REG16
920 #undef REG
921 #undef LRI
922 #undef NOP
923 
924 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
925 {
926 	/*
927 	 * The gen12+ lists only have the registers we program in the basic
928 	 * default state. We rely on the context image using relative
929 	 * addressing to automatically fix up the register state between the
930 	 * physical engines for the virtual engine.
931 	 */
932 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
933 		   !intel_engine_has_relative_mmio(engine));
934 
935 	if (engine->class == RENDER_CLASS) {
936 		if (INTEL_GEN(engine->i915) >= 12)
937 			return gen12_rcs_offsets;
938 		else if (INTEL_GEN(engine->i915) >= 11)
939 			return gen11_rcs_offsets;
940 		else if (INTEL_GEN(engine->i915) >= 9)
941 			return gen9_rcs_offsets;
942 		else
943 			return gen8_rcs_offsets;
944 	} else {
945 		if (INTEL_GEN(engine->i915) >= 12)
946 			return gen12_xcs_offsets;
947 		else if (INTEL_GEN(engine->i915) >= 9)
948 			return gen9_xcs_offsets;
949 		else
950 			return gen8_xcs_offsets;
951 	}
952 }
953 
954 static struct i915_request *
955 __unwind_incomplete_requests(struct intel_engine_cs *engine)
956 {
957 	struct i915_request *rq, *rn, *active = NULL;
958 	struct list_head *uninitialized_var(pl);
959 	int prio = I915_PRIORITY_INVALID;
960 
961 	lockdep_assert_held(&engine->active.lock);
962 
963 	list_for_each_entry_safe_reverse(rq, rn,
964 					 &engine->active.requests,
965 					 sched.link) {
966 		if (i915_request_completed(rq))
967 			continue; /* XXX */
968 
969 		__i915_request_unsubmit(rq);
970 
971 		/*
972 		 * Push the request back into the queue for later resubmission.
973 		 * If this request is not native to this physical engine (i.e.
974 		 * it came from a virtual source), push it back onto the virtual
975 		 * engine so that it can be moved across onto another physical
976 		 * engine as load dictates.
977 		 */
978 		if (likely(rq->execution_mask == engine->mask)) {
979 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
980 			if (rq_prio(rq) != prio) {
981 				prio = rq_prio(rq);
982 				pl = i915_sched_lookup_priolist(engine, prio);
983 			}
984 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
985 
986 			list_move(&rq->sched.link, pl);
987 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
988 
989 			active = rq;
990 		} else {
991 			struct intel_engine_cs *owner = rq->context->engine;
992 
993 			/*
994 			 * Decouple the virtual breadcrumb before moving it
995 			 * back to the virtual engine -- we don't want the
996 			 * request to complete in the background and try
997 			 * and cancel the breadcrumb on the virtual engine
998 			 * (instead of the old engine where it is linked)!
999 			 */
1000 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1001 				     &rq->fence.flags)) {
1002 				spin_lock_nested(&rq->lock,
1003 						 SINGLE_DEPTH_NESTING);
1004 				i915_request_cancel_breadcrumb(rq);
1005 				spin_unlock(&rq->lock);
1006 			}
1007 			WRITE_ONCE(rq->engine, owner);
1008 			owner->submit_request(rq);
1009 			active = NULL;
1010 		}
1011 	}
1012 
1013 	return active;
1014 }
1015 
1016 struct i915_request *
1017 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1018 {
1019 	struct intel_engine_cs *engine =
1020 		container_of(execlists, typeof(*engine), execlists);
1021 
1022 	return __unwind_incomplete_requests(engine);
1023 }
1024 
1025 static inline void
1026 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1027 {
1028 	/*
1029 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1030 	 * the compiler should eliminate this function as dead code.
1031 	 */
1032 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1033 		return;
1034 
1035 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1036 				   status, rq);
1037 }
1038 
1039 static void intel_engine_context_in(struct intel_engine_cs *engine)
1040 {
1041 	unsigned long flags;
1042 
1043 	if (READ_ONCE(engine->stats.enabled) == 0)
1044 		return;
1045 
1046 	write_seqlock_irqsave(&engine->stats.lock, flags);
1047 
1048 	if (engine->stats.enabled > 0) {
1049 		if (engine->stats.active++ == 0)
1050 			engine->stats.start = ktime_get();
1051 		GEM_BUG_ON(engine->stats.active == 0);
1052 	}
1053 
1054 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1055 }
1056 
1057 static void intel_engine_context_out(struct intel_engine_cs *engine)
1058 {
1059 	unsigned long flags;
1060 
1061 	if (READ_ONCE(engine->stats.enabled) == 0)
1062 		return;
1063 
1064 	write_seqlock_irqsave(&engine->stats.lock, flags);
1065 
1066 	if (engine->stats.enabled > 0) {
1067 		ktime_t last;
1068 
1069 		if (engine->stats.active && --engine->stats.active == 0) {
1070 			/*
1071 			 * Decrement the active context count and, in case the
1072 			 * GPU is now idle, add the elapsed time to the running total.
1073 			 */
1074 			last = ktime_sub(ktime_get(), engine->stats.start);
1075 
1076 			engine->stats.total = ktime_add(engine->stats.total,
1077 							last);
1078 		} else if (engine->stats.active == 0) {
1079 			/*
1080 			 * After turning on engine stats, context out might be
1081 			 * the first event in which case we account from the
1082 			 * time stats gathering was turned on.
1083 			 */
1084 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1085 
1086 			engine->stats.total = ktime_add(engine->stats.total,
1087 							last);
1088 		}
1089 	}
1090 
1091 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1092 }
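
/*
 * The seqlock above lets a reader sample a consistent busy-time snapshot
 * without stalling submission. A minimal read-side sketch (the in-tree
 * consumer is intel_engine_get_busy_time(); treat this as an illustration,
 * not a quote of it):
 *
 *	unsigned int seq;
 *	ktime_t total;
 *
 *	do {
 *		seq = read_seqbegin(&engine->stats.lock);
 *		total = engine->stats.total;
 *		if (engine->stats.active)
 *			total = ktime_add(total,
 *					  ktime_sub(ktime_get(),
 *						    engine->stats.start));
 *	} while (read_seqretry(&engine->stats.lock, seq));
 */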
1093 
1094 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1095 {
1096 	if (INTEL_GEN(engine->i915) >= 12)
1097 		return 0x60;
1098 	else if (INTEL_GEN(engine->i915) >= 9)
1099 		return 0x54;
1100 	else if (engine->class == RENDER_CLASS)
1101 		return 0x58;
1102 	else
1103 		return -1;
1104 }
1105 
1106 static void
1107 execlists_check_context(const struct intel_context *ce,
1108 			const struct intel_engine_cs *engine)
1109 {
1110 	const struct intel_ring *ring = ce->ring;
1111 	u32 *regs = ce->lrc_reg_state;
1112 	bool valid = true;
1113 	int x;
1114 
1115 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1116 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1117 		       engine->name,
1118 		       regs[CTX_RING_START],
1119 		       i915_ggtt_offset(ring->vma));
1120 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1121 		valid = false;
1122 	}
1123 
1124 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1125 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1126 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1127 		       engine->name,
1128 		       regs[CTX_RING_CTL],
1129 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1130 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1131 		valid = false;
1132 	}
1133 
1134 	x = lrc_ring_mi_mode(engine);
1135 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1136 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1137 		       engine->name, regs[x + 1]);
1138 		regs[x + 1] &= ~STOP_RING;
1139 		regs[x + 1] |= STOP_RING << 16;
1140 		valid = false;
1141 	}
1142 
1143 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1144 }
1145 
1146 static void restore_default_state(struct intel_context *ce,
1147 				  struct intel_engine_cs *engine)
1148 {
1149 	u32 *regs = ce->lrc_reg_state;
1150 
1151 	if (engine->pinned_default_state)
1152 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1153 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1154 		       engine->context_size - PAGE_SIZE);
1155 
1156 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1157 }
1158 
1159 static void reset_active(struct i915_request *rq,
1160 			 struct intel_engine_cs *engine)
1161 {
1162 	struct intel_context * const ce = rq->context;
1163 	u32 head;
1164 
1165 	/*
1166 	 * The executing context has been cancelled. We want to prevent
1167 	 * further execution along this context and propagate the error on
1168 	 * to anything depending on its results.
1169 	 *
1170 	 * In __i915_request_submit(), we apply the -EIO and remove the
1171 	 * requests' payloads for any banned requests. But first, we must
1172 	 * rewind the context back to the start of the incomplete request so
1173 	 * that we do not jump back into the middle of the batch.
1174 	 *
1175 	 * We preserve the breadcrumbs and semaphores of the incomplete
1176 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1177 	 * remain correctly ordered. And we defer to __i915_request_submit()
1178 	 * so that all asynchronous waits are correctly handled.
1179 	 */
1180 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1181 		     rq->fence.context, rq->fence.seqno);
1182 
1183 	/* On resubmission of the active request, payload will be scrubbed */
1184 	if (i915_request_completed(rq))
1185 		head = rq->tail;
1186 	else
1187 		head = active_request(ce->timeline, rq)->head;
1188 	head = intel_ring_wrap(ce->ring, head);
1189 
1190 	/* Scrub the context image to prevent replaying the previous batch */
1191 	restore_default_state(ce, engine);
1192 	__execlists_update_reg_state(ce, engine, head);
1193 
1194 	/* We've switched away, so this should be a no-op, but intent matters */
1195 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1196 }
1197 
1198 static u32 intel_context_get_runtime(const struct intel_context *ce)
1199 {
1200 	/*
1201 	 * We can use either ppHWSP[16] which is recorded before the context
1202 	 * switch (and so excludes the cost of context switches) or use the
1203 	 * value from the context image itself, which is saved/restored earlier
1204 	 * and so includes the cost of the save.
1205 	 */
1206 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1207 }
1208 
1209 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1210 {
1211 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1212 	ce->runtime.num_underflow += dt < 0;
1213 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1214 #endif
1215 }
1216 
1217 static void intel_context_update_runtime(struct intel_context *ce)
1218 {
1219 	u32 old;
1220 	s32 dt;
1221 
1222 	if (intel_context_is_barrier(ce))
1223 		return;
1224 
1225 	old = ce->runtime.last;
1226 	ce->runtime.last = intel_context_get_runtime(ce);
1227 	dt = ce->runtime.last - old;
1228 
1229 	if (unlikely(dt <= 0)) {
1230 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1231 			 old, ce->runtime.last, dt);
1232 		st_update_runtime_underflow(ce, dt);
1233 		return;
1234 	}
1235 
1236 	ewma_runtime_add(&ce->runtime.avg, dt);
1237 	ce->runtime.total += dt;
1238 }
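
/*
 * Note the unsigned 32b arithmetic above: CTX_TIMESTAMP wraps, but so long
 * as fewer than 2^31 ticks elapse between two samples the subtraction still
 * yields the right delta, e.g. old == 0xfffffff0 and ce->runtime.last ==
 * 0x00000010 gives dt == 0x20 (illustrative observation, not new behaviour).
 */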
1239 
1240 static inline struct intel_engine_cs *
1241 __execlists_schedule_in(struct i915_request *rq)
1242 {
1243 	struct intel_engine_cs * const engine = rq->engine;
1244 	struct intel_context * const ce = rq->context;
1245 
1246 	intel_context_get(ce);
1247 
1248 	if (unlikely(intel_context_is_banned(ce)))
1249 		reset_active(rq, engine);
1250 
1251 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1252 		execlists_check_context(ce, engine);
1253 
1254 	ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1255 	if (ce->tag) {
1256 		/* Use a fixed tag for OA and friends */
1257 		ce->lrc_desc |= (u64)ce->tag << 32;
1258 	} else {
1259 		/* We don't need a strict matching tag, just different values */
1260 		ce->lrc_desc |=
1261 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1262 			GEN11_SW_CTX_ID_SHIFT;
1263 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1264 	}
1265 
1266 	__intel_gt_pm_get(engine->gt);
1267 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1268 	intel_engine_context_in(engine);
1269 
1270 	return engine;
1271 }
1272 
1273 static inline struct i915_request *
1274 execlists_schedule_in(struct i915_request *rq, int idx)
1275 {
1276 	struct intel_context * const ce = rq->context;
1277 	struct intel_engine_cs *old;
1278 
1279 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1280 	trace_i915_request_in(rq, idx);
1281 
1282 	old = READ_ONCE(ce->inflight);
1283 	do {
1284 		if (!old) {
1285 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1286 			break;
1287 		}
1288 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1289 
1290 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1291 	return i915_request_get(rq);
1292 }
1293 
1294 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1295 {
1296 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1297 	struct i915_request *next = READ_ONCE(ve->request);
1298 
1299 	if (next && next->execution_mask & ~rq->execution_mask)
1300 		tasklet_schedule(&ve->base.execlists.tasklet);
1301 }
1302 
1303 static inline void
1304 __execlists_schedule_out(struct i915_request *rq,
1305 			 struct intel_engine_cs * const engine)
1306 {
1307 	struct intel_context * const ce = rq->context;
1308 
1309 	/*
1310 	 * NB process_csb() is not under the engine->active.lock and hence
1311 	 * schedule_out can race with schedule_in meaning that we should
1312 	 * refrain from doing non-trivial work here.
1313 	 */
1314 
1315 	/*
1316 	 * If we have just completed this context, the engine may now be
1317 	 * idle and we want to re-enter powersaving.
1318 	 */
1319 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1320 	    i915_request_completed(rq))
1321 		intel_engine_add_retire(engine, ce->timeline);
1322 
1323 	intel_context_update_runtime(ce);
1324 	intel_engine_context_out(engine);
1325 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1326 	intel_gt_pm_put_async(engine->gt);
1327 
1328 	/*
1329 	 * If this is part of a virtual engine, its next request may
1330 	 * have been blocked waiting for access to the active context.
1331 	 * We have to kick all the siblings again in case we need to
1332 	 * switch (e.g. the next request is not runnable on this
1333 	 * engine). Hopefully, we will already have submitted the next
1334 	 * request before the tasklet runs and do not need to rebuild
1335 	 * each virtual tree and kick everyone again.
1336 	 */
1337 	if (ce->engine != engine)
1338 		kick_siblings(rq, ce);
1339 
1340 	intel_context_put(ce);
1341 }
1342 
1343 static inline void
1344 execlists_schedule_out(struct i915_request *rq)
1345 {
1346 	struct intel_context * const ce = rq->context;
1347 	struct intel_engine_cs *cur, *old;
1348 
1349 	trace_i915_request_out(rq);
1350 
1351 	old = READ_ONCE(ce->inflight);
1352 	do
1353 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1354 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1355 	if (!cur)
1356 		__execlists_schedule_out(rq, old);
1357 
1358 	i915_request_put(rq);
1359 }
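
/*
 * A note on the ce->inflight dance above: the engine pointer is assumed to
 * be sufficiently aligned that its two low bits can be reused as a count of
 * additional ELSP submissions of the same context beyond the first
 * (ptr_inc() in execlists_schedule_in(), ptr_dec() here). Only when
 * ptr_unmask_bits(old, 2) reads zero is the context really leaving the
 * hardware, and only then does __execlists_schedule_out() run. A toy model
 * of the packing, with hypothetical names:
 *
 *	inflight = engine;                                 (first submission)
 *	inflight = (void *)((unsigned long)inflight + 1);  (lite restore)
 *	inflight = (void *)((unsigned long)inflight - 1);  (first schedule out)
 *	((unsigned long)inflight & 3) == 0                 (last one -> out)
 */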
1360 
1361 static u64 execlists_update_context(struct i915_request *rq)
1362 {
1363 	struct intel_context *ce = rq->context;
1364 	u64 desc = ce->lrc_desc;
1365 	u32 tail, prev;
1366 
1367 	/*
1368 	 * WaIdleLiteRestore:bdw,skl
1369 	 *
1370 	 * We should never submit the context with the same RING_TAIL twice
1371 	 * just in case we submit an empty ring, which confuses the HW.
1372 	 *
1373 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1374 	 * the normal request to be able to always advance the RING_TAIL on
1375 	 * subsequent resubmissions (for lite restore). Should that fail us,
1376 	 * and we try and submit the same tail again, force the context
1377 	 * reload.
1378 	 *
1379 	 * If we need to return to a preempted context, we need to skip the
1380 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1381 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1382 	 * an earlier request.
1383 	 */
1384 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1385 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
1386 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1387 		desc |= CTX_DESC_FORCE_RESTORE;
1388 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1389 	rq->tail = rq->wa_tail;
1390 
1391 	/*
1392 	 * Make sure the context image is complete before we submit it to HW.
1393 	 *
1394 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1395 	 * an uncached write such as our mmio register access, the empirical
1396 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1397 	 * may not be visible to the HW prior to the completion of the UC
1398 	 * register write and that we may begin execution from the context
1399 	 * before its image is complete leading to invalid PD chasing.
1400 	 */
1401 	wmb();
1402 
1403 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1404 	return desc;
1405 }
1406 
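/*
 * Two submission flavours follow: with an ELSQ (execlists->ctrl_reg set,
 * Gen11+) each port is a lower/upper dword pair in the submit queue, which
 * is only latched once execlists_submit_ports() writes EL_CTRL_LOAD to the
 * control register; on the legacy ELSP the upper dword is written before the
 * lower one and, as an assumption about the HW interface, it is the final
 * lower-dword write that triggers the actual submission.
 */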
1407 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1408 {
1409 	if (execlists->ctrl_reg) {
1410 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1411 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1412 	} else {
1413 		writel(upper_32_bits(desc), execlists->submit_reg);
1414 		writel(lower_32_bits(desc), execlists->submit_reg);
1415 	}
1416 }
1417 
1418 static __maybe_unused void
1419 trace_ports(const struct intel_engine_execlists *execlists,
1420 	    const char *msg,
1421 	    struct i915_request * const *ports)
1422 {
1423 	const struct intel_engine_cs *engine =
1424 		container_of(execlists, typeof(*engine), execlists);
1425 
1426 	if (!ports[0])
1427 		return;
1428 
1429 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1430 		     ports[0]->fence.context,
1431 		     ports[0]->fence.seqno,
1432 		     i915_request_completed(ports[0]) ? "!" :
1433 		     i915_request_started(ports[0]) ? "*" :
1434 		     "",
1435 		     ports[1] ? ports[1]->fence.context : 0,
1436 		     ports[1] ? ports[1]->fence.seqno : 0);
1437 }
1438 
1439 static inline bool
1440 reset_in_progress(const struct intel_engine_execlists *execlists)
1441 {
1442 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1443 }
1444 
1445 static __maybe_unused bool
1446 assert_pending_valid(const struct intel_engine_execlists *execlists,
1447 		     const char *msg)
1448 {
1449 	struct i915_request * const *port, *rq;
1450 	struct intel_context *ce = NULL;
1451 	bool sentinel = false;
1452 
1453 	trace_ports(execlists, msg, execlists->pending);
1454 
1455 	/* We may be messing around with the lists during reset, lalala */
1456 	if (reset_in_progress(execlists))
1457 		return true;
1458 
1459 	if (!execlists->pending[0]) {
1460 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1461 		return false;
1462 	}
1463 
1464 	if (execlists->pending[execlists_num_ports(execlists)]) {
1465 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1466 			      execlists_num_ports(execlists));
1467 		return false;
1468 	}
1469 
1470 	for (port = execlists->pending; (rq = *port); port++) {
1471 		unsigned long flags;
1472 		bool ok = true;
1473 
1474 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1475 		GEM_BUG_ON(!i915_request_is_active(rq));
1476 
1477 		if (ce == rq->context) {
1478 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1479 				      ce->timeline->fence_context,
1480 				      port - execlists->pending);
1481 			return false;
1482 		}
1483 		ce = rq->context;
1484 
1485 		/*
1486 		 * Sentinels are supposed to be lonely so they flush the
1487 		 * current execution off the HW. Check that they are the
1488 		 * only request in the pending submission.
1489 		 */
1490 		if (sentinel) {
1491 			GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1492 				      ce->timeline->fence_context,
1493 				      port - execlists->pending);
1494 			return false;
1495 		}
1496 
1497 		sentinel = i915_request_has_sentinel(rq);
1498 		if (sentinel && port != execlists->pending) {
1499 			GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1500 				      ce->timeline->fence_context,
1501 				      port - execlists->pending);
1502 			return false;
1503 		}
1504 
1505 		/* Hold tightly onto the lock to prevent concurrent retires! */
1506 		if (!spin_trylock_irqsave(&rq->lock, flags))
1507 			continue;
1508 
1509 		if (i915_request_completed(rq))
1510 			goto unlock;
1511 
1512 		if (i915_active_is_idle(&ce->active) &&
1513 		    !intel_context_is_barrier(ce)) {
1514 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1515 				      ce->timeline->fence_context,
1516 				      port - execlists->pending);
1517 			ok = false;
1518 			goto unlock;
1519 		}
1520 
1521 		if (!i915_vma_is_pinned(ce->state)) {
1522 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1523 				      ce->timeline->fence_context,
1524 				      port - execlists->pending);
1525 			ok = false;
1526 			goto unlock;
1527 		}
1528 
1529 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1530 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1531 				      ce->timeline->fence_context,
1532 				      port - execlists->pending);
1533 			ok = false;
1534 			goto unlock;
1535 		}
1536 
1537 unlock:
1538 		spin_unlock_irqrestore(&rq->lock, flags);
1539 		if (!ok)
1540 			return false;
1541 	}
1542 
1543 	return ce;
1544 }
1545 
1546 static void execlists_submit_ports(struct intel_engine_cs *engine)
1547 {
1548 	struct intel_engine_execlists *execlists = &engine->execlists;
1549 	unsigned int n;
1550 
1551 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1552 
1553 	/*
1554 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1555 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1556 	 * not be relinquished until the device is idle (see
1557 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1558 	 * that all ELSP are drained i.e. we have processed the CSB,
1559 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1560 	 */
1561 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1562 
1563 	/*
1564 	 * ELSQ note: the submit queue is not cleared after being submitted
1565 	 * to the HW so we need to make sure we always clean it up. This is
1566 	 * currently ensured by the fact that we always write the same number
1567 	 * of elsq entries, keep this in mind before changing the loop below.
1568 	 */
1569 	for (n = execlists_num_ports(execlists); n--; ) {
1570 		struct i915_request *rq = execlists->pending[n];
1571 
1572 		write_desc(execlists,
1573 			   rq ? execlists_update_context(rq) : 0,
1574 			   n);
1575 	}
1576 
1577 	/* we need to manually load the submit queue */
1578 	if (execlists->ctrl_reg)
1579 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1580 }
1581 
1582 static bool ctx_single_port_submission(const struct intel_context *ce)
1583 {
1584 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1585 		intel_context_force_single_submission(ce));
1586 }
1587 
1588 static bool can_merge_ctx(const struct intel_context *prev,
1589 			  const struct intel_context *next)
1590 {
1591 	if (prev != next)
1592 		return false;
1593 
1594 	if (ctx_single_port_submission(prev))
1595 		return false;
1596 
1597 	return true;
1598 }
1599 
1600 static unsigned long i915_request_flags(const struct i915_request *rq)
1601 {
1602 	return READ_ONCE(rq->fence.flags);
1603 }
1604 
1605 static bool can_merge_rq(const struct i915_request *prev,
1606 			 const struct i915_request *next)
1607 {
1608 	GEM_BUG_ON(prev == next);
1609 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1610 
1611 	/*
1612 	 * We do not submit known completed requests. Therefore if the next
1613 	 * request is already completed, we can pretend to merge it in
1614 	 * with the previous context (and we will skip updating the ELSP
1615 	 * and tracking). Thus hopefully keeping the ELSP full with active
1616 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1617 	 * us.
1618 	 */
1619 	if (i915_request_completed(next))
1620 		return true;
1621 
1622 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1623 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1624 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1625 		return false;
1626 
1627 	if (!can_merge_ctx(prev->context, next->context))
1628 		return false;
1629 
1630 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1631 	return true;
1632 }
1633 
1634 static void virtual_update_register_offsets(u32 *regs,
1635 					    struct intel_engine_cs *engine)
1636 {
1637 	set_offsets(regs, reg_offsets(engine), engine, false);
1638 }
1639 
1640 static bool virtual_matches(const struct virtual_engine *ve,
1641 			    const struct i915_request *rq,
1642 			    const struct intel_engine_cs *engine)
1643 {
1644 	const struct intel_engine_cs *inflight;
1645 
1646 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1647 		return false;
1648 
1649 	/*
1650 	 * We track when the HW has completed saving the context image
1651 	 * (i.e. when we have seen the final CS event switching out of
1652 	 * the context) and must not overwrite the context image before
1653 	 * then. This restricts us to only using the active engine
1654 	 * while the previous virtualized request is inflight (so
1655 	 * we reuse the register offsets). This is a very small
1656 		 * hysteresis on the greedy selection algorithm.
1657 	 */
1658 	inflight = intel_context_inflight(&ve->context);
1659 	if (inflight && inflight != engine)
1660 		return false;
1661 
1662 	return true;
1663 }
1664 
1665 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1666 				     struct i915_request *rq)
1667 {
1668 	struct intel_engine_cs *old = ve->siblings[0];
1669 
1670 	/* All unattached (rq->engine == old) must already be completed */
1671 
1672 	spin_lock(&old->breadcrumbs.irq_lock);
1673 	if (!list_empty(&ve->context.signal_link)) {
1674 		list_del_init(&ve->context.signal_link);
1675 
1676 		/*
1677 		 * We cannot acquire the new engine->breadcrumbs.irq_lock
1678 		 * (as we are holding a breadcrumbs.irq_lock already),
1679 		 * so attach this request to the signaler on submission.
1680 		 * The queued irq_work will occur when we finally drop
1681 		 * the engine->active.lock after dequeue.
1682 		 */
1683 		set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1684 
1685 		/* Also transfer the pending irq_work for the old breadcrumb. */
1686 		intel_engine_signal_breadcrumbs(rq->engine);
1687 	}
1688 	spin_unlock(&old->breadcrumbs.irq_lock);
1689 }
1690 
1691 #define for_each_waiter(p__, rq__) \
1692 	list_for_each_entry_lockless(p__, \
1693 				     &(rq__)->sched.waiters_list, \
1694 				     wait_link)
1695 
1696 #define for_each_signaler(p__, rq__) \
1697 	list_for_each_entry_rcu(p__, \
1698 				&(rq__)->sched.signalers_list, \
1699 				signal_link)
1700 
1701 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1702 {
1703 	LIST_HEAD(list);
1704 
1705 	/*
1706 	 * We want to move the interrupted request to the back of
1707 	 * the round-robin list (i.e. its priority level), but
1708 	 * in doing so, we must then move all requests that were in
1709 	 * flight and were waiting for the interrupted request to
1710 	 * be run after it again.
1711 	 */
1712 	do {
1713 		struct i915_dependency *p;
1714 
1715 		GEM_BUG_ON(i915_request_is_active(rq));
1716 		list_move_tail(&rq->sched.link, pl);
1717 
1718 		for_each_waiter(p, rq) {
1719 			struct i915_request *w =
1720 				container_of(p->waiter, typeof(*w), sched);
1721 
1722 			/* Leave semaphores spinning on the other engines */
1723 			if (w->engine != rq->engine)
1724 				continue;
1725 
1726 			/* No waiter should start before its signaler */
1727 			GEM_BUG_ON(i915_request_started(w) &&
1728 				   !i915_request_completed(rq));
1729 
1730 			GEM_BUG_ON(i915_request_is_active(w));
1731 			if (!i915_request_is_ready(w))
1732 				continue;
1733 
1734 			if (rq_prio(w) < rq_prio(rq))
1735 				continue;
1736 
1737 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1738 			list_move_tail(&w->sched.link, &list);
1739 		}
1740 
1741 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1742 	} while (rq);
1743 }
1744 
1745 static void defer_active(struct intel_engine_cs *engine)
1746 {
1747 	struct i915_request *rq;
1748 
1749 	rq = __unwind_incomplete_requests(engine);
1750 	if (!rq)
1751 		return;
1752 
1753 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1754 }
1755 
1756 static bool
1757 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1758 {
1759 	int hint;
1760 
1761 	if (!intel_engine_has_timeslices(engine))
1762 		return false;
1763 
1764 	hint = engine->execlists.queue_priority_hint;
1765 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1766 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1767 
1768 	return hint >= effective_prio(rq);
1769 }
1770 
1771 static int
1772 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1773 {
1774 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1775 		return INT_MIN;
1776 
1777 	return rq_prio(list_next_entry(rq, sched.link));
1778 }
1779 
1780 static inline unsigned long
1781 timeslice(const struct intel_engine_cs *engine)
1782 {
1783 	return READ_ONCE(engine->props.timeslice_duration_ms);
1784 }
1785 
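/*
 * Return the timeslice to program for the currently active request:
 * 0 (disable the timer) if there is no active request, it has already
 * completed, or the waiting work is of lower priority than the active
 * request.
 */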
1786 static unsigned long
1787 active_timeslice(const struct intel_engine_cs *engine)
1788 {
1789 	const struct intel_engine_execlists *execlists = &engine->execlists;
1790 	const struct i915_request *rq = *execlists->active;
1791 
1792 	if (!rq || i915_request_completed(rq))
1793 		return 0;
1794 
1795 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1796 		return 0;
1797 
1798 	return timeslice(engine);
1799 }
1800 
1801 static void set_timeslice(struct intel_engine_cs *engine)
1802 {
1803 	if (!intel_engine_has_timeslices(engine))
1804 		return;
1805 
1806 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1807 }
1808 
1809 static void start_timeslice(struct intel_engine_cs *engine)
1810 {
1811 	struct intel_engine_execlists *execlists = &engine->execlists;
1812 	int prio = queue_prio(execlists);
1813 
1814 	WRITE_ONCE(execlists->switch_priority_hint, prio);
1815 	if (prio == INT_MIN)
1816 		return;
1817 
1818 	if (timer_pending(&execlists->timer))
1819 		return;
1820 
1821 	set_timer_ms(&execlists->timer, timeslice(engine));
1822 }
1823 
1824 static void record_preemption(struct intel_engine_execlists *execlists)
1825 {
1826 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1827 }
1828 
1829 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1830 					    const struct i915_request *rq)
1831 {
1832 	if (!rq)
1833 		return 0;
1834 
1835 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1836 	if (unlikely(intel_context_is_banned(rq->context)))
1837 		return 1;
1838 
1839 	return READ_ONCE(engine->props.preempt_timeout_ms);
1840 }
1841 
1842 static void set_preempt_timeout(struct intel_engine_cs *engine,
1843 				const struct i915_request *rq)
1844 {
1845 	if (!intel_engine_has_preempt_reset(engine))
1846 		return;
1847 
1848 	set_timer_ms(&engine->execlists.preempt,
1849 		     active_preempt_timeout(engine, rq));
1850 }
1851 
1852 static inline void clear_ports(struct i915_request **ports, int count)
1853 {
1854 	memset_p((void **)ports, NULL, count);
1855 }
1856 
1857 static void execlists_dequeue(struct intel_engine_cs *engine)
1858 {
1859 	struct intel_engine_execlists * const execlists = &engine->execlists;
1860 	struct i915_request **port = execlists->pending;
1861 	struct i915_request ** const last_port = port + execlists->port_mask;
1862 	struct i915_request * const *active;
1863 	struct i915_request *last;
1864 	struct rb_node *rb;
1865 	bool submit = false;
1866 
1867 	/*
1868 	 * Hardware submission is through 2 ports. Conceptually each port
1869 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1870 	 * static for a context, and unique to each, so we only execute
1871 	 * requests belonging to a single context from each ring. RING_HEAD
1872 	 * is maintained by the CS in the context image; it marks the place
1873 	 * where it got up to last time, and through RING_TAIL we tell the CS
1874 	 * where we want to execute up to this time.
1875 	 *
1876 	 * In this list the requests are in order of execution. Consecutive
1877 	 * requests from the same context are adjacent in the ringbuffer. We
1878 	 * can combine these requests into a single RING_TAIL update:
1879 	 *
1880 	 *              RING_HEAD...req1...req2
1881 	 *                                    ^- RING_TAIL
1882 	 * since to execute req2 the CS must first execute req1.
1883 	 *
1884 	 * Our goal then is to point each port to the end of a consecutive
1885 	 * sequence of requests, as that is the optimal (fewest wake ups
1886 	 * and context switches) submission.
1887 	 */
1888 
1889 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1890 		struct virtual_engine *ve =
1891 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1892 		struct i915_request *rq = READ_ONCE(ve->request);
1893 
1894 		if (!rq) { /* lazily cleanup after another engine handled rq */
1895 			rb_erase_cached(rb, &execlists->virtual);
1896 			RB_CLEAR_NODE(rb);
1897 			rb = rb_first_cached(&execlists->virtual);
1898 			continue;
1899 		}
1900 
1901 		if (!virtual_matches(ve, rq, engine)) {
1902 			rb = rb_next(rb);
1903 			continue;
1904 		}
1905 
1906 		break;
1907 	}
1908 
1909 	/*
1910 	 * If the queue is higher priority than the last
1911 	 * request in the currently active context, submit afresh.
1912 	 * We will resubmit again afterwards in case we need to split
1913 	 * the active context to interject the preemption request,
1914 	 * i.e. we will retrigger preemption following the ack in case
1915 	 * of trouble.
1916 	 */
1917 	active = READ_ONCE(execlists->active);
1918 	while ((last = *active) && i915_request_completed(last))
1919 		active++;
1920 
1921 	if (last) {
1922 		if (need_preempt(engine, last, rb)) {
1923 			ENGINE_TRACE(engine,
1924 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1925 				     last->fence.context,
1926 				     last->fence.seqno,
1927 				     last->sched.attr.priority,
1928 				     execlists->queue_priority_hint);
1929 			record_preemption(execlists);
1930 
1931 			/*
1932 			 * Don't let the RING_HEAD advance past the breadcrumb
1933 			 * as we unwind (and until we resubmit) so that we do
1934 			 * not accidentally tell it to go backwards.
1935 			 */
1936 			ring_set_paused(engine, 1);
1937 
1938 			/*
1939 			 * Note that we have not stopped the GPU at this point,
1940 			 * so we are unwinding the incomplete requests as they
1941 			 * remain inflight and so by the time we do complete
1942 			 * the preemption, some of the unwound requests may
1943 			 * complete!
1944 			 */
1945 			__unwind_incomplete_requests(engine);
1946 
1947 			last = NULL;
1948 		} else if (need_timeslice(engine, last) &&
1949 			   timer_expired(&engine->execlists.timer)) {
1950 			ENGINE_TRACE(engine,
1951 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
1952 				     last->fence.context,
1953 				     last->fence.seqno,
1954 				     last->sched.attr.priority,
1955 				     execlists->queue_priority_hint);
1956 
1957 			ring_set_paused(engine, 1);
1958 			defer_active(engine);
1959 
1960 			/*
1961 			 * Unlike for preemption, if we rewind and continue
1962 			 * executing the same context as previously active,
1963 			 * the order of execution will remain the same and
1964 			 * the tail will only advance. We do not need to
1965 			 * force a full context restore, as a lite-restore
1966 			 * is sufficient to resample the monotonic TAIL.
1967 			 *
1968 			 * If we switch to any other context, similarly we
1969 			 * will not rewind TAIL of current context, and
1970 			 * normal save/restore will preserve state and allow
1971 			 * us to later continue executing the same request.
1972 			 */
1973 			last = NULL;
1974 		} else {
1975 			/*
1976 			 * Otherwise if we already have a request pending
1977 			 * for execution after the current one, we can
1978 			 * just wait until the next CS event before
1979 			 * queuing more. In either case we will force a
1980 			 * lite-restore preemption event, but if we wait
1981 			 * we hopefully coalesce several updates into a single
1982 			 * submission.
1983 			 */
1984 			if (!list_is_last(&last->sched.link,
1985 					  &engine->active.requests)) {
1986 				/*
1987 				 * Even if ELSP[1] is occupied and not worthy
1988 				 * of timeslices, our queue might be.
1989 				 */
1990 				start_timeslice(engine);
1991 				return;
1992 			}
1993 		}
1994 	}
1995 
1996 	while (rb) { /* XXX virtual is always taking precedence */
1997 		struct virtual_engine *ve =
1998 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1999 		struct i915_request *rq;
2000 
2001 		spin_lock(&ve->base.active.lock);
2002 
2003 		rq = ve->request;
2004 		if (unlikely(!rq)) { /* lost the race to a sibling */
2005 			spin_unlock(&ve->base.active.lock);
2006 			rb_erase_cached(rb, &execlists->virtual);
2007 			RB_CLEAR_NODE(rb);
2008 			rb = rb_first_cached(&execlists->virtual);
2009 			continue;
2010 		}
2011 
2012 		GEM_BUG_ON(rq != ve->request);
2013 		GEM_BUG_ON(rq->engine != &ve->base);
2014 		GEM_BUG_ON(rq->context != &ve->context);
2015 
2016 		if (rq_prio(rq) >= queue_prio(execlists)) {
2017 			if (!virtual_matches(ve, rq, engine)) {
2018 				spin_unlock(&ve->base.active.lock);
2019 				rb = rb_next(rb);
2020 				continue;
2021 			}
2022 
2023 			if (last && !can_merge_rq(last, rq)) {
2024 				spin_unlock(&ve->base.active.lock);
2025 				start_timeslice(engine);
2026 				return; /* leave this for another sibling */
2027 			}
2028 
2029 			ENGINE_TRACE(engine,
2030 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2031 				     rq->fence.context,
2032 				     rq->fence.seqno,
2033 				     i915_request_completed(rq) ? "!" :
2034 				     i915_request_started(rq) ? "*" :
2035 				     "",
2036 				     yesno(engine != ve->siblings[0]));
2037 
2038 			WRITE_ONCE(ve->request, NULL);
2039 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2040 				   INT_MIN);
2041 			rb_erase_cached(rb, &execlists->virtual);
2042 			RB_CLEAR_NODE(rb);
2043 
2044 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2045 			WRITE_ONCE(rq->engine, engine);
2046 
2047 			if (engine != ve->siblings[0]) {
2048 				u32 *regs = ve->context.lrc_reg_state;
2049 				unsigned int n;
2050 
2051 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2052 
2053 				if (!intel_engine_has_relative_mmio(engine))
2054 					virtual_update_register_offsets(regs,
2055 									engine);
2056 
2057 				if (!list_empty(&ve->context.signals))
2058 					virtual_xfer_breadcrumbs(ve, rq);
2059 
2060 				/*
2061 				 * Move the bound engine to the top of the list
2062 				 * for future execution. We then kick this
2063 				 * tasklet first before checking others, so that
2064 				 * we preferentially reuse this set of bound
2065 				 * registers.
2066 				 */
2067 				for (n = 1; n < ve->num_siblings; n++) {
2068 					if (ve->siblings[n] == engine) {
2069 						swap(ve->siblings[n],
2070 						     ve->siblings[0]);
2071 						break;
2072 					}
2073 				}
2074 
2075 				GEM_BUG_ON(ve->siblings[0] != engine);
2076 			}
2077 
2078 			if (__i915_request_submit(rq)) {
2079 				submit = true;
2080 				last = rq;
2081 			}
2082 			i915_request_put(rq);
2083 
2084 			/*
2085 			 * Hmm, we have a bunch of virtual engine requests,
2086 			 * but the first one was already completed (thanks
2087 			 * preempt-to-busy!). Keep looking at the veng queue
2088 			 * until we have no more relevant requests (i.e.
2089 			 * the normal submit queue has higher priority).
2090 			 */
2091 			if (!submit) {
2092 				spin_unlock(&ve->base.active.lock);
2093 				rb = rb_first_cached(&execlists->virtual);
2094 				continue;
2095 			}
2096 		}
2097 
2098 		spin_unlock(&ve->base.active.lock);
2099 		break;
2100 	}
2101 
2102 	while ((rb = rb_first_cached(&execlists->queue))) {
2103 		struct i915_priolist *p = to_priolist(rb);
2104 		struct i915_request *rq, *rn;
2105 		int i;
2106 
2107 		priolist_for_each_request_consume(rq, rn, p, i) {
2108 			bool merge = true;
2109 
2110 			/*
2111 			 * Can we combine this request with the current port?
2112 			 * It has to be the same context/ringbuffer and not
2113 			 * have any exceptions (e.g. GVT saying never to
2114 			 * combine contexts).
2115 			 *
2116 			 * If we can combine the requests, we can execute both
2117 			 * by updating the RING_TAIL to point to the end of the
2118 			 * second request, and so we never need to tell the
2119 			 * hardware about the first.
2120 			 */
2121 			if (last && !can_merge_rq(last, rq)) {
2122 				/*
2123 				 * If we are on the second port and cannot
2124 				 * combine this request with the last, then we
2125 				 * are done.
2126 				 */
2127 				if (port == last_port)
2128 					goto done;
2129 
2130 				/*
2131 				 * We must not populate both ELSP[] with the
2132 				 * same LRCA, i.e. we must submit 2 different
2133 				 * contexts if we submit 2 ELSP.
2134 				 */
2135 				if (last->context == rq->context)
2136 					goto done;
2137 
2138 				if (i915_request_has_sentinel(last))
2139 					goto done;
2140 
2141 				/*
2142 				 * If GVT overrides us we only ever submit
2143 				 * port[0], leaving port[1] empty. Note that we
2144 				 * also have to be careful that we don't queue
2145 				 * the same context (even though a different
2146 				 * request) to the second port.
2147 				 */
2148 				if (ctx_single_port_submission(last->context) ||
2149 				    ctx_single_port_submission(rq->context))
2150 					goto done;
2151 
2152 				merge = false;
2153 			}
2154 
2155 			if (__i915_request_submit(rq)) {
2156 				if (!merge) {
2157 					*port = execlists_schedule_in(last, port - execlists->pending);
2158 					port++;
2159 					last = NULL;
2160 				}
2161 
2162 				GEM_BUG_ON(last &&
2163 					   !can_merge_ctx(last->context,
2164 							  rq->context));
2165 				GEM_BUG_ON(last &&
2166 					   i915_seqno_passed(last->fence.seqno,
2167 							     rq->fence.seqno));
2168 
2169 				submit = true;
2170 				last = rq;
2171 			}
2172 		}
2173 
2174 		rb_erase_cached(&p->node, &execlists->queue);
2175 		i915_priolist_free(p);
2176 	}
2177 
2178 done:
2179 	/*
2180 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2181 	 *
2182 	 * We choose the priority hint such that if we add a request of greater
2183 	 * priority than this, we kick the submission tasklet to decide on
2184 	 * the right order of submitting the requests to hardware. We must
2185 	 * also be prepared to reorder requests as they are in-flight on the
2186 	 * HW. We then derive the priority hint as the first "hole" in
2187 	 * the HW submission ports and if there are no available slots,
2188 	 * the priority of the lowest executing request, i.e. last.
2189 	 *
2190 	 * When we do receive a higher priority request ready to run from the
2191 	 * user, see queue_request(), the priority hint is bumped to that
2192 	 * request triggering preemption on the next dequeue (or subsequent
2193 	 * interrupt for secondary ports).
2194 	 */
2195 	execlists->queue_priority_hint = queue_prio(execlists);
2196 
2197 	if (submit) {
2198 		*port = execlists_schedule_in(last, port - execlists->pending);
2199 		execlists->switch_priority_hint =
2200 			switch_prio(engine, *execlists->pending);
2201 
2202 		/*
2203 		 * Skip if we ended up with exactly the same set of requests,
2204 		 * e.g. trying to timeslice a pair of ordered contexts
2205 		 */
2206 		if (!memcmp(active, execlists->pending,
2207 			    (port - execlists->pending + 1) * sizeof(*port))) {
2208 			do
2209 				execlists_schedule_out(fetch_and_zero(port));
2210 			while (port-- != execlists->pending);
2211 
2212 			goto skip_submit;
2213 		}
2214 		clear_ports(port + 1, last_port - port);
2215 
2216 		execlists_submit_ports(engine);
2217 		set_preempt_timeout(engine, *active);
2218 	} else {
2219 skip_submit:
2220 		ring_set_paused(engine, 0);
2221 	}
2222 }
2223 
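/*
 * After a reset, release every request still sitting in the ELSP ports
 * (both pending and inflight) and restore execlists->active to the
 * now-empty inflight array, completing the seqlock for execlists_active().
 */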
2224 static void
2225 cancel_port_requests(struct intel_engine_execlists * const execlists)
2226 {
2227 	struct i915_request * const *port;
2228 
2229 	for (port = execlists->pending; *port; port++)
2230 		execlists_schedule_out(*port);
2231 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2232 
2233 	/* Mark the end of active before we overwrite *active */
2234 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2235 		execlists_schedule_out(*port);
2236 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2237 
2238 	smp_wmb(); /* complete the seqlock for execlists_active() */
2239 	WRITE_ONCE(execlists->active, execlists->inflight);
2240 }
2241 
2242 static inline void
2243 invalidate_csb_entries(const u32 *first, const u32 *last)
2244 {
2245 	clflush((void *)first);
2246 	clflush((void *)last);
2247 }
2248 
2249 /*
2250  * Starting with Gen12, the status has a new format:
2251  *
2252  *     bit  0:     switched to new queue
2253  *     bit  1:     reserved
2254  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2255  *                 switch detail is set to "wait on semaphore"
2256  *     bits 3-5:   engine class
2257  *     bits 6-11:  engine instance
2258  *     bits 12-14: reserved
2259  *     bits 15-25: sw context id of the lrc the GT switched to
2260  *     bits 26-31: sw counter of the lrc the GT switched to
2261  *     bits 32-35: context switch detail
2262  *                  - 0: ctx complete
2263  *                  - 1: wait on sync flip
2264  *                  - 2: wait on vblank
2265  *                  - 3: wait on scanline
2266  *                  - 4: wait on semaphore
2267  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2268  *                       WAIT_FOR_EVENT)
2269  *     bit  36:    reserved
2270  *     bits 37-43: wait detail (for switch detail 1 to 4)
2271  *     bits 44-46: reserved
2272  *     bits 47-57: sw context id of the lrc the GT switched away from
2273  *     bits 58-63: sw counter of the lrc the GT switched away from
2274  */
2275 static inline bool
2276 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2277 {
2278 	u32 lower_dw = csb[0];
2279 	u32 upper_dw = csb[1];
2280 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2281 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2282 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2283 
2284 	/*
2285 	 * The context switch detail is not guaranteed to be 5 when a preemption
2286 	 * occurs, so we can't just check for that. The check below works for
2287 	 * all the cases we care about, including preemptions of WAIT
2288 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2289 	 * would require some extra handling, but we don't support that.
2290 	 */
2291 	if (!ctx_away_valid || new_queue) {
2292 		GEM_BUG_ON(!ctx_to_valid);
2293 		return true;
2294 	}
2295 
2296 	/*
2297 	 * switch detail = 5 is covered by the case above and we do not expect a
2298 	 * context switch on an unsuccessful wait instruction since we always
2299 	 * use polling mode.
2300 	 */
2301 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2302 	return false;
2303 }
2304 
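/*
 * Prior to Gen12 the CSB event is a simple status bitmask: promotion of
 * the pending ELSP occurs on either an idle->active transition or a
 * preemption event.
 */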
2305 static inline bool
2306 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2307 {
2308 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2309 }
2310 
2311 static void process_csb(struct intel_engine_cs *engine)
2312 {
2313 	struct intel_engine_execlists * const execlists = &engine->execlists;
2314 	const u32 * const buf = execlists->csb_status;
2315 	const u8 num_entries = execlists->csb_size;
2316 	u8 head, tail;
2317 
2318 	/*
2319 	 * As we modify our execlists state tracking we require exclusive
2320 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2321 	 * and we assume that is only inside the reset paths and so serialised.
2322 	 */
2323 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2324 		   !reset_in_progress(execlists));
2325 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2326 
2327 	/*
2328 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2329 	 * When reading from the csb_write mmio register, we have to be
2330 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2331 	 * the low 4 bits. As it happens we know the next 4 bits are always
2332 	 * zero and so we can simply mask off the low u8 of the register
2333 	 * and treat it identically to reading from the HWSP (without having
2334 	 * to use explicit shifting and masking, and probably bifurcating
2335 	 * the code to handle the legacy mmio read).
2336 	 */
2337 	head = execlists->csb_head;
2338 	tail = READ_ONCE(*execlists->csb_write);
2339 	if (unlikely(head == tail))
2340 		return;
2341 
2342 	/*
2343 	 * Hopefully paired with a wmb() in HW!
2344 	 *
2345 	 * We must complete the read of the write pointer before any reads
2346 	 * from the CSB, so that we do not see stale values. Without an rmb
2347 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2348 	 * we perform the READ_ONCE(*csb_write).
2349 	 */
2350 	rmb();
2351 
2352 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2353 	do {
2354 		bool promote;
2355 
2356 		if (++head == num_entries)
2357 			head = 0;
2358 
2359 		/*
2360 		 * We are flying near dragons again.
2361 		 *
2362 		 * We hold a reference to the request in execlist_port[]
2363 		 * but no more than that. We are operating in softirq
2364 		 * context and so cannot hold any mutex or sleep. That
2365 		 * prevents us from stopping the requests we are processing
2366 		 * in port[] from being retired simultaneously (the
2367 		 * breadcrumb will be complete before we see the
2368 		 * context-switch). As we only hold the reference to the
2369 		 * request, any pointer chasing underneath the request
2370 		 * is subject to a potential use-after-free. Thus we
2371 		 * store all of the bookkeeping within port[] as
2372 		 * required, and avoid using unguarded pointers beneath
2373 		 * request itself. The same applies to the atomic
2374 		 * status notifier.
2375 		 */
2376 
2377 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2378 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2379 
2380 		if (INTEL_GEN(engine->i915) >= 12)
2381 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2382 		else
2383 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2384 		if (promote) {
2385 			struct i915_request * const *old = execlists->active;
2386 
2387 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2388 
2389 			ring_set_paused(engine, 0);
2390 
2391 			/* Point active to the new ELSP; prevent overwriting */
2392 			WRITE_ONCE(execlists->active, execlists->pending);
2393 			smp_wmb(); /* notify execlists_active() */
2394 
2395 			/* cancel old inflight, prepare for switch */
2396 			trace_ports(execlists, "preempted", old);
2397 			while (*old)
2398 				execlists_schedule_out(*old++);
2399 
2400 			/* switch pending to inflight */
2401 			memcpy(execlists->inflight,
2402 			       execlists->pending,
2403 			       execlists_num_ports(execlists) *
2404 			       sizeof(*execlists->pending));
2405 			smp_wmb(); /* complete the seqlock */
2406 			WRITE_ONCE(execlists->active, execlists->inflight);
2407 
2408 			WRITE_ONCE(execlists->pending[0], NULL);
2409 		} else {
2410 			GEM_BUG_ON(!*execlists->active);
2411 
2412 			/* port0 completed, advanced to port1 */
2413 			trace_ports(execlists, "completed", execlists->active);
2414 
2415 			/*
2416 			 * We rely on the hardware being strongly
2417 			 * ordered, that the breadcrumb write is
2418 			 * coherent (visible from the CPU) before the
2419 			 * user interrupt and CSB is processed.
2420 			 */
2421 			if (GEM_SHOW_DEBUG() &&
2422 			    !i915_request_completed(*execlists->active) &&
2423 			    !reset_in_progress(execlists)) {
2424 				struct i915_request *rq __maybe_unused =
2425 					*execlists->active;
2426 				const u32 *regs __maybe_unused =
2427 					rq->context->lrc_reg_state;
2428 
2429 				ENGINE_TRACE(engine,
2430 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2431 					     ENGINE_READ(engine, RING_START),
2432 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2433 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2434 					     ENGINE_READ(engine, RING_CTL),
2435 					     ENGINE_READ(engine, RING_MI_MODE));
2436 				ENGINE_TRACE(engine,
2437 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2438 					     i915_ggtt_offset(rq->ring->vma),
2439 					     rq->head, rq->tail,
2440 					     rq->fence.context,
2441 					     lower_32_bits(rq->fence.seqno),
2442 					     hwsp_seqno(rq));
2443 				ENGINE_TRACE(engine,
2444 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2445 					     regs[CTX_RING_START],
2446 					     regs[CTX_RING_HEAD],
2447 					     regs[CTX_RING_TAIL]);
2448 
2449 				GEM_BUG_ON("context completed before request");
2450 			}
2451 
2452 			execlists_schedule_out(*execlists->active++);
2453 
2454 			GEM_BUG_ON(execlists->active - execlists->inflight >
2455 				   execlists_num_ports(execlists));
2456 		}
2457 	} while (head != tail);
2458 
2459 	execlists->csb_head = head;
2460 	set_timeslice(engine);
2461 
2462 	/*
2463 	 * Gen11 has proven to fail wrt global observation point between
2464 	 * entry and tail update, failing on the ordering and thus
2465 	 * we see an old entry in the context status buffer.
2466 	 *
2467 	 * Forcibly evict the entries for the next GPU CSB update,
2468 	 * to increase the odds that we get fresh entries even with
2469 	 * non-working hardware. The cost of doing so comes out mostly in
2470 	 * the wash, as the hardware, working or not, will need to do the
2471 	 * invalidation beforehand.
2472 	 */
2473 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2474 }
2475 
2476 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2477 {
2478 	lockdep_assert_held(&engine->active.lock);
2479 	if (!READ_ONCE(engine->execlists.pending[0])) {
2480 		rcu_read_lock(); /* protect peeking at execlists->active */
2481 		execlists_dequeue(engine);
2482 		rcu_read_unlock();
2483 	}
2484 }
2485 
2486 static void __execlists_hold(struct i915_request *rq)
2487 {
2488 	LIST_HEAD(list);
2489 
2490 	do {
2491 		struct i915_dependency *p;
2492 
2493 		if (i915_request_is_active(rq))
2494 			__i915_request_unsubmit(rq);
2495 
2496 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2497 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2498 		i915_request_set_hold(rq);
2499 		RQ_TRACE(rq, "on hold\n");
2500 
2501 		for_each_waiter(p, rq) {
2502 			struct i915_request *w =
2503 				container_of(p->waiter, typeof(*w), sched);
2504 
2505 			/* Leave semaphores spinning on the other engines */
2506 			if (w->engine != rq->engine)
2507 				continue;
2508 
2509 			if (!i915_request_is_ready(w))
2510 				continue;
2511 
2512 			if (i915_request_completed(w))
2513 				continue;
2514 
2515 			if (i915_request_on_hold(w))
2516 				continue;
2517 
2518 			list_move_tail(&w->sched.link, &list);
2519 		}
2520 
2521 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2522 	} while (rq);
2523 }
2524 
2525 static bool execlists_hold(struct intel_engine_cs *engine,
2526 			   struct i915_request *rq)
2527 {
2528 	spin_lock_irq(&engine->active.lock);
2529 
2530 	if (i915_request_completed(rq)) { /* too late! */
2531 		rq = NULL;
2532 		goto unlock;
2533 	}
2534 
2535 	if (rq->engine != engine) { /* preempted virtual engine */
2536 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2537 
2538 		/*
2539 		 * intel_context_inflight() is only protected by virtue
2540 		 * of process_csb() being called only by the tasklet (or
2541 		 * directly from inside reset while the tasklet is suspended).
2542 		 * Assert that neither of those are allowed to run while we
2543 		 * poke at the request queues.
2544 		 */
2545 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2546 
2547 		/*
2548 		 * An unsubmitted request along a virtual engine will
2549 		 * remain on the active (this) engine until we are able
2550 		 * to process the context switch away (and so mark the
2551 		 * context as no longer in flight). That cannot have happened
2552 		 * yet, otherwise we would not be hanging!
2553 		 */
2554 		spin_lock(&ve->base.active.lock);
2555 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2556 		GEM_BUG_ON(ve->request != rq);
2557 		ve->request = NULL;
2558 		spin_unlock(&ve->base.active.lock);
2559 		i915_request_put(rq);
2560 
2561 		rq->engine = engine;
2562 	}
2563 
2564 	/*
2565 	 * Transfer this request onto the hold queue to prevent it
2566 	 * being resubmitted to HW (and potentially completed) before we have
2567 	 * released it. Since we may have already submitted following
2568 	 * requests, we need to remove those as well.
2569 	 */
2570 	GEM_BUG_ON(i915_request_on_hold(rq));
2571 	GEM_BUG_ON(rq->engine != engine);
2572 	__execlists_hold(rq);
2573 	GEM_BUG_ON(list_empty(&engine->active.hold));
2574 
2575 unlock:
2576 	spin_unlock_irq(&engine->active.lock);
2577 	return rq;
2578 }
2579 
2580 static bool hold_request(const struct i915_request *rq)
2581 {
2582 	struct i915_dependency *p;
2583 	bool result = false;
2584 
2585 	/*
2586 	 * If one of our ancestors is on hold, we must also be on hold,
2587 	 * otherwise we will bypass it and execute before it.
2588 	 */
2589 	rcu_read_lock();
2590 	for_each_signaler(p, rq) {
2591 		const struct i915_request *s =
2592 			container_of(p->signaler, typeof(*s), sched);
2593 
2594 		if (s->engine != rq->engine)
2595 			continue;
2596 
2597 		result = i915_request_on_hold(s);
2598 		if (result)
2599 			break;
2600 	}
2601 	rcu_read_unlock();
2602 
2603 	return result;
2604 }
2605 
2606 static void __execlists_unhold(struct i915_request *rq)
2607 {
2608 	LIST_HEAD(list);
2609 
2610 	do {
2611 		struct i915_dependency *p;
2612 
2613 		RQ_TRACE(rq, "hold release\n");
2614 
2615 		GEM_BUG_ON(!i915_request_on_hold(rq));
2616 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2617 
2618 		i915_request_clear_hold(rq);
2619 		list_move_tail(&rq->sched.link,
2620 			       i915_sched_lookup_priolist(rq->engine,
2621 							  rq_prio(rq)));
2622 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2623 
2624 		/* Also release any children on this engine that are ready */
2625 		for_each_waiter(p, rq) {
2626 			struct i915_request *w =
2627 				container_of(p->waiter, typeof(*w), sched);
2628 
2629 			/* Propagate any change in error status */
2630 			if (rq->fence.error)
2631 				i915_request_set_error_once(w, rq->fence.error);
2632 
2633 			if (w->engine != rq->engine)
2634 				continue;
2635 
2636 			if (!i915_request_on_hold(w))
2637 				continue;
2638 
2639 			/* Check that no other parents are also on hold */
2640 			if (hold_request(w))
2641 				continue;
2642 
2643 			list_move_tail(&w->sched.link, &list);
2644 		}
2645 
2646 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2647 	} while (rq);
2648 }
2649 
2650 static void execlists_unhold(struct intel_engine_cs *engine,
2651 			     struct i915_request *rq)
2652 {
2653 	spin_lock_irq(&engine->active.lock);
2654 
2655 	/*
2656 	 * Move this request back to the priority queue, and all of its
2657 	 * children and grandchildren that were suspended along with it.
2658 	 */
2659 	__execlists_unhold(rq);
2660 
2661 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2662 		engine->execlists.queue_priority_hint = rq_prio(rq);
2663 		tasklet_hi_schedule(&engine->execlists.tasklet);
2664 	}
2665 
2666 	spin_unlock_irq(&engine->active.lock);
2667 }
2668 
2669 struct execlists_capture {
2670 	struct work_struct work;
2671 	struct i915_request *rq;
2672 	struct i915_gpu_coredump *error;
2673 };
2674 
2675 static void execlists_capture_work(struct work_struct *work)
2676 {
2677 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2678 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2679 	struct intel_engine_cs *engine = cap->rq->engine;
2680 	struct intel_gt_coredump *gt = cap->error->gt;
2681 	struct intel_engine_capture_vma *vma;
2682 
2683 	/* Compress all the objects attached to the request, slow! */
2684 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2685 	if (vma) {
2686 		struct i915_vma_compress *compress =
2687 			i915_vma_capture_prepare(gt);
2688 
2689 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2690 		i915_vma_capture_finish(gt, compress);
2691 	}
2692 
2693 	gt->simulated = gt->engine->simulated;
2694 	cap->error->simulated = gt->simulated;
2695 
2696 	/* Publish the error state, and announce it to the world */
2697 	i915_error_state_store(cap->error);
2698 	i915_gpu_coredump_put(cap->error);
2699 
2700 	/* Return this request and all that depend upon it for signaling */
2701 	execlists_unhold(engine, cap->rq);
2702 	i915_request_put(cap->rq);
2703 
2704 	kfree(cap);
2705 }
2706 
2707 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2708 {
2709 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2710 	struct execlists_capture *cap;
2711 
2712 	cap = kmalloc(sizeof(*cap), gfp);
2713 	if (!cap)
2714 		return NULL;
2715 
2716 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2717 	if (!cap->error)
2718 		goto err_cap;
2719 
2720 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2721 	if (!cap->error->gt)
2722 		goto err_gpu;
2723 
2724 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2725 	if (!cap->error->gt->engine)
2726 		goto err_gt;
2727 
2728 	return cap;
2729 
2730 err_gt:
2731 	kfree(cap->error->gt);
2732 err_gpu:
2733 	kfree(cap->error);
2734 err_cap:
2735 	kfree(cap);
2736 	return NULL;
2737 }
2738 
2739 static bool execlists_capture(struct intel_engine_cs *engine)
2740 {
2741 	struct execlists_capture *cap;
2742 
2743 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2744 		return true;
2745 
2746 	/*
2747 	 * We need to _quickly_ capture the engine state before we reset.
2748 	 * We are inside an atomic section (softirq) here and we are delaying
2749 	 * the forced preemption event.
2750 	 */
2751 	cap = capture_regs(engine);
2752 	if (!cap)
2753 		return true;
2754 
2755 	spin_lock_irq(&engine->active.lock);
2756 	cap->rq = execlists_active(&engine->execlists);
2757 	if (cap->rq) {
2758 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2759 		cap->rq = i915_request_get_rcu(cap->rq);
2760 	}
2761 	spin_unlock_irq(&engine->active.lock);
2762 	if (!cap->rq)
2763 		goto err_free;
2764 
2765 	/*
2766 	 * Remove the request from the execlists queue, and take ownership
2767 	 * of the request. We pass it to our worker who will _slowly_ compress
2768 	 * all the pages the _user_ requested for debugging their batch, after
2769 	 * which we return it to the queue for signaling.
2770 	 *
2771 	 * By removing them from the execlists queue, we also remove the
2772 	 * requests from being processed by __unwind_incomplete_requests()
2773 	 * during the intel_engine_reset(), and so they will *not* be replayed
2774 	 * afterwards.
2775 	 *
2776 	 * Note that because we have not yet reset the engine at this point,
2777 	 * it is possible that the request we have identified as being
2778 	 * guilty did in fact complete, and we will then hit an arbitration
2779 	 * point allowing the outstanding preemption to succeed. The likelihood
2780 	 * of that is very low (as capturing of the engine registers should be
2781 	 * fast enough to run inside an irq-off atomic section!), so we will
2782 	 * simply hold that request accountable for being non-preemptible
2783 	 * long enough to force the reset.
2784 	 */
2785 	if (!execlists_hold(engine, cap->rq))
2786 		goto err_rq;
2787 
2788 	INIT_WORK(&cap->work, execlists_capture_work);
2789 	schedule_work(&cap->work);
2790 	return true;
2791 
2792 err_rq:
2793 	i915_request_put(cap->rq);
2794 err_free:
2795 	i915_gpu_coredump_put(cap->error);
2796 	kfree(cap);
2797 	return false;
2798 }
2799 
2800 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2801 {
2802 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2803 	unsigned long *lock = &engine->gt->reset.flags;
2804 
2805 	if (!intel_has_reset_engine(engine->gt))
2806 		return;
2807 
2808 	if (test_and_set_bit(bit, lock))
2809 		return;
2810 
2811 	ENGINE_TRACE(engine, "reset for %s\n", msg);
2812 
2813 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2814 	tasklet_disable_nosync(&engine->execlists.tasklet);
2815 
2816 	ring_set_paused(engine, 1); /* Freeze the current request in place */
2817 	if (execlists_capture(engine))
2818 		intel_engine_reset(engine, msg);
2819 	else
2820 		ring_set_paused(engine, 0);
2821 
2822 	tasklet_enable(&engine->execlists.tasklet);
2823 	clear_and_wake_up_bit(bit, lock);
2824 }
2825 
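/*
 * A forced preemption has timed out if the preempt timer expired while we
 * still have a pending ELSP submission awaiting the HW acknowledgement.
 */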
2826 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2827 {
2828 	const struct timer_list *t = &engine->execlists.preempt;
2829 
2830 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2831 		return false;
2832 
2833 	if (!timer_expired(t))
2834 		return false;
2835 
2836 	return READ_ONCE(engine->execlists.pending[0]);
2837 }
2838 
2839 /*
2840  * Check the unread Context Status Buffers and manage the submission of new
2841  * contexts to the ELSP accordingly.
2842  */
2843 static void execlists_submission_tasklet(unsigned long data)
2844 {
2845 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2846 	bool timeout = preempt_timeout(engine);
2847 
2848 	process_csb(engine);
2849 
2850 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2851 		engine->execlists.error_interrupt = 0;
2852 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2853 			execlists_reset(engine, "CS error");
2854 	}
2855 
2856 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2857 		unsigned long flags;
2858 
2859 		spin_lock_irqsave(&engine->active.lock, flags);
2860 		__execlists_submission_tasklet(engine);
2861 		spin_unlock_irqrestore(&engine->active.lock, flags);
2862 
2863 		/* Recheck after serialising with direct-submission */
2864 		if (unlikely(timeout && preempt_timeout(engine)))
2865 			execlists_reset(engine, "preemption time out");
2866 	}
2867 }
2868 
2869 static void __execlists_kick(struct intel_engine_execlists *execlists)
2870 {
2871 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2872 	tasklet_hi_schedule(&execlists->tasklet);
2873 }
2874 
2875 #define execlists_kick(t, member) \
2876 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2877 
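/*
 * The timeslice and preempt-timeout timers perform no work themselves;
 * they merely kick the tasklet, which then inspects timer_expired() and
 * preempt_timeout() to decide how to react.
 */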
2878 static void execlists_timeslice(struct timer_list *timer)
2879 {
2880 	execlists_kick(timer, timer);
2881 }
2882 
2883 static void execlists_preempt(struct timer_list *timer)
2884 {
2885 	execlists_kick(timer, preempt);
2886 }
2887 
2888 static void queue_request(struct intel_engine_cs *engine,
2889 			  struct i915_request *rq)
2890 {
2891 	GEM_BUG_ON(!list_empty(&rq->sched.link));
2892 	list_add_tail(&rq->sched.link,
2893 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2894 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2895 }
2896 
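/*
 * Kick the submission as directly as possible: run the tasklet body inline
 * when it is safe to do so, defer entirely while a reset is in progress,
 * or fall back to scheduling the tasklet if its function has been replaced.
 */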
2897 static void __submit_queue_imm(struct intel_engine_cs *engine)
2898 {
2899 	struct intel_engine_execlists * const execlists = &engine->execlists;
2900 
2901 	if (reset_in_progress(execlists))
2902 		return; /* defer until we restart the engine following reset */
2903 
2904 	if (execlists->tasklet.func == execlists_submission_tasklet)
2905 		__execlists_submission_tasklet(engine);
2906 	else
2907 		tasklet_hi_schedule(&execlists->tasklet);
2908 }
2909 
2910 static void submit_queue(struct intel_engine_cs *engine,
2911 			 const struct i915_request *rq)
2912 {
2913 	struct intel_engine_execlists *execlists = &engine->execlists;
2914 
2915 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2916 		return;
2917 
2918 	execlists->queue_priority_hint = rq_prio(rq);
2919 	__submit_queue_imm(engine);
2920 }
2921 
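/*
 * Cheap precheck before scanning the signalers: only if something already
 * sits on the engine's hold list can an ancestor of rq be on hold.
 */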
2922 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2923 			     const struct i915_request *rq)
2924 {
2925 	GEM_BUG_ON(i915_request_on_hold(rq));
2926 	return !list_empty(&engine->active.hold) && hold_request(rq);
2927 }
2928 
2929 static void execlists_submit_request(struct i915_request *request)
2930 {
2931 	struct intel_engine_cs *engine = request->engine;
2932 	unsigned long flags;
2933 
2934 	/* Will be called from irq-context when using foreign fences. */
2935 	spin_lock_irqsave(&engine->active.lock, flags);
2936 
2937 	if (unlikely(ancestor_on_hold(engine, request))) {
2938 		RQ_TRACE(request, "ancestor on hold\n");
2939 		list_add_tail(&request->sched.link, &engine->active.hold);
2940 		i915_request_set_hold(request);
2941 	} else {
2942 		queue_request(engine, request);
2943 
2944 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2945 		GEM_BUG_ON(list_empty(&request->sched.link));
2946 
2947 		submit_queue(engine, request);
2948 	}
2949 
2950 	spin_unlock_irqrestore(&engine->active.lock, flags);
2951 }
2952 
2953 static void __execlists_context_fini(struct intel_context *ce)
2954 {
2955 	intel_ring_put(ce->ring);
2956 	i915_vma_put(ce->state);
2957 }
2958 
2959 static void execlists_context_destroy(struct kref *kref)
2960 {
2961 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2962 
2963 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2964 	GEM_BUG_ON(intel_context_is_pinned(ce));
2965 
2966 	if (ce->state)
2967 		__execlists_context_fini(ce);
2968 
2969 	intel_context_fini(ce);
2970 	intel_context_free(ce);
2971 }
2972 
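/*
 * With CONFIG_DRM_I915_DEBUG_GEM enabled, poison the page following the
 * context image so that check_redzone() can detect writes beyond the
 * declared context size.
 */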
2973 static void
2974 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2975 {
2976 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2977 		return;
2978 
2979 	vaddr += engine->context_size;
2980 
2981 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2982 }
2983 
2984 static void
2985 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2986 {
2987 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2988 		return;
2989 
2990 	vaddr += engine->context_size;
2991 
2992 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2993 		dev_err_once(engine->i915->drm.dev,
2994 			     "%s context redzone overwritten!\n",
2995 			     engine->name);
2996 }
2997 
2998 static void execlists_context_unpin(struct intel_context *ce)
2999 {
3000 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3001 		      ce->engine);
3002 
3003 	i915_gem_object_unpin_map(ce->state->obj);
3004 }
3005 
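/*
 * Refresh the ring registers (RING_START/HEAD/TAIL/CTL) stored in the
 * context image and, for the render class, also the RPCS power clock
 * state and OA configuration.
 */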
3006 static void
3007 __execlists_update_reg_state(const struct intel_context *ce,
3008 			     const struct intel_engine_cs *engine,
3009 			     u32 head)
3010 {
3011 	struct intel_ring *ring = ce->ring;
3012 	u32 *regs = ce->lrc_reg_state;
3013 
3014 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3015 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3016 
3017 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3018 	regs[CTX_RING_HEAD] = head;
3019 	regs[CTX_RING_TAIL] = ring->tail;
3020 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3021 
3022 	/* RPCS */
3023 	if (engine->class == RENDER_CLASS) {
3024 		regs[CTX_R_PWR_CLK_STATE] =
3025 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3026 
3027 		i915_oa_init_reg_state(ce, engine);
3028 	}
3029 }
3030 
3031 static int
3032 __execlists_context_pin(struct intel_context *ce,
3033 			struct intel_engine_cs *engine)
3034 {
3035 	void *vaddr;
3036 
3037 	GEM_BUG_ON(!ce->state);
3038 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3039 
3040 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3041 					i915_coherent_map_type(engine->i915) |
3042 					I915_MAP_OVERRIDE);
3043 	if (IS_ERR(vaddr))
3044 		return PTR_ERR(vaddr);
3045 
3046 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3047 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3048 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3049 
3050 	return 0;
3051 }
3052 
3053 static int execlists_context_pin(struct intel_context *ce)
3054 {
3055 	return __execlists_context_pin(ce, ce->engine);
3056 }
3057 
3058 static int execlists_context_alloc(struct intel_context *ce)
3059 {
3060 	return __execlists_context_alloc(ce, ce->engine);
3061 }
3062 
3063 static void execlists_context_reset(struct intel_context *ce)
3064 {
3065 	CE_TRACE(ce, "reset\n");
3066 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3067 
3068 	intel_ring_reset(ce->ring, ce->ring->emit);
3069 
3070 	/* Scrub away the garbage */
3071 	execlists_init_reg_state(ce->lrc_reg_state,
3072 				 ce, ce->engine, ce->ring, true);
3073 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3074 
3075 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3076 }
3077 
3078 static const struct intel_context_ops execlists_context_ops = {
3079 	.alloc = execlists_context_alloc,
3080 
3081 	.pin = execlists_context_pin,
3082 	.unpin = execlists_context_unpin,
3083 
3084 	.enter = intel_context_enter_engine,
3085 	.exit = intel_context_exit_engine,
3086 
3087 	.reset = execlists_context_reset,
3088 	.destroy = execlists_context_destroy,
3089 };
3090 
3091 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3092 {
3093 	u32 *cs;
3094 
3095 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3096 		return 0;
3097 
3098 	cs = intel_ring_begin(rq, 6);
3099 	if (IS_ERR(cs))
3100 		return PTR_ERR(cs);
3101 
3102 	/*
3103 	 * Check if we have been preempted before we even get started.
3104 	 *
3105 	 * After this point i915_request_started() reports true, even if
3106 	 * we get preempted and so are no longer running.
3107 	 */
3108 	*cs++ = MI_ARB_CHECK;
3109 	*cs++ = MI_NOOP;
3110 
3111 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3112 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3113 	*cs++ = 0;
3114 	*cs++ = rq->fence.seqno - 1;
3115 
3116 	intel_ring_advance(rq, cs);
3117 
3118 	/* Record the updated position of the request's payload */
3119 	rq->infix = intel_ring_offset(rq, cs);
3120 
3121 	return 0;
3122 }
3123 
3124 static int execlists_request_alloc(struct i915_request *request)
3125 {
3126 	int ret;
3127 
3128 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3129 
3130 	/*
3131 	 * Flush enough space to reduce the likelihood of waiting after
3132 	 * we start building the request - in which case we will just
3133 	 * have to repeat work.
3134 	 */
3135 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3136 
3137 	/*
3138 	 * Note that after this point, we have committed to using
3139 	 * this request as it is being used to both track the
3140 	 * state of engine initialisation and liveness of the
3141 	 * golden renderstate above. Think twice before you try
3142 	 * to cancel/unwind this request now.
3143 	 */
3144 
3145 	/* Unconditionally invalidate GPU caches and TLBs. */
3146 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3147 	if (ret)
3148 		return ret;
3149 
3150 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3151 	return 0;
3152 }
3153 
3154 /*
3155  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3156  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3157  * but there is a slight complication as this is applied in WA batch where the
3158  * values are only initialized once so we cannot take register value at the
3159  * beginning and reuse it further; hence we save its value to memory, upload a
3160  * constant value with bit21 set and then we restore it back with the saved value.
3161  * To simplify the WA, a constant value is formed by using the default value
3162  * of this register. This shouldn't be a problem because we are only modifying
3163  * it for a short period and this batch is non-preemptible. We can of course
3164  * use additional instructions that read the actual value of the register
3165  * at that time and set our bit of interest but it makes the WA complicated.
3166  *
3167  * This WA is also required for Gen9 so extracting as a function avoids
3168  * code duplication.
3169  */
3170 static u32 *
3171 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3172 {
3173 	/* NB no one else is allowed to scribble over scratch + 256! */
3174 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3175 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3176 	*batch++ = intel_gt_scratch_offset(engine->gt,
3177 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3178 	*batch++ = 0;
3179 
3180 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3181 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3182 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3183 
3184 	batch = gen8_emit_pipe_control(batch,
3185 				       PIPE_CONTROL_CS_STALL |
3186 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3187 				       0);
3188 
3189 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3190 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3191 	*batch++ = intel_gt_scratch_offset(engine->gt,
3192 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3193 	*batch++ = 0;
3194 
3195 	return batch;
3196 }
3197 
3198 /*
3199  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3200  * initialized at the beginning and shared across all contexts but this field
3201  * helps us to have multiple batches at different offsets and select them based
3202  * on a criteria. At the moment this batch always start at the beginning of the page
3203  * on a criterion. At the moment this batch always starts at the beginning of the page
3204  *
3205  * The number of WAs applied is not known at the beginning; we use this field
3206  * to return the number of DWORDs written.
3207  *
3208  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3209  * so it adds NOOPs as padding to make it cacheline aligned.
3210  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
3211  * make a complete batch buffer.
3212  */
3213 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3214 {
3215 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3216 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3217 
3218 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3219 	if (IS_BROADWELL(engine->i915))
3220 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3221 
3222 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3223 	/* Actual scratch location is at 128 bytes offset */
3224 	batch = gen8_emit_pipe_control(batch,
3225 				       PIPE_CONTROL_FLUSH_L3 |
3226 				       PIPE_CONTROL_STORE_DATA_INDEX |
3227 				       PIPE_CONTROL_CS_STALL |
3228 				       PIPE_CONTROL_QW_WRITE,
3229 				       LRC_PPHWSP_SCRATCH_ADDR);
3230 
3231 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3232 
3233 	/* Pad to end of cacheline */
3234 	while ((unsigned long)batch % CACHELINE_BYTES)
3235 		*batch++ = MI_NOOP;
3236 
3237 	/*
3238 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3239 	 * execution depends on the length specified in terms of cache lines
3240 	 * in the register CTX_RCS_INDIRECT_CTX
3241 	 */
3242 
3243 	return batch;
3244 }
3245 
3246 struct lri {
3247 	i915_reg_t reg;
3248 	u32 value;
3249 };
3250 
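/*
 * Emit a single MI_LOAD_REGISTER_IMM covering up to 63 register/value
 * pairs, terminated with an MI_NOOP.
 */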
3251 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3252 {
3253 	GEM_BUG_ON(!count || count > 63);
3254 
3255 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3256 	do {
3257 		*batch++ = i915_mmio_reg_offset(lri->reg);
3258 		*batch++ = lri->value;
3259 	} while (lri++, --count);
3260 	*batch++ = MI_NOOP;
3261 
3262 	return batch;
3263 }
3264 
3265 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3266 {
3267 	static const struct lri lri[] = {
3268 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3269 		{
3270 			COMMON_SLICE_CHICKEN2,
3271 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3272 				       0),
3273 		},
3274 
3275 		/* BSpec: 11391 */
3276 		{
3277 			FF_SLICE_CHICKEN,
3278 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3279 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3280 		},
3281 
3282 		/* BSpec: 11299 */
3283 		{
3284 			_3D_CHICKEN3,
3285 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3286 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3287 		}
3288 	};
3289 
3290 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3291 
3292 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3293 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3294 
3295 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3296 	batch = gen8_emit_pipe_control(batch,
3297 				       PIPE_CONTROL_FLUSH_L3 |
3298 				       PIPE_CONTROL_STORE_DATA_INDEX |
3299 				       PIPE_CONTROL_CS_STALL |
3300 				       PIPE_CONTROL_QW_WRITE,
3301 				       LRC_PPHWSP_SCRATCH_ADDR);
3302 
3303 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3304 
3305 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3306 	if (HAS_POOLED_EU(engine->i915)) {
3307 		/*
3308 		 * EU pool configuration is setup along with golden context
3309 		 * EU pool configuration is set up along with the golden context
3310 		 * device type (2x6 or 3x6) and needs to be updated based
3311 		 * on which subslice is disabled especially for 2x6
3312 		 * devices; however, it is safe to load the default
3313 		 * configuration of a 3x6 device instead of masking off
3314 		 * the corresponding bits because the HW ignores bits of a
3315 		 * disabled subslice and drops down to the appropriate config. Please
3316 		 * see render_state_setup() in i915_gem_render_state.c for
3317 		 * possible configurations, to avoid duplication they are
3318 		 * not shown here again.
3319 		 */
3320 		*batch++ = GEN9_MEDIA_POOL_STATE;
3321 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3322 		*batch++ = 0x00777000;
3323 		*batch++ = 0;
3324 		*batch++ = 0;
3325 		*batch++ = 0;
3326 	}
3327 
3328 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3329 
3330 	/* Pad to end of cacheline */
3331 	while ((unsigned long)batch % CACHELINE_BYTES)
3332 		*batch++ = MI_NOOP;
3333 
3334 	return batch;
3335 }
3336 
3337 static u32 *
3338 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3339 {
3340 	int i;
3341 
3342 	/*
3343 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3344 	 *
3345 	 * Ensure the engine is idle prior to programming a
3346 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3347 	 */
3348 	batch = gen8_emit_pipe_control(batch,
3349 				       PIPE_CONTROL_CS_STALL,
3350 				       0);
3351 	/*
3352 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3353 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3354 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3355 	 * confusing. Since gen8_emit_pipe_control() already advances the
3356 	 * batch by 6 dwords, we advance the other 10 here, completing a
3357 	 * cacheline. It's not clear if the workaround requires this padding
3358 	 * before other commands, or if it's just the regular padding we would
3359 	 * already have for the workaround bb, so leave it here for now.
3360 	 */
3361 	for (i = 0; i < 10; i++)
3362 		*batch++ = MI_NOOP;
3363 
3364 	/* Pad to end of cacheline */
3365 	while ((unsigned long)batch % CACHELINE_BYTES)
3366 		*batch++ = MI_NOOP;
3367 
3368 	return batch;
3369 }
3370 
3371 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3372 
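/*
 * Allocate a single page to hold the per-engine workaround batch buffers
 * and pin it high in the global GTT.
 */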
3373 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3374 {
3375 	struct drm_i915_gem_object *obj;
3376 	struct i915_vma *vma;
3377 	int err;
3378 
3379 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3380 	if (IS_ERR(obj))
3381 		return PTR_ERR(obj);
3382 
3383 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3384 	if (IS_ERR(vma)) {
3385 		err = PTR_ERR(vma);
3386 		goto err;
3387 	}
3388 
3389 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3390 	if (err)
3391 		goto err;
3392 
3393 	engine->wa_ctx.vma = vma;
3394 	return 0;
3395 
3396 err:
3397 	i915_gem_object_put(obj);
3398 	return err;
3399 }
3400 
3401 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3402 {
3403 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3404 }
3405 
3406 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3407 
3408 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3409 {
3410 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3411 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3412 					    &wa_ctx->per_ctx };
3413 	wa_bb_func_t wa_bb_fn[2];
3414 	struct page *page;
3415 	void *batch, *batch_ptr;
3416 	unsigned int i;
3417 	int ret;
3418 
3419 	if (engine->class != RENDER_CLASS)
3420 		return 0;
3421 
3422 	switch (INTEL_GEN(engine->i915)) {
3423 	case 12:
3424 	case 11:
3425 		return 0;
3426 	case 10:
3427 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3428 		wa_bb_fn[1] = NULL;
3429 		break;
3430 	case 9:
3431 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3432 		wa_bb_fn[1] = NULL;
3433 		break;
3434 	case 8:
3435 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3436 		wa_bb_fn[1] = NULL;
3437 		break;
3438 	default:
3439 		MISSING_CASE(INTEL_GEN(engine->i915));
3440 		return 0;
3441 	}
3442 
3443 	ret = lrc_setup_wa_ctx(engine);
3444 	if (ret) {
3445 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3446 		return ret;
3447 	}
3448 
3449 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3450 	batch = batch_ptr = kmap_atomic(page);
3451 
3452 	/*
3453 	 * Emit the two workaround batch buffers, recording the offset from the
3454 	 * start of the workaround batch buffer object for each and their
3455 	 * respective sizes.
3456 	 */
3457 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3458 		wa_bb[i]->offset = batch_ptr - batch;
3459 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3460 						  CACHELINE_BYTES))) {
3461 			ret = -EINVAL;
3462 			break;
3463 		}
3464 		if (wa_bb_fn[i])
3465 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3466 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3467 	}
3468 
3469 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3470 
3471 	kunmap_atomic(batch);
3472 	if (ret)
3473 		lrc_destroy_wa_ctx(engine);
3474 
3475 	return ret;
3476 }
3477 
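/*
 * Reset the engine's error state (EIR/ESR) on resume and unmask only the
 * errors we treat as fatal; the choice of mask is explained in the comment
 * at the bottom of this function.
 */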
3478 static void enable_error_interrupt(struct intel_engine_cs *engine)
3479 {
3480 	u32 status;
3481 
3482 	engine->execlists.error_interrupt = 0;
3483 	ENGINE_WRITE(engine, RING_EMR, ~0u);
3484 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3485 
3486 	status = ENGINE_READ(engine, RING_ESR);
3487 	if (unlikely(status)) {
3488 		dev_err(engine->i915->drm.dev,
3489 			"engine '%s' resumed still in error: %08x\n",
3490 			engine->name, status);
3491 		__intel_gt_reset(engine->gt, engine->mask);
3492 	}
3493 
3494 	/*
3495 	 * On current gen8+, we have 2 signals to play with
3496 	 *
3497 	 * - I915_ERROR_INSTRUCTION (bit 0)
3498 	 *
3499 	 *    Generate an error if the command parser encounters an invalid
3500 	 *    instruction
3501 	 *
3502 	 *    This is a fatal error.
3503 	 *
3504 	 * - CP_PRIV (bit 2)
3505 	 *
3506 	 *    Generate an error on privilege violation (where the CP replaces
3507 	 *    the instruction with a no-op). This also fires for writes into
3508 	 *    read-only scratch pages.
3509 	 *
3510 	 *    This is a non-fatal error, parsing continues.
3511 	 *
3512 	 * - there are a few others defined for odd HW that we do not use
3513 	 *
3514 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
3515 	 * error (as the HW is validating and suppressing the mistakes), we
3516 	 * only unmask the instruction error bit.
3517 	 */
3518 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3519 }
3520 
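/*
 * Program the engine for execlists submission: select run-list mode (or
 * disable legacy mode on gen11+), clear any pending STOP_RING, point
 * RING_HWS_PGA at our status page and unmask the error interrupts.
 */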
3521 static void enable_execlists(struct intel_engine_cs *engine)
3522 {
3523 	u32 mode;
3524 
3525 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3526 
3527 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3528 
3529 	if (INTEL_GEN(engine->i915) >= 11)
3530 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3531 	else
3532 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3533 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3534 
3535 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3536 
3537 	ENGINE_WRITE_FW(engine,
3538 			RING_HWS_PGA,
3539 			i915_ggtt_offset(engine->status_page.vma));
3540 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3541 
3542 	enable_error_interrupt(engine);
3543 
3544 	engine->context_tag = 0;
3545 }
3546 
3547 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3548 {
3549 	bool unexpected = false;
3550 
3551 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3552 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3553 		unexpected = true;
3554 	}
3555 
3556 	return unexpected;
3557 }
3558 
3559 static int execlists_resume(struct intel_engine_cs *engine)
3560 {
3561 	intel_mocs_init_engine(engine);
3562 
3563 	intel_engine_reset_breadcrumbs(engine);
3564 
3565 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3566 		struct drm_printer p = drm_debug_printer(__func__);
3567 
3568 		intel_engine_dump(engine, &p, NULL);
3569 	}
3570 
3571 	enable_execlists(engine);
3572 
3573 	return 0;
3574 }
3575 
3576 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3577 {
3578 	struct intel_engine_execlists * const execlists = &engine->execlists;
3579 	unsigned long flags;
3580 
3581 	ENGINE_TRACE(engine, "depth<-%d\n",
3582 		     atomic_read(&execlists->tasklet.count));
3583 
3584 	/*
3585 	 * Prevent request submission to the hardware until we have
3586 	 * completed the reset in i915_gem_reset_finish(). If a request
3587 	 * is completed by one engine, it may then queue a request
3588 	 * to a second via its execlists->tasklet *just* as we are
3589 	 * calling engine->resume() and also writing the ELSP.
3590 	 * Turning off the execlists->tasklet until the reset is over
3591 	 * prevents the race.
3592 	 */
3593 	__tasklet_disable_sync_once(&execlists->tasklet);
3594 	GEM_BUG_ON(!reset_in_progress(execlists));
3595 
3596 	/* And flush any current direct submission. */
3597 	spin_lock_irqsave(&engine->active.lock, flags);
3598 	spin_unlock_irqrestore(&engine->active.lock, flags);
3599 
3600 	/*
3601 	 * We stop the engines; otherwise we might get a failed reset and a
3602 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer a
3603 	 * system hang if a batchbuffer is still progressing when
3604 	 * the reset is issued, regardless of the READY_TO_RESET ack.
3605 	 * Thus assume it is best to stop the engines on all gens
3606 	 * where we have a gpu reset.
3607 	 *
3608 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3609 	 *
3610 	 * FIXME: Wa for more modern gens needs to be validated
3611 	 */
3612 	intel_engine_stop_cs(engine);
3613 }
3614 
3615 static void reset_csb_pointers(struct intel_engine_cs *engine)
3616 {
3617 	struct intel_engine_execlists * const execlists = &engine->execlists;
3618 	const unsigned int reset_value = execlists->csb_size - 1;
3619 
3620 	ring_set_paused(engine, 0);
3621 
3622 	/*
3623 	 * After a reset, the HW starts writing into CSB entry [0]. We
3624 	 * therefore have to set our HEAD pointer back one entry so that
3625 	 * the *first* entry we check is entry 0. To complicate this further,
3626 	 * as we don't wait for the first interrupt after reset, we have to
3627 	 * fake the HW write to point back to the last entry so that our
3628 	 * inline comparison of our cached head position against the last HW
3629 	 * write works even before the first interrupt.
3630 	 */
3631 	execlists->csb_head = reset_value;
3632 	WRITE_ONCE(*execlists->csb_write, reset_value);
3633 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3634 
3635 	/*
3636 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3637 	 * Bludgeon them with a mmio update to be sure.
3638 	 */
3639 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3640 		     reset_value << 8 | reset_value);
3641 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3642 
3643 	invalidate_csb_entries(&execlists->csb_status[0],
3644 			       &execlists->csb_status[reset_value]);
3645 }
3646 
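/*
 * RING_MI_MODE is a masked-bit register; clear STOP_RING in the context
 * image (value bit cleared, mask bit set in the upper 16 bits) so the
 * engine is allowed to run again once the image is restored.
 */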
3647 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3648 {
3649 	int x;
3650 
3651 	x = lrc_ring_mi_mode(engine);
3652 	if (x != -1) {
3653 		regs[x + 1] &= ~STOP_RING;
3654 		regs[x + 1] |= STOP_RING << 16;
3655 	}
3656 }
3657 
3658 static void __execlists_reset_reg_state(const struct intel_context *ce,
3659 					const struct intel_engine_cs *engine)
3660 {
3661 	u32 *regs = ce->lrc_reg_state;
3662 
3663 	__reset_stop_ring(regs, engine);
3664 }
3665 
3666 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3667 {
3668 	struct intel_engine_execlists * const execlists = &engine->execlists;
3669 	struct intel_context *ce;
3670 	struct i915_request *rq;
3671 	u32 head;
3672 
3673 	mb(); /* paranoia: read the CSB pointers from after the reset */
3674 	clflush(execlists->csb_write);
3675 	mb();
3676 
3677 	process_csb(engine); /* drain preemption events */
3678 
3679 	/* Following the reset, we need to reload the CSB read/write pointers */
3680 	reset_csb_pointers(engine);
3681 
3682 	/*
3683 	 * Save the currently executing context, even if we completed
3684 	 * its request, it was still running at the time of the
3685 	 * reset and will have been clobbered.
3686 	 */
3687 	rq = execlists_active(execlists);
3688 	if (!rq)
3689 		goto unwind;
3690 
3691 	ce = rq->context;
3692 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3693 
3694 	if (i915_request_completed(rq)) {
3695 		/* Idle context; tidy up the ring so we can restart afresh */
3696 		head = intel_ring_wrap(ce->ring, rq->tail);
3697 		goto out_replay;
3698 	}
3699 
3700 	/* We still have requests in-flight; the engine should be active */
3701 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3702 
3703 	/* Context has requests still in-flight; it should not be idle! */
3704 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3705 
3706 	rq = active_request(ce->timeline, rq);
3707 	head = intel_ring_wrap(ce->ring, rq->head);
3708 	GEM_BUG_ON(head == ce->ring->tail);
3709 
3710 	/*
3711 	 * If this request hasn't started yet, e.g. it is waiting on a
3712 	 * semaphore, we need to avoid skipping the request or else we
3713 	 * break the signaling chain. However, if the context is corrupt
3714 	 * the request will not restart and we will be stuck with a wedged
3715 	 * device. It is quite often the case that if we issue a reset
3716 	 * while the GPU is loading the context image, that the context
3717 	 * image becomes corrupt.
3718 	 *
3719 	 * Otherwise, if we have not started yet, the request should replay
3720 	 * perfectly and we do not need to flag the result as being erroneous.
3721 	 */
3722 	if (!i915_request_started(rq))
3723 		goto out_replay;
3724 
3725 	/*
3726 	 * If the request was innocent, we leave the request in the ELSP
3727 	 * and will try to replay it on restarting. The context image may
3728 	 * have been corrupted by the reset, in which case we may have
3729 	 * to service a new GPU hang, but more likely we can continue on
3730 	 * without impact.
3731 	 *
3732 	 * If the request was guilty, we presume the context is corrupt
3733 	 * and have to at least restore the RING register in the context
3734 	 * image back to the expected values to skip over the guilty request.
3735 	 */
3736 	__i915_request_reset(rq, stalled);
3737 	if (!stalled)
3738 		goto out_replay;
3739 
3740 	/*
3741 	 * We want a simple context + ring to execute the breadcrumb update.
3742 	 * We cannot rely on the context being intact across the GPU hang,
3743 	 * so clear it and rebuild just what we need for the breadcrumb.
3744 	 * All pending requests for this context will be zapped, and any
3745 	 * future request will be after userspace has had the opportunity
3746 	 * to recreate its own state.
3747 	 */
3748 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3749 	restore_default_state(ce, engine);
3750 
3751 out_replay:
3752 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3753 		     head, ce->ring->tail);
3754 	__execlists_reset_reg_state(ce, engine);
3755 	__execlists_update_reg_state(ce, engine, head);
3756 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3757 
3758 unwind:
3759 	/* Push back any incomplete requests for replay after the reset. */
3760 	cancel_port_requests(execlists);
3761 	__unwind_incomplete_requests(engine);
3762 }
3763 
3764 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3765 {
3766 	unsigned long flags;
3767 
3768 	ENGINE_TRACE(engine, "\n");
3769 
3770 	spin_lock_irqsave(&engine->active.lock, flags);
3771 
3772 	__execlists_reset(engine, stalled);
3773 
3774 	spin_unlock_irqrestore(&engine->active.lock, flags);
3775 }
3776 
3777 static void nop_submission_tasklet(unsigned long data)
3778 {
3779 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3780 
3781 	/* The driver is wedged; don't process any more events. */
3782 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3783 }
3784 
3785 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3786 {
3787 	struct intel_engine_execlists * const execlists = &engine->execlists;
3788 	struct i915_request *rq, *rn;
3789 	struct rb_node *rb;
3790 	unsigned long flags;
3791 
3792 	ENGINE_TRACE(engine, "\n");
3793 
3794 	/*
3795 	 * Before we call engine->cancel_requests(), we should have exclusive
3796 	 * access to the submission state. This is arranged for us by the
3797 	 * caller disabling the interrupt generation, the tasklet and other
3798 	 * threads that may then access the same state, giving us a free hand
3799 	 * to reset state. However, we still need to let lockdep be aware that
3800 	 * we know this state may be accessed in hardirq context, so we
3801 	 * disable the irq around this manipulation and we want to keep
3802 	 * the spinlock focused on its duties and not accidentally conflate
3803 	 * coverage to the submission's irq state. (Similarly, although we
3804 	 * shouldn't need to disable irq around the manipulation of the
3805 	 * submission's irq state, we also wish to remind ourselves that
3806 	 * it is irq state.)
3807 	 */
3808 	spin_lock_irqsave(&engine->active.lock, flags);
3809 
3810 	__execlists_reset(engine, true);
3811 
3812 	/* Mark all executing requests as skipped. */
3813 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3814 		mark_eio(rq);
3815 
3816 	/* Flush the queued requests to the timeline list (for retiring). */
3817 	while ((rb = rb_first_cached(&execlists->queue))) {
3818 		struct i915_priolist *p = to_priolist(rb);
3819 		int i;
3820 
3821 		priolist_for_each_request_consume(rq, rn, p, i) {
3822 			mark_eio(rq);
3823 			__i915_request_submit(rq);
3824 		}
3825 
3826 		rb_erase_cached(&p->node, &execlists->queue);
3827 		i915_priolist_free(p);
3828 	}
3829 
3830 	/* On-hold requests will be flushed to timeline upon their release */
3831 	list_for_each_entry(rq, &engine->active.hold, sched.link)
3832 		mark_eio(rq);
3833 
3834 	/* Cancel all attached virtual engines */
3835 	while ((rb = rb_first_cached(&execlists->virtual))) {
3836 		struct virtual_engine *ve =
3837 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3838 
3839 		rb_erase_cached(rb, &execlists->virtual);
3840 		RB_CLEAR_NODE(rb);
3841 
3842 		spin_lock(&ve->base.active.lock);
3843 		rq = fetch_and_zero(&ve->request);
3844 		if (rq) {
3845 			mark_eio(rq);
3846 
3847 			rq->engine = engine;
3848 			__i915_request_submit(rq);
3849 			i915_request_put(rq);
3850 
3851 			ve->base.execlists.queue_priority_hint = INT_MIN;
3852 		}
3853 		spin_unlock(&ve->base.active.lock);
3854 	}
3855 
3856 	/* Remaining _unready_ requests will be nop'ed when submitted */
3857 
3858 	execlists->queue_priority_hint = INT_MIN;
3859 	execlists->queue = RB_ROOT_CACHED;
3860 
3861 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3862 	execlists->tasklet.func = nop_submission_tasklet;
3863 
3864 	spin_unlock_irqrestore(&engine->active.lock, flags);
3865 }
3866 
3867 static void execlists_reset_finish(struct intel_engine_cs *engine)
3868 {
3869 	struct intel_engine_execlists * const execlists = &engine->execlists;
3870 
3871 	/*
3872 	 * After a GPU reset, we may have requests to replay. Do so now while
3873 	 * we still have the forcewake to be sure that the GPU is not allowed
3874 	 * to sleep before we restart and reload a context.
3875 	 */
3876 	GEM_BUG_ON(!reset_in_progress(execlists));
3877 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3878 		execlists->tasklet.func(execlists->tasklet.data);
3879 
3880 	if (__tasklet_enable(&execlists->tasklet))
3881 		/* And kick in case we missed a new request submission. */
3882 		tasklet_hi_schedule(&execlists->tasklet);
3883 	ENGINE_TRACE(engine, "depth->%d\n",
3884 		     atomic_read(&execlists->tasklet.count));
3885 }
3886 
3887 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3888 				    u64 offset, u32 len,
3889 				    const unsigned int flags)
3890 {
3891 	u32 *cs;
3892 
3893 	cs = intel_ring_begin(rq, 4);
3894 	if (IS_ERR(cs))
3895 		return PTR_ERR(cs);
3896 
3897 	/*
3898 	 * WaDisableCtxRestoreArbitration:bdw,chv
3899 	 *
3900 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3901 	 * particular on all the gens that do not need the w/a at all!); if we
3902 	 * took care to make sure that on every switch into this context
3903 	 * (both ordinary and for preemption) arbitration was enabled,
3904 	 * we would be fine.  However, for gen8 there is another w/a that
3905 	 * requires us to not preempt inside GPGPU execution, so we keep
3906 	 * arbitration disabled for gen8 batches. Arbitration will be
3907 	 * re-enabled before we close the request
3908 	 * (engine->emit_fini_breadcrumb).
3909 	 */
3910 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3911 
3912 	/* FIXME(BDW+): Address space and security selectors. */
3913 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3914 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3915 	*cs++ = lower_32_bits(offset);
3916 	*cs++ = upper_32_bits(offset);
3917 
3918 	intel_ring_advance(rq, cs);
3919 
3920 	return 0;
3921 }
3922 
3923 static int gen8_emit_bb_start(struct i915_request *rq,
3924 			      u64 offset, u32 len,
3925 			      const unsigned int flags)
3926 {
3927 	u32 *cs;
3928 
3929 	cs = intel_ring_begin(rq, 6);
3930 	if (IS_ERR(cs))
3931 		return PTR_ERR(cs);
3932 
3933 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3934 
3935 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3936 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3937 	*cs++ = lower_32_bits(offset);
3938 	*cs++ = upper_32_bits(offset);
3939 
3940 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3941 	*cs++ = MI_NOOP;
3942 
3943 	intel_ring_advance(rq, cs);
3944 
3945 	return 0;
3946 }
3947 
3948 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3949 {
3950 	ENGINE_WRITE(engine, RING_IMR,
3951 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3952 	ENGINE_POSTING_READ(engine, RING_IMR);
3953 }
3954 
3955 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3956 {
3957 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3958 }
3959 
3960 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3961 {
3962 	u32 cmd, *cs;
3963 
3964 	cs = intel_ring_begin(request, 4);
3965 	if (IS_ERR(cs))
3966 		return PTR_ERR(cs);
3967 
3968 	cmd = MI_FLUSH_DW + 1;
3969 
3970 	/* We always require a command barrier so that subsequent
3971 	 * commands, such as breadcrumb interrupts, are strictly ordered
3972 	 * wrt the contents of the write cache being flushed to memory
3973 	 * (and thus being coherent from the CPU).
3974 	 */
3975 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3976 
3977 	if (mode & EMIT_INVALIDATE) {
3978 		cmd |= MI_INVALIDATE_TLB;
3979 		if (request->engine->class == VIDEO_DECODE_CLASS)
3980 			cmd |= MI_INVALIDATE_BSD;
3981 	}
3982 
3983 	*cs++ = cmd;
3984 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3985 	*cs++ = 0; /* upper addr */
3986 	*cs++ = 0; /* value */
3987 	intel_ring_advance(request, cs);
3988 
3989 	return 0;
3990 }
3991 
3992 static int gen8_emit_flush_render(struct i915_request *request,
3993 				  u32 mode)
3994 {
3995 	bool vf_flush_wa = false, dc_flush_wa = false;
3996 	u32 *cs, flags = 0;
3997 	int len;
3998 
3999 	flags |= PIPE_CONTROL_CS_STALL;
4000 
4001 	if (mode & EMIT_FLUSH) {
4002 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4003 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4004 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4005 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4006 	}
4007 
4008 	if (mode & EMIT_INVALIDATE) {
4009 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4010 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4011 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4012 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4013 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4014 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4015 		flags |= PIPE_CONTROL_QW_WRITE;
4016 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4017 
4018 		/*
4019 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4020 		 * pipe control.
4021 		 */
4022 		if (IS_GEN(request->i915, 9))
4023 			vf_flush_wa = true;
4024 
4025 		/* WaForGAMHang:kbl */
4026 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4027 			dc_flush_wa = true;
4028 	}
4029 
4030 	len = 6;
4031 
4032 	if (vf_flush_wa)
4033 		len += 6;
4034 
4035 	if (dc_flush_wa)
4036 		len += 12;
4037 
4038 	cs = intel_ring_begin(request, len);
4039 	if (IS_ERR(cs))
4040 		return PTR_ERR(cs);
4041 
4042 	if (vf_flush_wa)
4043 		cs = gen8_emit_pipe_control(cs, 0, 0);
4044 
4045 	if (dc_flush_wa)
4046 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4047 					    0);
4048 
4049 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4050 
4051 	if (dc_flush_wa)
4052 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4053 
4054 	intel_ring_advance(request, cs);
4055 
4056 	return 0;
4057 }
4058 
4059 static int gen11_emit_flush_render(struct i915_request *request,
4060 				   u32 mode)
4061 {
4062 	if (mode & EMIT_FLUSH) {
4063 		u32 *cs;
4064 		u32 flags = 0;
4065 
4066 		flags |= PIPE_CONTROL_CS_STALL;
4067 
4068 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4069 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4070 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4071 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4072 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4073 		flags |= PIPE_CONTROL_QW_WRITE;
4074 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4075 
4076 		cs = intel_ring_begin(request, 6);
4077 		if (IS_ERR(cs))
4078 			return PTR_ERR(cs);
4079 
4080 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4081 		intel_ring_advance(request, cs);
4082 	}
4083 
4084 	if (mode & EMIT_INVALIDATE) {
4085 		u32 *cs;
4086 		u32 flags = 0;
4087 
4088 		flags |= PIPE_CONTROL_CS_STALL;
4089 
4090 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4091 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4092 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4093 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4094 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4095 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4096 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4097 		flags |= PIPE_CONTROL_QW_WRITE;
4098 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4099 
4100 		cs = intel_ring_begin(request, 6);
4101 		if (IS_ERR(cs))
4102 			return PTR_ERR(cs);
4103 
4104 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4105 		intel_ring_advance(request, cs);
4106 	}
4107 
4108 	return 0;
4109 }
4110 
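/*
 * Build the MI_ARB_CHECK dword used on gen12+ to toggle the command
 * streamer's pre-parser. Bit 8 is presumed to be the masked-write enable
 * for the state bit in bit 0, so only that field is updated.
 */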
4111 static u32 preparser_disable(bool state)
4112 {
4113 	return MI_ARB_CHECK | 1 << 8 | state;
4114 }
4115 
4116 static int gen12_emit_flush_render(struct i915_request *request,
4117 				   u32 mode)
4118 {
4119 	if (mode & EMIT_FLUSH) {
4120 		u32 flags = 0;
4121 		u32 *cs;
4122 
4123 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4124 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4125 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4126 		/* Wa_1409600907:tgl */
4127 		flags |= PIPE_CONTROL_DEPTH_STALL;
4128 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4129 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4130 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4131 
4132 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4133 		flags |= PIPE_CONTROL_QW_WRITE;
4134 
4135 		flags |= PIPE_CONTROL_CS_STALL;
4136 
4137 		cs = intel_ring_begin(request, 6);
4138 		if (IS_ERR(cs))
4139 			return PTR_ERR(cs);
4140 
4141 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4142 		intel_ring_advance(request, cs);
4143 	}
4144 
4145 	if (mode & EMIT_INVALIDATE) {
4146 		u32 flags = 0;
4147 		u32 *cs;
4148 
4149 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4150 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4151 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4152 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4153 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4154 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4155 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4156 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4157 
4158 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4159 		flags |= PIPE_CONTROL_QW_WRITE;
4160 
4161 		flags |= PIPE_CONTROL_CS_STALL;
4162 
4163 		cs = intel_ring_begin(request, 8);
4164 		if (IS_ERR(cs))
4165 			return PTR_ERR(cs);
4166 
4167 		/*
4168 		 * Prevent the pre-parser from skipping past the TLB
4169 		 * invalidate and loading a stale page for the batch
4170 		 * buffer / request payload.
4171 		 */
4172 		*cs++ = preparser_disable(true);
4173 
4174 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4175 
4176 		*cs++ = preparser_disable(false);
4177 		intel_ring_advance(request, cs);
4178 	}
4179 
4180 	return 0;
4181 }
4182 
4183 /*
4184  * Reserve space for 2 NOOPs at the end of each request to be
4185  * used as a workaround for not being allowed to do lite
4186  * restore with HEAD==TAIL (WaIdleLiteRestore).
4187  */
4188 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4189 {
4190 	/* Ensure there's always at least one preemption point per-request. */
4191 	*cs++ = MI_ARB_CHECK;
4192 	*cs++ = MI_NOOP;
4193 	request->wa_tail = intel_ring_offset(request, cs);
4194 
4195 	return cs;
4196 }
4197 
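/*
 * Append the preempt-to-busy semaphore: the CS polls the preemption slot
 * in the HWSP and stalls here while ring_set_paused() holds a non-zero
 * value, giving the submission tasklet a preemption point between requests.
 */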
4198 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4199 {
4200 	*cs++ = MI_SEMAPHORE_WAIT |
4201 		MI_SEMAPHORE_GLOBAL_GTT |
4202 		MI_SEMAPHORE_POLL |
4203 		MI_SEMAPHORE_SAD_EQ_SDD;
4204 	*cs++ = 0;
4205 	*cs++ = intel_hws_preempt_address(request->engine);
4206 	*cs++ = 0;
4207 
4208 	return cs;
4209 }
4210 
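/*
 * Common tail for the final breadcrumb: raise the user interrupt,
 * re-enable arbitration, optionally add the preemption busywait, and
 * record the ring tail before the WaIdleLiteRestore padding.
 */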
4211 static __always_inline u32*
4212 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4213 				 u32 *cs)
4214 {
4215 	*cs++ = MI_USER_INTERRUPT;
4216 
4217 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4218 	if (intel_engine_has_semaphores(request->engine))
4219 		cs = emit_preempt_busywait(request, cs);
4220 
4221 	request->tail = intel_ring_offset(request, cs);
4222 	assert_ring_tail_valid(request->ring, request->tail);
4223 
4224 	return gen8_emit_wa_tail(request, cs);
4225 }
4226 
4227 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4228 {
4229 	cs = gen8_emit_ggtt_write(cs,
4230 				  request->fence.seqno,
4231 				  i915_request_active_timeline(request)->hwsp_offset,
4232 				  0);
4233 
4234 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4235 }
4236 
4237 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4238 {
4239 	cs = gen8_emit_pipe_control(cs,
4240 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4241 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4242 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4243 				    0);
4244 
4245 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4246 	cs = gen8_emit_ggtt_write_rcs(cs,
4247 				      request->fence.seqno,
4248 				      i915_request_active_timeline(request)->hwsp_offset,
4249 				      PIPE_CONTROL_FLUSH_ENABLE |
4250 				      PIPE_CONTROL_CS_STALL);
4251 
4252 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4253 }
4254 
4255 static u32 *
4256 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4257 {
4258 	cs = gen8_emit_ggtt_write_rcs(cs,
4259 				      request->fence.seqno,
4260 				      i915_request_active_timeline(request)->hwsp_offset,
4261 				      PIPE_CONTROL_CS_STALL |
4262 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4263 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4264 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4265 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4266 				      PIPE_CONTROL_FLUSH_ENABLE);
4267 
4268 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4269 }
4270 
4271 /*
4272  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4273  * flush and will continue pre-fetching the instructions after it before the
4274  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4275  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4276  * of the next request before the memory has been flushed, we're guaranteed that
4277  * we won't access the batch itself too early.
4278  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4279  * so, if the current request is modifying an instruction in the next request on
4280  * the same intel_context, we might pre-fetch and then execute the pre-update
4281  * instruction. To avoid this, the users of self-modifying code should either
4282  * disable the parser around the code emitting the memory writes, via a new flag
4283  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4284  * the in-kernel use-cases we've opted to use a separate context, see
4285  * reloc_gpu() as an example.
4286  * All the above applies only to the instructions themselves. Non-inline data
4287  * used by the instructions is not pre-fetched.
4288  */
4289 
4290 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4291 {
4292 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4293 		MI_SEMAPHORE_GLOBAL_GTT |
4294 		MI_SEMAPHORE_POLL |
4295 		MI_SEMAPHORE_SAD_EQ_SDD;
4296 	*cs++ = 0;
4297 	*cs++ = intel_hws_preempt_address(request->engine);
4298 	*cs++ = 0;
4299 	*cs++ = 0;
4300 	*cs++ = MI_NOOP;
4301 
4302 	return cs;
4303 }
4304 
4305 static __always_inline u32*
4306 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4307 {
4308 	*cs++ = MI_USER_INTERRUPT;
4309 
4310 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4311 	if (intel_engine_has_semaphores(request->engine))
4312 		cs = gen12_emit_preempt_busywait(request, cs);
4313 
4314 	request->tail = intel_ring_offset(request, cs);
4315 	assert_ring_tail_valid(request->ring, request->tail);
4316 
4317 	return gen8_emit_wa_tail(request, cs);
4318 }
4319 
4320 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4321 {
4322 	cs = gen8_emit_ggtt_write(cs,
4323 				  request->fence.seqno,
4324 				  i915_request_active_timeline(request)->hwsp_offset,
4325 				  0);
4326 
4327 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4328 }
4329 
4330 static u32 *
4331 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4332 {
4333 	cs = gen8_emit_ggtt_write_rcs(cs,
4334 				      request->fence.seqno,
4335 				      i915_request_active_timeline(request)->hwsp_offset,
4336 				      PIPE_CONTROL_CS_STALL |
4337 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4338 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4339 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4340 				      /* Wa_1409600907:tgl */
4341 				      PIPE_CONTROL_DEPTH_STALL |
4342 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4343 				      PIPE_CONTROL_FLUSH_ENABLE |
4344 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4345 
4346 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4347 }
4348 
4349 static void execlists_park(struct intel_engine_cs *engine)
4350 {
4351 	cancel_timer(&engine->execlists.timer);
4352 	cancel_timer(&engine->execlists.preempt);
4353 }
4354 
4355 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4356 {
4357 	engine->submit_request = execlists_submit_request;
4358 	engine->schedule = i915_schedule;
4359 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4360 
4361 	engine->reset.prepare = execlists_reset_prepare;
4362 	engine->reset.rewind = execlists_reset_rewind;
4363 	engine->reset.cancel = execlists_reset_cancel;
4364 	engine->reset.finish = execlists_reset_finish;
4365 
4366 	engine->park = execlists_park;
4367 	engine->unpark = NULL;
4368 
4369 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4370 	if (!intel_vgpu_active(engine->i915)) {
4371 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4372 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4373 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4374 	}
4375 
4376 	if (INTEL_GEN(engine->i915) >= 12)
4377 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4378 
4379 	if (intel_engine_has_preemption(engine))
4380 		engine->emit_bb_start = gen8_emit_bb_start;
4381 	else
4382 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4383 }
4384 
4385 static void execlists_shutdown(struct intel_engine_cs *engine)
4386 {
4387 	/* Synchronise with residual timers and any softirq they raise */
4388 	del_timer_sync(&engine->execlists.timer);
4389 	del_timer_sync(&engine->execlists.preempt);
4390 	tasklet_kill(&engine->execlists.tasklet);
4391 }
4392 
4393 static void execlists_release(struct intel_engine_cs *engine)
4394 {
4395 	execlists_shutdown(engine);
4396 
4397 	intel_engine_cleanup_common(engine);
4398 	lrc_destroy_wa_ctx(engine);
4399 }
4400 
4401 static void
4402 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4403 {
4404 	/* Default vfuncs which can be overridden by each engine. */
4405 
4406 	engine->resume = execlists_resume;
4407 
4408 	engine->cops = &execlists_context_ops;
4409 	engine->request_alloc = execlists_request_alloc;
4410 
4411 	engine->emit_flush = gen8_emit_flush;
4412 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4413 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4414 	if (INTEL_GEN(engine->i915) >= 12)
4415 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4416 
4417 	engine->set_default_submission = intel_execlists_set_default_submission;
4418 
4419 	if (INTEL_GEN(engine->i915) < 11) {
4420 		engine->irq_enable = gen8_logical_ring_enable_irq;
4421 		engine->irq_disable = gen8_logical_ring_disable_irq;
4422 	} else {
4423 		/*
4424 		 * TODO: On Gen11 interrupt masks need to be clear
4425 		 * to allow C6 entry. Keep interrupts enabled at all times
4426 		 * and take the hit of generating extra interrupts
4427 		 * until a more refined solution exists.
4428 		 */
4429 	}
4430 }
4431 
4432 static inline void
4433 logical_ring_default_irqs(struct intel_engine_cs *engine)
4434 {
4435 	unsigned int shift = 0;
4436 
4437 	if (INTEL_GEN(engine->i915) < 11) {
4438 		const u8 irq_shifts[] = {
4439 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4440 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4441 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4442 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4443 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4444 		};
4445 
4446 		shift = irq_shifts[engine->id];
4447 	}
4448 
4449 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4450 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4451 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4452 }
4453 
4454 static void rcs_submission_override(struct intel_engine_cs *engine)
4455 {
4456 	switch (INTEL_GEN(engine->i915)) {
4457 	case 12:
4458 		engine->emit_flush = gen12_emit_flush_render;
4459 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4460 		break;
4461 	case 11:
4462 		engine->emit_flush = gen11_emit_flush_render;
4463 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4464 		break;
4465 	default:
4466 		engine->emit_flush = gen8_emit_flush_render;
4467 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4468 		break;
4469 	}
4470 }
4471 
4472 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4473 {
4474 	struct intel_engine_execlists * const execlists = &engine->execlists;
4475 	struct drm_i915_private *i915 = engine->i915;
4476 	struct intel_uncore *uncore = engine->uncore;
4477 	u32 base = engine->mmio_base;
4478 
4479 	tasklet_init(&engine->execlists.tasklet,
4480 		     execlists_submission_tasklet, (unsigned long)engine);
4481 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4482 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4483 
4484 	logical_ring_default_vfuncs(engine);
4485 	logical_ring_default_irqs(engine);
4486 
4487 	if (engine->class == RENDER_CLASS)
4488 		rcs_submission_override(engine);
4489 
4490 	if (intel_init_workaround_bb(engine))
4491 		/*
4492 		 * We continue even if we fail to initialize the WA batch,
4493 		 * because we only expect rare glitches and nothing
4494 		 * critical enough to prevent us from using the GPU
4495 		 */
4496 		DRM_ERROR("WA batch buffer initialization failed\n");
4497 
4498 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4499 		execlists->submit_reg = uncore->regs +
4500 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4501 		execlists->ctrl_reg = uncore->regs +
4502 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4503 	} else {
4504 		execlists->submit_reg = uncore->regs +
4505 			i915_mmio_reg_offset(RING_ELSP(base));
4506 	}
4507 
4508 	execlists->csb_status =
4509 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4510 
4511 	execlists->csb_write =
4512 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4513 
4514 	if (INTEL_GEN(i915) < 11)
4515 		execlists->csb_size = GEN8_CSB_ENTRIES;
4516 	else
4517 		execlists->csb_size = GEN11_CSB_ENTRIES;
4518 
4519 	reset_csb_pointers(engine);
4520 
4521 	/* Finally, take ownership and responsibility for cleanup! */
4522 	engine->release = execlists_release;
4523 
4524 	return 0;
4525 }
4526 
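/*
 * Each gen defines a default offset for the RCS indirect context; return
 * the per-gen constant so init_wa_bb_reg_state() can shift it into the
 * INDIRECT_CTX_OFFSET field.
 */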
4527 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4528 {
4529 	u32 indirect_ctx_offset;
4530 
4531 	switch (INTEL_GEN(engine->i915)) {
4532 	default:
4533 		MISSING_CASE(INTEL_GEN(engine->i915));
4534 		/* fall through */
4535 	case 12:
4536 		indirect_ctx_offset =
4537 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4538 		break;
4539 	case 11:
4540 		indirect_ctx_offset =
4541 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4542 		break;
4543 	case 10:
4544 		indirect_ctx_offset =
4545 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4546 		break;
4547 	case 9:
4548 		indirect_ctx_offset =
4549 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4550 		break;
4551 	case 8:
4552 		indirect_ctx_offset =
4553 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4554 		break;
4555 	}
4556 
4557 	return indirect_ctx_offset;
4558 }
4559 
4560 
4561 static void init_common_reg_state(u32 * const regs,
4562 				  const struct intel_engine_cs *engine,
4563 				  const struct intel_ring *ring,
4564 				  bool inhibit)
4565 {
4566 	u32 ctl;
4567 
4568 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4569 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4570 	if (inhibit)
4571 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4572 	if (INTEL_GEN(engine->i915) < 11)
4573 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4574 					   CTX_CTRL_RS_CTX_ENABLE);
4575 	regs[CTX_CONTEXT_CONTROL] = ctl;
4576 
4577 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4578 }
4579 
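/*
 * Point the context image at the workaround batches: pos_bb_per_ctx is the
 * BB_PER_CTX_PTR register slot, with the INDIRECT_CTX pointer and offset
 * stored two and four dwords further along.
 */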
4580 static void init_wa_bb_reg_state(u32 * const regs,
4581 				 const struct intel_engine_cs *engine,
4582 				 u32 pos_bb_per_ctx)
4583 {
4584 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4585 
4586 	if (wa_ctx->per_ctx.size) {
4587 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4588 
4589 		regs[pos_bb_per_ctx] =
4590 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4591 	}
4592 
4593 	if (wa_ctx->indirect_ctx.size) {
4594 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4595 
4596 		regs[pos_bb_per_ctx + 2] =
4597 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4598 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4599 
4600 		regs[pos_bb_per_ctx + 4] =
4601 			intel_lr_indirect_ctx_offset(engine) << 6;
4602 	}
4603 }
4604 
4605 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4606 {
4607 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4608 		/* 64b PPGTT (48bit canonical):
4609 		 * PDP0_DESCRIPTOR contains the base address of the PML4 and
4610 		 * the other PDP descriptors are ignored.
4611 		 */
4612 		ASSIGN_CTX_PML4(ppgtt, regs);
4613 	} else {
4614 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4615 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4616 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4617 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4618 	}
4619 }
4620 
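/*
 * Contexts using the GGTT borrow its aliasing ppgtt for their page-table
 * state; everything else already has a real ppgtt to use.
 */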
4621 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4622 {
4623 	if (i915_is_ggtt(vm))
4624 		return i915_vm_to_ggtt(vm)->alias;
4625 	else
4626 		return i915_vm_to_ppgtt(vm);
4627 }
4628 
4629 static void execlists_init_reg_state(u32 *regs,
4630 				     const struct intel_context *ce,
4631 				     const struct intel_engine_cs *engine,
4632 				     const struct intel_ring *ring,
4633 				     bool inhibit)
4634 {
4635 	/*
4636 	 * A context is actually a big batch buffer with several
4637 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4638 	 * values we are setting here are only for the first context restore:
4639 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4640 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4641 	 * we are not initializing here).
4642 	 *
4643 	 * Must keep consistent with virtual_update_register_offsets().
4644 	 */
4645 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4646 
4647 	init_common_reg_state(regs, engine, ring, inhibit);
4648 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4649 
4650 	init_wa_bb_reg_state(regs, engine,
4651 			     INTEL_GEN(engine->i915) >= 12 ?
4652 			     GEN12_CTX_BB_PER_CTX_PTR :
4653 			     CTX_BB_PER_CTX_PTR);
4654 
4655 	__reset_stop_ring(regs, engine);
4656 }
4657 
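/*
 * Fill in a freshly allocated context image: copy the engine's default
 * state when available (otherwise the first restore is inhibited), clear
 * the per-process HWSP and then write the register state page.
 */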
4658 static int
4659 populate_lr_context(struct intel_context *ce,
4660 		    struct drm_i915_gem_object *ctx_obj,
4661 		    struct intel_engine_cs *engine,
4662 		    struct intel_ring *ring)
4663 {
4664 	bool inhibit = true;
4665 	void *vaddr;
4666 	int ret;
4667 
4668 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4669 	if (IS_ERR(vaddr)) {
4670 		ret = PTR_ERR(vaddr);
4671 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4672 		return ret;
4673 	}
4674 
4675 	set_redzone(vaddr, engine);
4676 
4677 	if (engine->default_state) {
4678 		void *defaults;
4679 
4680 		defaults = i915_gem_object_pin_map(engine->default_state,
4681 						   I915_MAP_WB);
4682 		if (IS_ERR(defaults)) {
4683 			ret = PTR_ERR(defaults);
4684 			goto err_unpin_ctx;
4685 		}
4686 
4687 		memcpy(vaddr, defaults, engine->context_size);
4688 		i915_gem_object_unpin_map(engine->default_state);
4689 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4690 		inhibit = false;
4691 	}
4692 
4693 	/* Clear the ppHWSP (inc. per-context counters) */
4694 	memset(vaddr, 0, PAGE_SIZE);
4695 
4696 	/*
4697 	 * The second page of the context object contains some registers which
4698 	 * must be set up prior to the first execution.
4699 	 */
4700 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4701 				 ce, engine, ring, inhibit);
4702 
4703 	ret = 0;
4704 err_unpin_ctx:
4705 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4706 	i915_gem_object_unpin_map(ctx_obj);
4707 	return ret;
4708 }
4709 
4710 static int __execlists_context_alloc(struct intel_context *ce,
4711 				     struct intel_engine_cs *engine)
4712 {
4713 	struct drm_i915_gem_object *ctx_obj;
4714 	struct intel_ring *ring;
4715 	struct i915_vma *vma;
4716 	u32 context_size;
4717 	int ret;
4718 
4719 	GEM_BUG_ON(ce->state);
4720 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4721 
4722 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4723 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4724 
4725 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4726 	if (IS_ERR(ctx_obj))
4727 		return PTR_ERR(ctx_obj);
4728 
4729 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4730 	if (IS_ERR(vma)) {
4731 		ret = PTR_ERR(vma);
4732 		goto error_deref_obj;
4733 	}
4734 
4735 	if (!ce->timeline) {
4736 		struct intel_timeline *tl;
4737 		struct i915_vma *hwsp;
4738 
4739 		/*
4740 		 * Use the static global HWSP for the kernel context, and
4741 		 * a dynamically allocated cacheline for everyone else.
4742 		 */
4743 		hwsp = NULL;
4744 		if (unlikely(intel_context_is_barrier(ce)))
4745 			hwsp = engine->status_page.vma;
4746 
4747 		tl = intel_timeline_create(engine->gt, hwsp);
4748 		if (IS_ERR(tl)) {
4749 			ret = PTR_ERR(tl);
4750 			goto error_deref_obj;
4751 		}
4752 
4753 		ce->timeline = tl;
4754 	}
4755 
4756 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4757 	if (IS_ERR(ring)) {
4758 		ret = PTR_ERR(ring);
4759 		goto error_deref_obj;
4760 	}
4761 
4762 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4763 	if (ret) {
4764 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4765 		goto error_ring_free;
4766 	}
4767 
4768 	ce->ring = ring;
4769 	ce->state = vma;
4770 
4771 	return 0;
4772 
4773 error_ring_free:
4774 	intel_ring_put(ring);
4775 error_deref_obj:
4776 	i915_gem_object_put(ctx_obj);
4777 	return ret;
4778 }
4779 
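/*
 * The virtual engine holds its single pending request on this list; reuse
 * the default priolist as a convenient, always-present list head.
 */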
4780 static struct list_head *virtual_queue(struct virtual_engine *ve)
4781 {
4782 	return &ve->base.execlists.default_priolist.requests[0];
4783 }
4784 
4785 static void virtual_context_destroy(struct kref *kref)
4786 {
4787 	struct virtual_engine *ve =
4788 		container_of(kref, typeof(*ve), context.ref);
4789 	unsigned int n;
4790 
4791 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4792 	GEM_BUG_ON(ve->request);
4793 	GEM_BUG_ON(ve->context.inflight);
4794 
4795 	for (n = 0; n < ve->num_siblings; n++) {
4796 		struct intel_engine_cs *sibling = ve->siblings[n];
4797 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4798 		unsigned long flags;
4799 
4800 		if (RB_EMPTY_NODE(node))
4801 			continue;
4802 
4803 		spin_lock_irqsave(&sibling->active.lock, flags);
4804 
4805 		/* Detachment is lazily performed in the execlists tasklet */
4806 		if (!RB_EMPTY_NODE(node))
4807 			rb_erase_cached(node, &sibling->execlists.virtual);
4808 
4809 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4810 	}
4811 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4812 
4813 	if (ve->context.state)
4814 		__execlists_context_fini(&ve->context);
4815 	intel_context_fini(&ve->context);
4816 
4817 	kfree(ve->bonds);
4818 	kfree(ve);
4819 }
4820 
4821 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4822 {
4823 	int swp;
4824 
4825 	/*
4826 	 * Pick a random sibling on starting to help spread the load around.
4827 	 *
4828 	 * New contexts are typically created with exactly the same order
4829 	 * of siblings, and often started in batches. Due to the way we iterate
4830 	 * the array of siblings when submitting requests, sibling[0] is
4831 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4832 	 * randomised across the system, we also help spread the load by the
4833 	 * first engine we inspect being different each time.
4834 	 *
4835 	 * NB: This does not force us to execute on this engine; it will just
4836 	 * typically be the first we inspect for submission.
4837 	 */
4838 	swp = prandom_u32_max(ve->num_siblings);
4839 	if (!swp)
4840 		return;
4841 
4842 	swap(ve->siblings[swp], ve->siblings[0]);
4843 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4844 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4845 						ve->siblings[0]);
4846 }
4847 
4848 static int virtual_context_alloc(struct intel_context *ce)
4849 {
4850 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4851 
4852 	return __execlists_context_alloc(ce, ve->siblings[0]);
4853 }
4854 
4855 static int virtual_context_pin(struct intel_context *ce)
4856 {
4857 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4858 	int err;
4859 
4860 	/* Note: we must use a real engine class for setting up reg state */
4861 	err = __execlists_context_pin(ce, ve->siblings[0]);
4862 	if (err)
4863 		return err;
4864 
4865 	virtual_engine_initial_hint(ve);
4866 	return 0;
4867 }
4868 
4869 static void virtual_context_enter(struct intel_context *ce)
4870 {
4871 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4872 	unsigned int n;
4873 
4874 	for (n = 0; n < ve->num_siblings; n++)
4875 		intel_engine_pm_get(ve->siblings[n]);
4876 
4877 	intel_timeline_enter(ce->timeline);
4878 }
4879 
4880 static void virtual_context_exit(struct intel_context *ce)
4881 {
4882 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4883 	unsigned int n;
4884 
4885 	intel_timeline_exit(ce->timeline);
4886 
4887 	for (n = 0; n < ve->num_siblings; n++)
4888 		intel_engine_pm_put(ve->siblings[n]);
4889 }
4890 
4891 static const struct intel_context_ops virtual_context_ops = {
4892 	.alloc = virtual_context_alloc,
4893 
4894 	.pin = virtual_context_pin,
4895 	.unpin = execlists_context_unpin,
4896 
4897 	.enter = virtual_context_enter,
4898 	.exit = virtual_context_exit,
4899 
4900 	.destroy = virtual_context_destroy,
4901 };
4902 
4903 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4904 {
4905 	struct i915_request *rq;
4906 	intel_engine_mask_t mask;
4907 
4908 	rq = READ_ONCE(ve->request);
4909 	if (!rq)
4910 		return 0;
4911 
4912 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4913 	mask = rq->execution_mask;
4914 	if (unlikely(!mask)) {
4915 		/* Invalid selection; submit to an arbitrary engine and flag the error */
4916 		i915_request_set_error_once(rq, -ENODEV);
4917 		mask = ve->siblings[0]->mask;
4918 	}
4919 
4920 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4921 		     rq->fence.context, rq->fence.seqno,
4922 		     mask, ve->base.execlists.queue_priority_hint);
4923 
4924 	return mask;
4925 }
4926 
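/*
 * Propagate the pending virtual request to its physical siblings: insert
 * (or reposition) this virtual engine's node in each eligible sibling's
 * priority-sorted rbtree and kick any sibling whose queue we now outrank.
 */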
4927 static void virtual_submission_tasklet(unsigned long data)
4928 {
4929 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4930 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
4931 	intel_engine_mask_t mask;
4932 	unsigned int n;
4933 
4934 	rcu_read_lock();
4935 	mask = virtual_submission_mask(ve);
4936 	rcu_read_unlock();
4937 	if (unlikely(!mask))
4938 		return;
4939 
4940 	local_irq_disable();
4941 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4942 		struct intel_engine_cs *sibling = ve->siblings[n];
4943 		struct ve_node * const node = &ve->nodes[sibling->id];
4944 		struct rb_node **parent, *rb;
4945 		bool first;
4946 
4947 		if (unlikely(!(mask & sibling->mask))) {
4948 			if (!RB_EMPTY_NODE(&node->rb)) {
4949 				spin_lock(&sibling->active.lock);
4950 				rb_erase_cached(&node->rb,
4951 						&sibling->execlists.virtual);
4952 				RB_CLEAR_NODE(&node->rb);
4953 				spin_unlock(&sibling->active.lock);
4954 			}
4955 			continue;
4956 		}
4957 
4958 		spin_lock(&sibling->active.lock);
4959 
4960 		if (!RB_EMPTY_NODE(&node->rb)) {
4961 			/*
4962 			 * Cheat and avoid rebalancing the tree if we can
4963 			 * reuse this node in situ.
4964 			 */
4965 			first = rb_first_cached(&sibling->execlists.virtual) ==
4966 				&node->rb;
4967 			if (prio == node->prio || (prio > node->prio && first))
4968 				goto submit_engine;
4969 
4970 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4971 		}
4972 
4973 		rb = NULL;
4974 		first = true;
4975 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4976 		while (*parent) {
4977 			struct ve_node *other;
4978 
4979 			rb = *parent;
4980 			other = rb_entry(rb, typeof(*other), rb);
4981 			if (prio > other->prio) {
4982 				parent = &rb->rb_left;
4983 			} else {
4984 				parent = &rb->rb_right;
4985 				first = false;
4986 			}
4987 		}
4988 
4989 		rb_link_node(&node->rb, rb, parent);
4990 		rb_insert_color_cached(&node->rb,
4991 				       &sibling->execlists.virtual,
4992 				       first);
4993 
4994 submit_engine:
4995 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4996 		node->prio = prio;
4997 		if (first && prio > sibling->execlists.queue_priority_hint) {
4998 			sibling->execlists.queue_priority_hint = prio;
4999 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5000 		}
5001 
5002 		spin_unlock(&sibling->active.lock);
5003 	}
5004 	local_irq_enable();
5005 }
5006 
5007 static void virtual_submit_request(struct i915_request *rq)
5008 {
5009 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5010 	struct i915_request *old;
5011 	unsigned long flags;
5012 
5013 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5014 		     rq->fence.context,
5015 		     rq->fence.seqno);
5016 
5017 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5018 
5019 	spin_lock_irqsave(&ve->base.active.lock, flags);
5020 
5021 	old = ve->request;
5022 	if (old) { /* background completion event from preempt-to-busy */
5023 		GEM_BUG_ON(!i915_request_completed(old));
5024 		__i915_request_submit(old);
5025 		i915_request_put(old);
5026 	}
5027 
5028 	if (i915_request_completed(rq)) {
5029 		__i915_request_submit(rq);
5030 
5031 		ve->base.execlists.queue_priority_hint = INT_MIN;
5032 		ve->request = NULL;
5033 	} else {
5034 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5035 		ve->request = i915_request_get(rq);
5036 
5037 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5038 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5039 
5040 		tasklet_schedule(&ve->base.execlists.tasklet);
5041 	}
5042 
5043 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5044 }
5045 
5046 static struct ve_bond *
5047 virtual_find_bond(struct virtual_engine *ve,
5048 		  const struct intel_engine_cs *master)
5049 {
5050 	int i;
5051 
5052 	for (i = 0; i < ve->num_bonds; i++) {
5053 		if (ve->bonds[i].master == master)
5054 			return &ve->bonds[i];
5055 	}
5056 
5057 	return NULL;
5058 }
5059 
5060 static void
5061 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5062 {
5063 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5064 	intel_engine_mask_t allowed, exec;
5065 	struct ve_bond *bond;
5066 
5067 	allowed = ~to_request(signal)->engine->mask;
5068 
5069 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5070 	if (bond)
5071 		allowed &= bond->sibling_mask;
5072 
5073 	/* Restrict the bonded request to run on only the available engines */
5074 	exec = READ_ONCE(rq->execution_mask);
5075 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5076 		;
5077 
5078 	/* Prevent the master from being re-run on the bonded engines */
5079 	to_request(signal)->execution_mask &= ~allowed;
5080 }
5081 
5082 struct intel_context *
5083 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5084 			       unsigned int count)
5085 {
5086 	struct virtual_engine *ve;
5087 	unsigned int n;
5088 	int err;
5089 
5090 	if (count == 0)
5091 		return ERR_PTR(-EINVAL);
5092 
5093 	if (count == 1)
5094 		return intel_context_create(siblings[0]);
5095 
5096 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5097 	if (!ve)
5098 		return ERR_PTR(-ENOMEM);
5099 
5100 	ve->base.i915 = siblings[0]->i915;
5101 	ve->base.gt = siblings[0]->gt;
5102 	ve->base.uncore = siblings[0]->uncore;
5103 	ve->base.id = -1;
5104 
5105 	ve->base.class = OTHER_CLASS;
5106 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5107 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5108 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5109 
5110 	/*
5111 	 * The decision on whether to submit a request using semaphores
5112 	 * depends on the saturated state of the engine. We only compute
5113 	 * this during HW submission of the request, and we need this
5114 	 * state to be globally applied to all requests being submitted
5115 	 * to this engine. Virtual engines encompass more than one physical
5116 	 * engine and so we cannot accurately tell in advance if one of those
5117 	 * engines is already saturated and so cannot afford to use a semaphore
5118 	 * and be pessimized in priority for doing so -- if we are the only
5119 	 * context using semaphores after all other clients have stopped, we
5120 	 * will be starved on the saturated system. Such a global switch for
5121 	 * semaphores is less than ideal, but alas is the current compromise.
5122 	 */
5123 	ve->base.saturated = ALL_ENGINES;
5124 
5125 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5126 
5127 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5128 	intel_engine_init_breadcrumbs(&ve->base);
5129 	intel_engine_init_execlists(&ve->base);
5130 
5131 	ve->base.cops = &virtual_context_ops;
5132 	ve->base.request_alloc = execlists_request_alloc;
5133 
5134 	ve->base.schedule = i915_schedule;
5135 	ve->base.submit_request = virtual_submit_request;
5136 	ve->base.bond_execute = virtual_bond_execute;
5137 
5138 	INIT_LIST_HEAD(virtual_queue(ve));
5139 	ve->base.execlists.queue_priority_hint = INT_MIN;
5140 	tasklet_init(&ve->base.execlists.tasklet,
5141 		     virtual_submission_tasklet,
5142 		     (unsigned long)ve);
5143 
5144 	intel_context_init(&ve->context, &ve->base);
5145 
5146 	for (n = 0; n < count; n++) {
5147 		struct intel_engine_cs *sibling = siblings[n];
5148 
5149 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5150 		if (sibling->mask & ve->base.mask) {
5151 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5152 				  sibling->name);
5153 			err = -EINVAL;
5154 			goto err_put;
5155 		}
5156 
5157 		/*
5158 		 * The virtual engine implementation is tightly coupled to
5159 		 * the execlists backend -- we push the request directly
5160 		 * into a tree inside each physical engine. We could support
5161 		 * layering if we handle cloning of the requests and
5162 		 * submitting a copy into each backend.
5163 		 */
5164 		if (sibling->execlists.tasklet.func !=
5165 		    execlists_submission_tasklet) {
5166 			err = -ENODEV;
5167 			goto err_put;
5168 		}
5169 
5170 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5171 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5172 
5173 		ve->siblings[ve->num_siblings++] = sibling;
5174 		ve->base.mask |= sibling->mask;
5175 
5176 		/*
5177 		 * All physical engines must have compatible emission
5178 		 * functions (as we build the instructions during request
5179 		 * construction and do not alter them before submission
5180 		 * on the physical engine). We use the engine class as a guide
5181 		 * here, although that could be refined.
5182 		 */
5183 		if (ve->base.class != OTHER_CLASS) {
5184 			if (ve->base.class != sibling->class) {
5185 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5186 					  sibling->class, ve->base.class);
5187 				err = -EINVAL;
5188 				goto err_put;
5189 			}
5190 			continue;
5191 		}
5192 
5193 		ve->base.class = sibling->class;
5194 		ve->base.uabi_class = sibling->uabi_class;
5195 		snprintf(ve->base.name, sizeof(ve->base.name),
5196 			 "v%dx%d", ve->base.class, count);
5197 		ve->base.context_size = sibling->context_size;
5198 
5199 		ve->base.emit_bb_start = sibling->emit_bb_start;
5200 		ve->base.emit_flush = sibling->emit_flush;
5201 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5202 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5203 		ve->base.emit_fini_breadcrumb_dw =
5204 			sibling->emit_fini_breadcrumb_dw;
5205 
5206 		ve->base.flags = sibling->flags;
5207 	}
5208 
5209 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5210 
5211 	return &ve->context;
5212 
5213 err_put:
5214 	intel_context_put(&ve->context);
5215 	return ERR_PTR(err);
5216 }
5217 
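/**
 * intel_execlists_clone_virtual - duplicate an existing virtual engine
 * @src: the virtual engine to clone
 *
 * Create a new context whose virtual engine spans the same siblings as @src,
 * copying across any submit-fence bonds that have been attached.
 *
 * Returns the new context, or an ERR_PTR() on failure.
 */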
5218 struct intel_context *
5219 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5220 {
5221 	struct virtual_engine *se = to_virtual_engine(src);
5222 	struct intel_context *dst;
5223 
5224 	dst = intel_execlists_create_virtual(se->siblings,
5225 					     se->num_siblings);
5226 	if (IS_ERR(dst))
5227 		return dst;
5228 
5229 	if (se->num_bonds) {
5230 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5231 
5232 		de->bonds = kmemdup(se->bonds,
5233 				    sizeof(*se->bonds) * se->num_bonds,
5234 				    GFP_KERNEL);
5235 		if (!de->bonds) {
5236 			intel_context_put(dst);
5237 			return ERR_PTR(-ENOMEM);
5238 		}
5239 
5240 		de->num_bonds = se->num_bonds;
5241 	}
5242 
5243 	return dst;
5244 }
5245 
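/**
 * intel_virtual_engine_attach_bond - record a submit-fence bond
 * @engine: the virtual engine
 * @master: the engine the bonded request's submit fence signals from
 * @sibling: a sibling allowed to execute the bonded request
 *
 * Record that a request on this virtual engine, when bonded via a submit
 * fence to a request executing on @master, may run on @sibling. Calling this
 * again for the same @master accumulates further allowed siblings into the
 * mask. The sibling must already be part of the virtual engine.
 *
 * Returns 0 on success, or a negative error code.
 */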
5246 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5247 				     const struct intel_engine_cs *master,
5248 				     const struct intel_engine_cs *sibling)
5249 {
5250 	struct virtual_engine *ve = to_virtual_engine(engine);
5251 	struct ve_bond *bond;
5252 	int n;
5253 
5254 	/* Sanity check that the sibling is part of the virtual engine */
5255 	for (n = 0; n < ve->num_siblings; n++)
5256 		if (sibling == ve->siblings[n])
5257 			break;
5258 	if (n == ve->num_siblings)
5259 		return -EINVAL;
5260 
5261 	bond = virtual_find_bond(ve, master);
5262 	if (bond) {
5263 		bond->sibling_mask |= sibling->mask;
5264 		return 0;
5265 	}
5266 
5267 	bond = krealloc(ve->bonds,
5268 			sizeof(*bond) * (ve->num_bonds + 1),
5269 			GFP_KERNEL);
5270 	if (!bond)
5271 		return -ENOMEM;
5272 
5273 	bond[ve->num_bonds].master = master;
5274 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5275 
5276 	ve->bonds = bond;
5277 	ve->num_bonds++;
5278 
5279 	return 0;
5280 }
5281 
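/**
 * intel_virtual_engine_get_sibling - look up a physical engine by index
 * @engine: the virtual engine
 * @sibling: index into the virtual engine's sibling array
 *
 * Returns the @sibling'th physical engine backing this virtual engine, or
 * NULL if the index is out of range.
 */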
5282 struct intel_engine_cs *
5283 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5284 				 unsigned int sibling)
5285 {
5286 	struct virtual_engine *ve = to_virtual_engine(engine);
5287 
5288 	if (sibling >= ve->num_siblings)
5289 		return NULL;
5290 
5291 	return ve->siblings[sibling];
5292 }
5293 
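/**
 * intel_execlists_show_requests - dump the requests tracked by an engine
 * @engine: the engine to inspect
 * @m: the drm_printer to emit output to
 * @show_request: callback used to print a single request
 * @max: maximum number of requests to print per category
 *
 * While holding the engine's active lock, print the executing requests, the
 * queued requests (in priority order) and any requests waiting on virtual
 * engines, eliding the middle of each list once @max entries have been shown.
 */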
5294 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5295 				   struct drm_printer *m,
5296 				   void (*show_request)(struct drm_printer *m,
5297 							struct i915_request *rq,
5298 							const char *prefix),
5299 				   unsigned int max)
5300 {
5301 	const struct intel_engine_execlists *execlists = &engine->execlists;
5302 	struct i915_request *rq, *last;
5303 	unsigned long flags;
5304 	unsigned int count;
5305 	struct rb_node *rb;
5306 
5307 	spin_lock_irqsave(&engine->active.lock, flags);
5308 
5309 	last = NULL;
5310 	count = 0;
5311 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5312 		if (count++ < max - 1)
5313 			show_request(m, rq, "\t\tE ");
5314 		else
5315 			last = rq;
5316 	}
5317 	if (last) {
5318 		if (count > max) {
5319 			drm_printf(m,
5320 				   "\t\t...skipping %d executing requests...\n",
5321 				   count - max);
5322 		}
5323 		show_request(m, last, "\t\tE ");
5324 	}
5325 
5326 	if (execlists->switch_priority_hint != INT_MIN)
5327 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5328 			   READ_ONCE(execlists->switch_priority_hint));
5329 	if (execlists->queue_priority_hint != INT_MIN)
5330 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5331 			   READ_ONCE(execlists->queue_priority_hint));
5332 
5333 	last = NULL;
5334 	count = 0;
5335 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5336 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5337 		int i;
5338 
5339 		priolist_for_each_request(rq, p, i) {
5340 			if (count++ < max - 1)
5341 				show_request(m, rq, "\t\tQ ");
5342 			else
5343 				last = rq;
5344 		}
5345 	}
5346 	if (last) {
5347 		if (count > max) {
5348 			drm_printf(m,
5349 				   "\t\t...skipping %d queued requests...\n",
5350 				   count - max);
5351 		}
5352 		show_request(m, last, "\t\tQ ");
5353 	}
5354 
5355 	last = NULL;
5356 	count = 0;
5357 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5358 		struct virtual_engine *ve =
5359 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5360 		struct i915_request *rq = READ_ONCE(ve->request);
5361 
5362 		if (rq) {
5363 			if (count++ < max - 1)
5364 				show_request(m, rq, "\t\tV ");
5365 			else
5366 				last = rq;
5367 		}
5368 	}
5369 	if (last) {
5370 		if (count > max) {
5371 			drm_printf(m,
5372 				   "\t\t...skipping %d virtual requests...\n",
5373 				   count - max);
5374 		}
5375 		show_request(m, last, "\t\tV ");
5376 	}
5377 
5378 	spin_unlock_irqrestore(&engine->active.lock, flags);
5379 }
5380 
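/**
 * intel_lr_context_reset - repair a logical ring context after a GPU hang
 * @engine: the physical engine the context was executing on
 * @ce: the pinned context to repair
 * @head: ring head position to resume from
 * @scrub: if true, overwrite the context image with the default state
 *
 * Rebuild just enough of the context image for the breadcrumb update to be
 * re-executed from @head, optionally scrubbing the register state back to
 * the engine defaults when the image cannot be trusted after the hang.
 */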
5381 void intel_lr_context_reset(struct intel_engine_cs *engine,
5382 			    struct intel_context *ce,
5383 			    u32 head,
5384 			    bool scrub)
5385 {
5386 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5387 
5388 	/*
5389 	 * We want a simple context + ring to execute the breadcrumb update.
5390 	 * We cannot rely on the context being intact across the GPU hang,
5391 	 * so clear it and rebuild just what we need for the breadcrumb.
5392 	 * All pending requests for this context will be zapped, and any
5393 	 * future request will only be submitted after userspace has had the
5394 	 * opportunity to recreate its own state.
5395 	 */
5396 	if (scrub)
5397 		restore_default_state(ce, engine);
5398 
5399 	/* Rerun the request; its payload has been neutered (if guilty). */
5400 	__execlists_update_reg_state(ce, engine, head);
5401 }
5402 
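/**
 * intel_engine_in_execlists_submission_mode - check the submission backend
 * @engine: the engine to query
 *
 * Returns true if the engine is currently using the execlists submission
 * backend (rather than, for example, GuC submission).
 */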
5403 bool
5404 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5405 {
5406 	return engine->set_default_submission ==
5407 	       intel_execlists_set_default_submission;
5408 }
5409 
5410 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5411 #include "selftest_lrc.c"
5412 #endif
5413