xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision dc6a81c3)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a contexts is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bits submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one requests is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 #define WA_TAIL_DWORDS 2
180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon on our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[0];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine);
241 
242 static void mark_eio(struct i915_request *rq)
243 {
244 	if (i915_request_completed(rq))
245 		return;
246 
247 	GEM_BUG_ON(i915_request_signaled(rq));
248 
249 	dma_fence_set_error(&rq->fence, -EIO);
250 	i915_request_mark_complete(rq);
251 }
252 
253 static struct i915_request *
254 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
255 {
256 	struct i915_request *active = rq;
257 
258 	rcu_read_lock();
259 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
260 		if (i915_request_completed(rq))
261 			break;
262 
263 		active = rq;
264 	}
265 	rcu_read_unlock();
266 
267 	return active;
268 }
269 
270 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
271 {
272 	return (i915_ggtt_offset(engine->status_page.vma) +
273 		I915_GEM_HWS_PREEMPT_ADDR);
274 }
275 
276 static inline void
277 ring_set_paused(const struct intel_engine_cs *engine, int state)
278 {
279 	/*
280 	 * We inspect HWS_PREEMPT with a semaphore inside
281 	 * engine->emit_fini_breadcrumb. If the dword is true,
282 	 * the ring is paused as the semaphore will busywait
283 	 * until the dword is false.
284 	 */
285 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
286 	if (state)
287 		wmb();
288 }
289 
290 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
291 {
292 	return rb_entry(rb, struct i915_priolist, node);
293 }
294 
295 static inline int rq_prio(const struct i915_request *rq)
296 {
297 	return rq->sched.attr.priority;
298 }
299 
300 static int effective_prio(const struct i915_request *rq)
301 {
302 	int prio = rq_prio(rq);
303 
304 	/*
305 	 * If this request is special and must not be interrupted at any
306 	 * cost, so be it. Note we are only checking the most recent request
307 	 * in the context and so may be masking an earlier vip request. It
308 	 * is hoped that under the conditions where nopreempt is used, this
309 	 * will not matter (i.e. all requests to that context will be
310 	 * nopreempt for as long as desired).
311 	 */
312 	if (i915_request_has_nopreempt(rq))
313 		prio = I915_PRIORITY_UNPREEMPTABLE;
314 
315 	/*
316 	 * On unwinding the active request, we give it a priority bump
317 	 * if it has completed waiting on any semaphore. If we know that
318 	 * the request has already started, we can prevent an unwanted
319 	 * preempt-to-idle cycle by taking that into account now.
320 	 */
321 	if (__i915_request_has_started(rq))
322 		prio |= I915_PRIORITY_NOSEMAPHORE;
323 
324 	/* Restrict mere WAIT boosts from triggering preemption */
325 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
326 	return prio | __NO_PREEMPTION;
327 }
328 
329 static int queue_prio(const struct intel_engine_execlists *execlists)
330 {
331 	struct i915_priolist *p;
332 	struct rb_node *rb;
333 
334 	rb = rb_first_cached(&execlists->queue);
335 	if (!rb)
336 		return INT_MIN;
337 
338 	/*
339 	 * As the priolist[] are inverted, with the highest priority in [0],
340 	 * we have to flip the index value to become priority.
341 	 */
342 	p = to_priolist(rb);
343 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
344 }
345 
346 static inline bool need_preempt(const struct intel_engine_cs *engine,
347 				const struct i915_request *rq,
348 				struct rb_node *rb)
349 {
350 	int last_prio;
351 
352 	if (!intel_engine_has_semaphores(engine))
353 		return false;
354 
355 	/*
356 	 * Check if the current priority hint merits a preemption attempt.
357 	 *
358 	 * We record the highest value priority we saw during rescheduling
359 	 * prior to this dequeue, therefore we know that if it is strictly
360 	 * less than the current tail of ESLP[0], we do not need to force
361 	 * a preempt-to-idle cycle.
362 	 *
363 	 * However, the priority hint is a mere hint that we may need to
364 	 * preempt. If that hint is stale or we may be trying to preempt
365 	 * ourselves, ignore the request.
366 	 *
367 	 * More naturally we would write
368 	 *      prio >= max(0, last);
369 	 * except that we wish to prevent triggering preemption at the same
370 	 * priority level: the task that is running should remain running
371 	 * to preserve FIFO ordering of dependencies.
372 	 */
373 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
374 	if (engine->execlists.queue_priority_hint <= last_prio)
375 		return false;
376 
377 	/*
378 	 * Check against the first request in ELSP[1], it will, thanks to the
379 	 * power of PI, be the highest priority of that context.
380 	 */
381 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
382 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
383 		return true;
384 
385 	if (rb) {
386 		struct virtual_engine *ve =
387 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
388 		bool preempt = false;
389 
390 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
391 			struct i915_request *next;
392 
393 			rcu_read_lock();
394 			next = READ_ONCE(ve->request);
395 			if (next)
396 				preempt = rq_prio(next) > last_prio;
397 			rcu_read_unlock();
398 		}
399 
400 		if (preempt)
401 			return preempt;
402 	}
403 
404 	/*
405 	 * If the inflight context did not trigger the preemption, then maybe
406 	 * it was the set of queued requests? Pick the highest priority in
407 	 * the queue (the first active priolist) and see if it deserves to be
408 	 * running instead of ELSP[0].
409 	 *
410 	 * The highest priority request in the queue can not be either
411 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
412 	 * context, it's priority would not exceed ELSP[0] aka last_prio.
413 	 */
414 	return queue_prio(&engine->execlists) > last_prio;
415 }
416 
417 __maybe_unused static inline bool
418 assert_priority_queue(const struct i915_request *prev,
419 		      const struct i915_request *next)
420 {
421 	/*
422 	 * Without preemption, the prev may refer to the still active element
423 	 * which we refuse to let go.
424 	 *
425 	 * Even with preemption, there are times when we think it is better not
426 	 * to preempt and leave an ostensibly lower priority request in flight.
427 	 */
428 	if (i915_request_is_active(prev))
429 		return true;
430 
431 	return rq_prio(prev) >= rq_prio(next);
432 }
433 
434 /*
435  * The context descriptor encodes various attributes of a context,
436  * including its GTT address and some flags. Because it's fairly
437  * expensive to calculate, we'll just do it once and cache the result,
438  * which remains valid until the context is unpinned.
439  *
440  * This is what a descriptor looks like, from LSB to MSB::
441  *
442  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
443  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
444  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
445  *      bits 53-54:    mbz, reserved for use by hardware
446  *      bits 55-63:    group ID, currently unused and set to 0
447  *
448  * Starting from Gen11, the upper dword of the descriptor has a new format:
449  *
450  *      bits 32-36:    reserved
451  *      bits 37-47:    SW context ID
452  *      bits 48:53:    engine instance
453  *      bit 54:        mbz, reserved for use by hardware
454  *      bits 55-60:    SW counter
455  *      bits 61-63:    engine class
456  *
457  * engine info, SW context ID and SW counter need to form a unique number
458  * (Context ID) per lrc.
459  */
460 static u64
461 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
462 {
463 	u64 desc;
464 
465 	desc = INTEL_LEGACY_32B_CONTEXT;
466 	if (i915_vm_is_4lvl(ce->vm))
467 		desc = INTEL_LEGACY_64B_CONTEXT;
468 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
469 
470 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
471 	if (IS_GEN(engine->i915, 8))
472 		desc |= GEN8_CTX_L3LLC_COHERENT;
473 
474 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
475 	/*
476 	 * The following 32bits are copied into the OA reports (dword 2).
477 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
478 	 * anything below.
479 	 */
480 	if (INTEL_GEN(engine->i915) >= 11) {
481 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
482 								/* bits 48-53 */
483 
484 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
485 								/* bits 61-63 */
486 	}
487 
488 	return desc;
489 }
490 
491 static inline unsigned int dword_in_page(void *addr)
492 {
493 	return offset_in_page(addr) / sizeof(u32);
494 }
495 
496 static void set_offsets(u32 *regs,
497 			const u8 *data,
498 			const struct intel_engine_cs *engine,
499 			bool clear)
500 #define NOP(x) (BIT(7) | (x))
501 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
502 #define POSTED BIT(0)
503 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
504 #define REG16(x) \
505 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
506 	(((x) >> 2) & 0x7f)
507 #define END(x) 0, (x)
508 {
509 	const u32 base = engine->mmio_base;
510 
511 	while (*data) {
512 		u8 count, flags;
513 
514 		if (*data & BIT(7)) { /* skip */
515 			count = *data++ & ~BIT(7);
516 			if (clear)
517 				memset32(regs, MI_NOOP, count);
518 			regs += count;
519 			continue;
520 		}
521 
522 		count = *data & 0x3f;
523 		flags = *data >> 6;
524 		data++;
525 
526 		*regs = MI_LOAD_REGISTER_IMM(count);
527 		if (flags & POSTED)
528 			*regs |= MI_LRI_FORCE_POSTED;
529 		if (INTEL_GEN(engine->i915) >= 11)
530 			*regs |= MI_LRI_CS_MMIO;
531 		regs++;
532 
533 		GEM_BUG_ON(!count);
534 		do {
535 			u32 offset = 0;
536 			u8 v;
537 
538 			do {
539 				v = *data++;
540 				offset <<= 7;
541 				offset |= v & ~BIT(7);
542 			} while (v & BIT(7));
543 
544 			regs[0] = base + (offset << 2);
545 			if (clear)
546 				regs[1] = 0;
547 			regs += 2;
548 		} while (--count);
549 	}
550 
551 	if (clear) {
552 		u8 count = *++data;
553 
554 		/* Clear past the tail for HW access */
555 		GEM_BUG_ON(dword_in_page(regs) > count);
556 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
557 
558 		/* Close the batch; used mainly by live_lrc_layout() */
559 		*regs = MI_BATCH_BUFFER_END;
560 		if (INTEL_GEN(engine->i915) >= 10)
561 			*regs |= BIT(0);
562 	}
563 }
564 
565 static const u8 gen8_xcs_offsets[] = {
566 	NOP(1),
567 	LRI(11, 0),
568 	REG16(0x244),
569 	REG(0x034),
570 	REG(0x030),
571 	REG(0x038),
572 	REG(0x03c),
573 	REG(0x168),
574 	REG(0x140),
575 	REG(0x110),
576 	REG(0x11c),
577 	REG(0x114),
578 	REG(0x118),
579 
580 	NOP(9),
581 	LRI(9, 0),
582 	REG16(0x3a8),
583 	REG16(0x28c),
584 	REG16(0x288),
585 	REG16(0x284),
586 	REG16(0x280),
587 	REG16(0x27c),
588 	REG16(0x278),
589 	REG16(0x274),
590 	REG16(0x270),
591 
592 	NOP(13),
593 	LRI(2, 0),
594 	REG16(0x200),
595 	REG(0x028),
596 
597 	END(80)
598 };
599 
600 static const u8 gen9_xcs_offsets[] = {
601 	NOP(1),
602 	LRI(14, POSTED),
603 	REG16(0x244),
604 	REG(0x034),
605 	REG(0x030),
606 	REG(0x038),
607 	REG(0x03c),
608 	REG(0x168),
609 	REG(0x140),
610 	REG(0x110),
611 	REG(0x11c),
612 	REG(0x114),
613 	REG(0x118),
614 	REG(0x1c0),
615 	REG(0x1c4),
616 	REG(0x1c8),
617 
618 	NOP(3),
619 	LRI(9, POSTED),
620 	REG16(0x3a8),
621 	REG16(0x28c),
622 	REG16(0x288),
623 	REG16(0x284),
624 	REG16(0x280),
625 	REG16(0x27c),
626 	REG16(0x278),
627 	REG16(0x274),
628 	REG16(0x270),
629 
630 	NOP(13),
631 	LRI(1, POSTED),
632 	REG16(0x200),
633 
634 	NOP(13),
635 	LRI(44, POSTED),
636 	REG(0x028),
637 	REG(0x09c),
638 	REG(0x0c0),
639 	REG(0x178),
640 	REG(0x17c),
641 	REG16(0x358),
642 	REG(0x170),
643 	REG(0x150),
644 	REG(0x154),
645 	REG(0x158),
646 	REG16(0x41c),
647 	REG16(0x600),
648 	REG16(0x604),
649 	REG16(0x608),
650 	REG16(0x60c),
651 	REG16(0x610),
652 	REG16(0x614),
653 	REG16(0x618),
654 	REG16(0x61c),
655 	REG16(0x620),
656 	REG16(0x624),
657 	REG16(0x628),
658 	REG16(0x62c),
659 	REG16(0x630),
660 	REG16(0x634),
661 	REG16(0x638),
662 	REG16(0x63c),
663 	REG16(0x640),
664 	REG16(0x644),
665 	REG16(0x648),
666 	REG16(0x64c),
667 	REG16(0x650),
668 	REG16(0x654),
669 	REG16(0x658),
670 	REG16(0x65c),
671 	REG16(0x660),
672 	REG16(0x664),
673 	REG16(0x668),
674 	REG16(0x66c),
675 	REG16(0x670),
676 	REG16(0x674),
677 	REG16(0x678),
678 	REG16(0x67c),
679 	REG(0x068),
680 
681 	END(176)
682 };
683 
684 static const u8 gen12_xcs_offsets[] = {
685 	NOP(1),
686 	LRI(13, POSTED),
687 	REG16(0x244),
688 	REG(0x034),
689 	REG(0x030),
690 	REG(0x038),
691 	REG(0x03c),
692 	REG(0x168),
693 	REG(0x140),
694 	REG(0x110),
695 	REG(0x1c0),
696 	REG(0x1c4),
697 	REG(0x1c8),
698 	REG(0x180),
699 	REG16(0x2b4),
700 
701 	NOP(5),
702 	LRI(9, POSTED),
703 	REG16(0x3a8),
704 	REG16(0x28c),
705 	REG16(0x288),
706 	REG16(0x284),
707 	REG16(0x280),
708 	REG16(0x27c),
709 	REG16(0x278),
710 	REG16(0x274),
711 	REG16(0x270),
712 
713 	END(80)
714 };
715 
716 static const u8 gen8_rcs_offsets[] = {
717 	NOP(1),
718 	LRI(14, POSTED),
719 	REG16(0x244),
720 	REG(0x034),
721 	REG(0x030),
722 	REG(0x038),
723 	REG(0x03c),
724 	REG(0x168),
725 	REG(0x140),
726 	REG(0x110),
727 	REG(0x11c),
728 	REG(0x114),
729 	REG(0x118),
730 	REG(0x1c0),
731 	REG(0x1c4),
732 	REG(0x1c8),
733 
734 	NOP(3),
735 	LRI(9, POSTED),
736 	REG16(0x3a8),
737 	REG16(0x28c),
738 	REG16(0x288),
739 	REG16(0x284),
740 	REG16(0x280),
741 	REG16(0x27c),
742 	REG16(0x278),
743 	REG16(0x274),
744 	REG16(0x270),
745 
746 	NOP(13),
747 	LRI(1, 0),
748 	REG(0x0c8),
749 
750 	END(80)
751 };
752 
753 static const u8 gen9_rcs_offsets[] = {
754 	NOP(1),
755 	LRI(14, POSTED),
756 	REG16(0x244),
757 	REG(0x34),
758 	REG(0x30),
759 	REG(0x38),
760 	REG(0x3c),
761 	REG(0x168),
762 	REG(0x140),
763 	REG(0x110),
764 	REG(0x11c),
765 	REG(0x114),
766 	REG(0x118),
767 	REG(0x1c0),
768 	REG(0x1c4),
769 	REG(0x1c8),
770 
771 	NOP(3),
772 	LRI(9, POSTED),
773 	REG16(0x3a8),
774 	REG16(0x28c),
775 	REG16(0x288),
776 	REG16(0x284),
777 	REG16(0x280),
778 	REG16(0x27c),
779 	REG16(0x278),
780 	REG16(0x274),
781 	REG16(0x270),
782 
783 	NOP(13),
784 	LRI(1, 0),
785 	REG(0xc8),
786 
787 	NOP(13),
788 	LRI(44, POSTED),
789 	REG(0x28),
790 	REG(0x9c),
791 	REG(0xc0),
792 	REG(0x178),
793 	REG(0x17c),
794 	REG16(0x358),
795 	REG(0x170),
796 	REG(0x150),
797 	REG(0x154),
798 	REG(0x158),
799 	REG16(0x41c),
800 	REG16(0x600),
801 	REG16(0x604),
802 	REG16(0x608),
803 	REG16(0x60c),
804 	REG16(0x610),
805 	REG16(0x614),
806 	REG16(0x618),
807 	REG16(0x61c),
808 	REG16(0x620),
809 	REG16(0x624),
810 	REG16(0x628),
811 	REG16(0x62c),
812 	REG16(0x630),
813 	REG16(0x634),
814 	REG16(0x638),
815 	REG16(0x63c),
816 	REG16(0x640),
817 	REG16(0x644),
818 	REG16(0x648),
819 	REG16(0x64c),
820 	REG16(0x650),
821 	REG16(0x654),
822 	REG16(0x658),
823 	REG16(0x65c),
824 	REG16(0x660),
825 	REG16(0x664),
826 	REG16(0x668),
827 	REG16(0x66c),
828 	REG16(0x670),
829 	REG16(0x674),
830 	REG16(0x678),
831 	REG16(0x67c),
832 	REG(0x68),
833 
834 	END(176)
835 };
836 
837 static const u8 gen11_rcs_offsets[] = {
838 	NOP(1),
839 	LRI(15, POSTED),
840 	REG16(0x244),
841 	REG(0x034),
842 	REG(0x030),
843 	REG(0x038),
844 	REG(0x03c),
845 	REG(0x168),
846 	REG(0x140),
847 	REG(0x110),
848 	REG(0x11c),
849 	REG(0x114),
850 	REG(0x118),
851 	REG(0x1c0),
852 	REG(0x1c4),
853 	REG(0x1c8),
854 	REG(0x180),
855 
856 	NOP(1),
857 	LRI(9, POSTED),
858 	REG16(0x3a8),
859 	REG16(0x28c),
860 	REG16(0x288),
861 	REG16(0x284),
862 	REG16(0x280),
863 	REG16(0x27c),
864 	REG16(0x278),
865 	REG16(0x274),
866 	REG16(0x270),
867 
868 	LRI(1, POSTED),
869 	REG(0x1b0),
870 
871 	NOP(10),
872 	LRI(1, 0),
873 	REG(0x0c8),
874 
875 	END(80)
876 };
877 
878 static const u8 gen12_rcs_offsets[] = {
879 	NOP(1),
880 	LRI(13, POSTED),
881 	REG16(0x244),
882 	REG(0x034),
883 	REG(0x030),
884 	REG(0x038),
885 	REG(0x03c),
886 	REG(0x168),
887 	REG(0x140),
888 	REG(0x110),
889 	REG(0x1c0),
890 	REG(0x1c4),
891 	REG(0x1c8),
892 	REG(0x180),
893 	REG16(0x2b4),
894 
895 	NOP(5),
896 	LRI(9, POSTED),
897 	REG16(0x3a8),
898 	REG16(0x28c),
899 	REG16(0x288),
900 	REG16(0x284),
901 	REG16(0x280),
902 	REG16(0x27c),
903 	REG16(0x278),
904 	REG16(0x274),
905 	REG16(0x270),
906 
907 	LRI(3, POSTED),
908 	REG(0x1b0),
909 	REG16(0x5a8),
910 	REG16(0x5ac),
911 
912 	NOP(6),
913 	LRI(1, 0),
914 	REG(0x0c8),
915 
916 	END(80)
917 };
918 
919 #undef END
920 #undef REG16
921 #undef REG
922 #undef LRI
923 #undef NOP
924 
925 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
926 {
927 	/*
928 	 * The gen12+ lists only have the registers we program in the basic
929 	 * default state. We rely on the context image using relative
930 	 * addressing to automatic fixup the register state between the
931 	 * physical engines for virtual engine.
932 	 */
933 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
934 		   !intel_engine_has_relative_mmio(engine));
935 
936 	if (engine->class == RENDER_CLASS) {
937 		if (INTEL_GEN(engine->i915) >= 12)
938 			return gen12_rcs_offsets;
939 		else if (INTEL_GEN(engine->i915) >= 11)
940 			return gen11_rcs_offsets;
941 		else if (INTEL_GEN(engine->i915) >= 9)
942 			return gen9_rcs_offsets;
943 		else
944 			return gen8_rcs_offsets;
945 	} else {
946 		if (INTEL_GEN(engine->i915) >= 12)
947 			return gen12_xcs_offsets;
948 		else if (INTEL_GEN(engine->i915) >= 9)
949 			return gen9_xcs_offsets;
950 		else
951 			return gen8_xcs_offsets;
952 	}
953 }
954 
955 static struct i915_request *
956 __unwind_incomplete_requests(struct intel_engine_cs *engine)
957 {
958 	struct i915_request *rq, *rn, *active = NULL;
959 	struct list_head *uninitialized_var(pl);
960 	int prio = I915_PRIORITY_INVALID;
961 
962 	lockdep_assert_held(&engine->active.lock);
963 
964 	list_for_each_entry_safe_reverse(rq, rn,
965 					 &engine->active.requests,
966 					 sched.link) {
967 		if (i915_request_completed(rq))
968 			continue; /* XXX */
969 
970 		__i915_request_unsubmit(rq);
971 
972 		/*
973 		 * Push the request back into the queue for later resubmission.
974 		 * If this request is not native to this physical engine (i.e.
975 		 * it came from a virtual source), push it back onto the virtual
976 		 * engine so that it can be moved across onto another physical
977 		 * engine as load dictates.
978 		 */
979 		if (likely(rq->execution_mask == engine->mask)) {
980 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
981 			if (rq_prio(rq) != prio) {
982 				prio = rq_prio(rq);
983 				pl = i915_sched_lookup_priolist(engine, prio);
984 			}
985 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
986 
987 			list_move(&rq->sched.link, pl);
988 			active = rq;
989 		} else {
990 			struct intel_engine_cs *owner = rq->context->engine;
991 
992 			/*
993 			 * Decouple the virtual breadcrumb before moving it
994 			 * back to the virtual engine -- we don't want the
995 			 * request to complete in the background and try
996 			 * and cancel the breadcrumb on the virtual engine
997 			 * (instead of the old engine where it is linked)!
998 			 */
999 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1000 				     &rq->fence.flags)) {
1001 				spin_lock_nested(&rq->lock,
1002 						 SINGLE_DEPTH_NESTING);
1003 				i915_request_cancel_breadcrumb(rq);
1004 				spin_unlock(&rq->lock);
1005 			}
1006 			rq->engine = owner;
1007 			owner->submit_request(rq);
1008 			active = NULL;
1009 		}
1010 	}
1011 
1012 	return active;
1013 }
1014 
1015 struct i915_request *
1016 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1017 {
1018 	struct intel_engine_cs *engine =
1019 		container_of(execlists, typeof(*engine), execlists);
1020 
1021 	return __unwind_incomplete_requests(engine);
1022 }
1023 
1024 static inline void
1025 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1026 {
1027 	/*
1028 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1029 	 * The compiler should eliminate this function as dead-code.
1030 	 */
1031 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1032 		return;
1033 
1034 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1035 				   status, rq);
1036 }
1037 
1038 static void intel_engine_context_in(struct intel_engine_cs *engine)
1039 {
1040 	unsigned long flags;
1041 
1042 	if (READ_ONCE(engine->stats.enabled) == 0)
1043 		return;
1044 
1045 	write_seqlock_irqsave(&engine->stats.lock, flags);
1046 
1047 	if (engine->stats.enabled > 0) {
1048 		if (engine->stats.active++ == 0)
1049 			engine->stats.start = ktime_get();
1050 		GEM_BUG_ON(engine->stats.active == 0);
1051 	}
1052 
1053 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1054 }
1055 
1056 static void intel_engine_context_out(struct intel_engine_cs *engine)
1057 {
1058 	unsigned long flags;
1059 
1060 	if (READ_ONCE(engine->stats.enabled) == 0)
1061 		return;
1062 
1063 	write_seqlock_irqsave(&engine->stats.lock, flags);
1064 
1065 	if (engine->stats.enabled > 0) {
1066 		ktime_t last;
1067 
1068 		if (engine->stats.active && --engine->stats.active == 0) {
1069 			/*
1070 			 * Decrement the active context count and in case GPU
1071 			 * is now idle add up to the running total.
1072 			 */
1073 			last = ktime_sub(ktime_get(), engine->stats.start);
1074 
1075 			engine->stats.total = ktime_add(engine->stats.total,
1076 							last);
1077 		} else if (engine->stats.active == 0) {
1078 			/*
1079 			 * After turning on engine stats, context out might be
1080 			 * the first event in which case we account from the
1081 			 * time stats gathering was turned on.
1082 			 */
1083 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1084 
1085 			engine->stats.total = ktime_add(engine->stats.total,
1086 							last);
1087 		}
1088 	}
1089 
1090 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1091 }
1092 
1093 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1094 {
1095 	if (INTEL_GEN(engine->i915) >= 12)
1096 		return 0x60;
1097 	else if (INTEL_GEN(engine->i915) >= 9)
1098 		return 0x54;
1099 	else if (engine->class == RENDER_CLASS)
1100 		return 0x58;
1101 	else
1102 		return -1;
1103 }
1104 
1105 static void
1106 execlists_check_context(const struct intel_context *ce,
1107 			const struct intel_engine_cs *engine)
1108 {
1109 	const struct intel_ring *ring = ce->ring;
1110 	u32 *regs = ce->lrc_reg_state;
1111 	bool valid = true;
1112 	int x;
1113 
1114 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1115 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1116 		       engine->name,
1117 		       regs[CTX_RING_START],
1118 		       i915_ggtt_offset(ring->vma));
1119 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1120 		valid = false;
1121 	}
1122 
1123 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1124 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1125 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1126 		       engine->name,
1127 		       regs[CTX_RING_CTL],
1128 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1129 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1130 		valid = false;
1131 	}
1132 
1133 	x = lrc_ring_mi_mode(engine);
1134 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1135 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1136 		       engine->name, regs[x + 1]);
1137 		regs[x + 1] &= ~STOP_RING;
1138 		regs[x + 1] |= STOP_RING << 16;
1139 		valid = false;
1140 	}
1141 
1142 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1143 }
1144 
1145 static void restore_default_state(struct intel_context *ce,
1146 				  struct intel_engine_cs *engine)
1147 {
1148 	u32 *regs = ce->lrc_reg_state;
1149 
1150 	if (engine->pinned_default_state)
1151 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1152 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1153 		       engine->context_size - PAGE_SIZE);
1154 
1155 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1156 }
1157 
1158 static void reset_active(struct i915_request *rq,
1159 			 struct intel_engine_cs *engine)
1160 {
1161 	struct intel_context * const ce = rq->context;
1162 	u32 head;
1163 
1164 	/*
1165 	 * The executing context has been cancelled. We want to prevent
1166 	 * further execution along this context and propagate the error on
1167 	 * to anything depending on its results.
1168 	 *
1169 	 * In __i915_request_submit(), we apply the -EIO and remove the
1170 	 * requests' payloads for any banned requests. But first, we must
1171 	 * rewind the context back to the start of the incomplete request so
1172 	 * that we do not jump back into the middle of the batch.
1173 	 *
1174 	 * We preserve the breadcrumbs and semaphores of the incomplete
1175 	 * requests so that inter-timeline dependencies (i.e other timelines)
1176 	 * remain correctly ordered. And we defer to __i915_request_submit()
1177 	 * so that all asynchronous waits are correctly handled.
1178 	 */
1179 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1180 		     rq->fence.context, rq->fence.seqno);
1181 
1182 	/* On resubmission of the active request, payload will be scrubbed */
1183 	if (i915_request_completed(rq))
1184 		head = rq->tail;
1185 	else
1186 		head = active_request(ce->timeline, rq)->head;
1187 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1188 	intel_ring_update_space(ce->ring);
1189 
1190 	/* Scrub the context image to prevent replaying the previous batch */
1191 	restore_default_state(ce, engine);
1192 	__execlists_update_reg_state(ce, engine);
1193 
1194 	/* We've switched away, so this should be a no-op, but intent matters */
1195 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1196 }
1197 
1198 static inline struct intel_engine_cs *
1199 __execlists_schedule_in(struct i915_request *rq)
1200 {
1201 	struct intel_engine_cs * const engine = rq->engine;
1202 	struct intel_context * const ce = rq->context;
1203 
1204 	intel_context_get(ce);
1205 
1206 	if (unlikely(intel_context_is_banned(ce)))
1207 		reset_active(rq, engine);
1208 
1209 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1210 		execlists_check_context(ce, engine);
1211 
1212 	if (ce->tag) {
1213 		/* Use a fixed tag for OA and friends */
1214 		ce->lrc_desc |= (u64)ce->tag << 32;
1215 	} else {
1216 		/* We don't need a strict matching tag, just different values */
1217 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1218 		ce->lrc_desc |=
1219 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1220 			GEN11_SW_CTX_ID_SHIFT;
1221 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1222 	}
1223 
1224 	__intel_gt_pm_get(engine->gt);
1225 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1226 	intel_engine_context_in(engine);
1227 
1228 	return engine;
1229 }
1230 
1231 static inline struct i915_request *
1232 execlists_schedule_in(struct i915_request *rq, int idx)
1233 {
1234 	struct intel_context * const ce = rq->context;
1235 	struct intel_engine_cs *old;
1236 
1237 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1238 	trace_i915_request_in(rq, idx);
1239 
1240 	old = READ_ONCE(ce->inflight);
1241 	do {
1242 		if (!old) {
1243 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1244 			break;
1245 		}
1246 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1247 
1248 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1249 	return i915_request_get(rq);
1250 }
1251 
1252 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1253 {
1254 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1255 	struct i915_request *next = READ_ONCE(ve->request);
1256 
1257 	if (next && next->execution_mask & ~rq->execution_mask)
1258 		tasklet_schedule(&ve->base.execlists.tasklet);
1259 }
1260 
1261 static inline void
1262 __execlists_schedule_out(struct i915_request *rq,
1263 			 struct intel_engine_cs * const engine)
1264 {
1265 	struct intel_context * const ce = rq->context;
1266 
1267 	/*
1268 	 * NB process_csb() is not under the engine->active.lock and hence
1269 	 * schedule_out can race with schedule_in meaning that we should
1270 	 * refrain from doing non-trivial work here.
1271 	 */
1272 
1273 	/*
1274 	 * If we have just completed this context, the engine may now be
1275 	 * idle and we want to re-enter powersaving.
1276 	 */
1277 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1278 	    i915_request_completed(rq))
1279 		intel_engine_add_retire(engine, ce->timeline);
1280 
1281 	intel_engine_context_out(engine);
1282 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1283 	intel_gt_pm_put_async(engine->gt);
1284 
1285 	/*
1286 	 * If this is part of a virtual engine, its next request may
1287 	 * have been blocked waiting for access to the active context.
1288 	 * We have to kick all the siblings again in case we need to
1289 	 * switch (e.g. the next request is not runnable on this
1290 	 * engine). Hopefully, we will already have submitted the next
1291 	 * request before the tasklet runs and do not need to rebuild
1292 	 * each virtual tree and kick everyone again.
1293 	 */
1294 	if (ce->engine != engine)
1295 		kick_siblings(rq, ce);
1296 
1297 	intel_context_put(ce);
1298 }
1299 
1300 static inline void
1301 execlists_schedule_out(struct i915_request *rq)
1302 {
1303 	struct intel_context * const ce = rq->context;
1304 	struct intel_engine_cs *cur, *old;
1305 
1306 	trace_i915_request_out(rq);
1307 
1308 	old = READ_ONCE(ce->inflight);
1309 	do
1310 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1311 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1312 	if (!cur)
1313 		__execlists_schedule_out(rq, old);
1314 
1315 	i915_request_put(rq);
1316 }
1317 
1318 static u64 execlists_update_context(struct i915_request *rq)
1319 {
1320 	struct intel_context *ce = rq->context;
1321 	u64 desc = ce->lrc_desc;
1322 	u32 tail;
1323 
1324 	/*
1325 	 * WaIdleLiteRestore:bdw,skl
1326 	 *
1327 	 * We should never submit the context with the same RING_TAIL twice
1328 	 * just in case we submit an empty ring, which confuses the HW.
1329 	 *
1330 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1331 	 * the normal request to be able to always advance the RING_TAIL on
1332 	 * subsequent resubmissions (for lite restore). Should that fail us,
1333 	 * and we try and submit the same tail again, force the context
1334 	 * reload.
1335 	 */
1336 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1337 	if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
1338 		desc |= CTX_DESC_FORCE_RESTORE;
1339 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1340 	rq->tail = rq->wa_tail;
1341 
1342 	/*
1343 	 * Make sure the context image is complete before we submit it to HW.
1344 	 *
1345 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1346 	 * an uncached write such as our mmio register access, the empirical
1347 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1348 	 * may not be visible to the HW prior to the completion of the UC
1349 	 * register write and that we may begin execution from the context
1350 	 * before its image is complete leading to invalid PD chasing.
1351 	 */
1352 	wmb();
1353 
1354 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1355 	return desc;
1356 }
1357 
1358 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1359 {
1360 	if (execlists->ctrl_reg) {
1361 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1362 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1363 	} else {
1364 		writel(upper_32_bits(desc), execlists->submit_reg);
1365 		writel(lower_32_bits(desc), execlists->submit_reg);
1366 	}
1367 }
1368 
1369 static __maybe_unused void
1370 trace_ports(const struct intel_engine_execlists *execlists,
1371 	    const char *msg,
1372 	    struct i915_request * const *ports)
1373 {
1374 	const struct intel_engine_cs *engine =
1375 		container_of(execlists, typeof(*engine), execlists);
1376 
1377 	if (!ports[0])
1378 		return;
1379 
1380 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1381 		     ports[0]->fence.context,
1382 		     ports[0]->fence.seqno,
1383 		     i915_request_completed(ports[0]) ? "!" :
1384 		     i915_request_started(ports[0]) ? "*" :
1385 		     "",
1386 		     ports[1] ? ports[1]->fence.context : 0,
1387 		     ports[1] ? ports[1]->fence.seqno : 0);
1388 }
1389 
1390 static __maybe_unused bool
1391 assert_pending_valid(const struct intel_engine_execlists *execlists,
1392 		     const char *msg)
1393 {
1394 	struct i915_request * const *port, *rq;
1395 	struct intel_context *ce = NULL;
1396 
1397 	trace_ports(execlists, msg, execlists->pending);
1398 
1399 	if (!execlists->pending[0]) {
1400 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1401 		return false;
1402 	}
1403 
1404 	if (execlists->pending[execlists_num_ports(execlists)]) {
1405 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1406 			      execlists_num_ports(execlists));
1407 		return false;
1408 	}
1409 
1410 	for (port = execlists->pending; (rq = *port); port++) {
1411 		unsigned long flags;
1412 		bool ok = true;
1413 
1414 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1415 		GEM_BUG_ON(!i915_request_is_active(rq));
1416 
1417 		if (ce == rq->context) {
1418 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1419 				      ce->timeline->fence_context,
1420 				      port - execlists->pending);
1421 			return false;
1422 		}
1423 		ce = rq->context;
1424 
1425 		/* Hold tightly onto the lock to prevent concurrent retires! */
1426 		if (!spin_trylock_irqsave(&rq->lock, flags))
1427 			continue;
1428 
1429 		if (i915_request_completed(rq))
1430 			goto unlock;
1431 
1432 		if (i915_active_is_idle(&ce->active) &&
1433 		    !intel_context_is_barrier(ce)) {
1434 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1435 				      ce->timeline->fence_context,
1436 				      port - execlists->pending);
1437 			ok = false;
1438 			goto unlock;
1439 		}
1440 
1441 		if (!i915_vma_is_pinned(ce->state)) {
1442 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1443 				      ce->timeline->fence_context,
1444 				      port - execlists->pending);
1445 			ok = false;
1446 			goto unlock;
1447 		}
1448 
1449 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1450 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1451 				      ce->timeline->fence_context,
1452 				      port - execlists->pending);
1453 			ok = false;
1454 			goto unlock;
1455 		}
1456 
1457 unlock:
1458 		spin_unlock_irqrestore(&rq->lock, flags);
1459 		if (!ok)
1460 			return false;
1461 	}
1462 
1463 	return ce;
1464 }
1465 
1466 static void execlists_submit_ports(struct intel_engine_cs *engine)
1467 {
1468 	struct intel_engine_execlists *execlists = &engine->execlists;
1469 	unsigned int n;
1470 
1471 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1472 
1473 	/*
1474 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1475 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1476 	 * not be relinquished until the device is idle (see
1477 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1478 	 * that all ELSP are drained i.e. we have processed the CSB,
1479 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1480 	 */
1481 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1482 
1483 	/*
1484 	 * ELSQ note: the submit queue is not cleared after being submitted
1485 	 * to the HW so we need to make sure we always clean it up. This is
1486 	 * currently ensured by the fact that we always write the same number
1487 	 * of elsq entries, keep this in mind before changing the loop below.
1488 	 */
1489 	for (n = execlists_num_ports(execlists); n--; ) {
1490 		struct i915_request *rq = execlists->pending[n];
1491 
1492 		write_desc(execlists,
1493 			   rq ? execlists_update_context(rq) : 0,
1494 			   n);
1495 	}
1496 
1497 	/* we need to manually load the submit queue */
1498 	if (execlists->ctrl_reg)
1499 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1500 }
1501 
1502 static bool ctx_single_port_submission(const struct intel_context *ce)
1503 {
1504 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1505 		intel_context_force_single_submission(ce));
1506 }
1507 
1508 static bool can_merge_ctx(const struct intel_context *prev,
1509 			  const struct intel_context *next)
1510 {
1511 	if (prev != next)
1512 		return false;
1513 
1514 	if (ctx_single_port_submission(prev))
1515 		return false;
1516 
1517 	return true;
1518 }
1519 
1520 static bool can_merge_rq(const struct i915_request *prev,
1521 			 const struct i915_request *next)
1522 {
1523 	GEM_BUG_ON(prev == next);
1524 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1525 
1526 	/*
1527 	 * We do not submit known completed requests. Therefore if the next
1528 	 * request is already completed, we can pretend to merge it in
1529 	 * with the previous context (and we will skip updating the ELSP
1530 	 * and tracking). Thus hopefully keeping the ELSP full with active
1531 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1532 	 * us.
1533 	 */
1534 	if (i915_request_completed(next))
1535 		return true;
1536 
1537 	if (unlikely((prev->fence.flags ^ next->fence.flags) &
1538 		     (I915_FENCE_FLAG_NOPREEMPT | I915_FENCE_FLAG_SENTINEL)))
1539 		return false;
1540 
1541 	if (!can_merge_ctx(prev->context, next->context))
1542 		return false;
1543 
1544 	return true;
1545 }
1546 
1547 static void virtual_update_register_offsets(u32 *regs,
1548 					    struct intel_engine_cs *engine)
1549 {
1550 	set_offsets(regs, reg_offsets(engine), engine, false);
1551 }
1552 
1553 static bool virtual_matches(const struct virtual_engine *ve,
1554 			    const struct i915_request *rq,
1555 			    const struct intel_engine_cs *engine)
1556 {
1557 	const struct intel_engine_cs *inflight;
1558 
1559 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1560 		return false;
1561 
1562 	/*
1563 	 * We track when the HW has completed saving the context image
1564 	 * (i.e. when we have seen the final CS event switching out of
1565 	 * the context) and must not overwrite the context image before
1566 	 * then. This restricts us to only using the active engine
1567 	 * while the previous virtualized request is inflight (so
1568 	 * we reuse the register offsets). This is a very small
1569 	 * hystersis on the greedy seelction algorithm.
1570 	 */
1571 	inflight = intel_context_inflight(&ve->context);
1572 	if (inflight && inflight != engine)
1573 		return false;
1574 
1575 	return true;
1576 }
1577 
1578 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1579 				     struct intel_engine_cs *engine)
1580 {
1581 	struct intel_engine_cs *old = ve->siblings[0];
1582 
1583 	/* All unattached (rq->engine == old) must already be completed */
1584 
1585 	spin_lock(&old->breadcrumbs.irq_lock);
1586 	if (!list_empty(&ve->context.signal_link)) {
1587 		list_move_tail(&ve->context.signal_link,
1588 			       &engine->breadcrumbs.signalers);
1589 		intel_engine_signal_breadcrumbs(engine);
1590 	}
1591 	spin_unlock(&old->breadcrumbs.irq_lock);
1592 }
1593 
1594 static struct i915_request *
1595 last_active(const struct intel_engine_execlists *execlists)
1596 {
1597 	struct i915_request * const *last = READ_ONCE(execlists->active);
1598 
1599 	while (*last && i915_request_completed(*last))
1600 		last++;
1601 
1602 	return *last;
1603 }
1604 
1605 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1606 {
1607 	LIST_HEAD(list);
1608 
1609 	/*
1610 	 * We want to move the interrupted request to the back of
1611 	 * the round-robin list (i.e. its priority level), but
1612 	 * in doing so, we must then move all requests that were in
1613 	 * flight and were waiting for the interrupted request to
1614 	 * be run after it again.
1615 	 */
1616 	do {
1617 		struct i915_dependency *p;
1618 
1619 		GEM_BUG_ON(i915_request_is_active(rq));
1620 		list_move_tail(&rq->sched.link, pl);
1621 
1622 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1623 			struct i915_request *w =
1624 				container_of(p->waiter, typeof(*w), sched);
1625 
1626 			/* Leave semaphores spinning on the other engines */
1627 			if (w->engine != rq->engine)
1628 				continue;
1629 
1630 			/* No waiter should start before its signaler */
1631 			GEM_BUG_ON(i915_request_started(w) &&
1632 				   !i915_request_completed(rq));
1633 
1634 			GEM_BUG_ON(i915_request_is_active(w));
1635 			if (list_empty(&w->sched.link))
1636 				continue; /* Not yet submitted; unready */
1637 
1638 			if (rq_prio(w) < rq_prio(rq))
1639 				continue;
1640 
1641 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1642 			list_move_tail(&w->sched.link, &list);
1643 		}
1644 
1645 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1646 	} while (rq);
1647 }
1648 
1649 static void defer_active(struct intel_engine_cs *engine)
1650 {
1651 	struct i915_request *rq;
1652 
1653 	rq = __unwind_incomplete_requests(engine);
1654 	if (!rq)
1655 		return;
1656 
1657 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1658 }
1659 
1660 static bool
1661 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1662 {
1663 	int hint;
1664 
1665 	if (!intel_engine_has_timeslices(engine))
1666 		return false;
1667 
1668 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1669 		return false;
1670 
1671 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1672 		   engine->execlists.queue_priority_hint);
1673 
1674 	return hint >= effective_prio(rq);
1675 }
1676 
1677 static int
1678 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1679 {
1680 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1681 		return INT_MIN;
1682 
1683 	return rq_prio(list_next_entry(rq, sched.link));
1684 }
1685 
1686 static inline unsigned long
1687 timeslice(const struct intel_engine_cs *engine)
1688 {
1689 	return READ_ONCE(engine->props.timeslice_duration_ms);
1690 }
1691 
1692 static unsigned long
1693 active_timeslice(const struct intel_engine_cs *engine)
1694 {
1695 	const struct i915_request *rq = *engine->execlists.active;
1696 
1697 	if (!rq || i915_request_completed(rq))
1698 		return 0;
1699 
1700 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1701 		return 0;
1702 
1703 	return timeslice(engine);
1704 }
1705 
1706 static void set_timeslice(struct intel_engine_cs *engine)
1707 {
1708 	if (!intel_engine_has_timeslices(engine))
1709 		return;
1710 
1711 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1712 }
1713 
1714 static void record_preemption(struct intel_engine_execlists *execlists)
1715 {
1716 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1717 }
1718 
1719 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1720 {
1721 	struct i915_request *rq;
1722 
1723 	rq = last_active(&engine->execlists);
1724 	if (!rq)
1725 		return 0;
1726 
1727 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1728 	if (unlikely(intel_context_is_banned(rq->context)))
1729 		return 1;
1730 
1731 	return READ_ONCE(engine->props.preempt_timeout_ms);
1732 }
1733 
1734 static void set_preempt_timeout(struct intel_engine_cs *engine)
1735 {
1736 	if (!intel_engine_has_preempt_reset(engine))
1737 		return;
1738 
1739 	set_timer_ms(&engine->execlists.preempt,
1740 		     active_preempt_timeout(engine));
1741 }
1742 
1743 static inline void clear_ports(struct i915_request **ports, int count)
1744 {
1745 	memset_p((void **)ports, NULL, count);
1746 }
1747 
1748 static void execlists_dequeue(struct intel_engine_cs *engine)
1749 {
1750 	struct intel_engine_execlists * const execlists = &engine->execlists;
1751 	struct i915_request **port = execlists->pending;
1752 	struct i915_request ** const last_port = port + execlists->port_mask;
1753 	struct i915_request *last;
1754 	struct rb_node *rb;
1755 	bool submit = false;
1756 
1757 	/*
1758 	 * Hardware submission is through 2 ports. Conceptually each port
1759 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1760 	 * static for a context, and unique to each, so we only execute
1761 	 * requests belonging to a single context from each ring. RING_HEAD
1762 	 * is maintained by the CS in the context image, it marks the place
1763 	 * where it got up to last time, and through RING_TAIL we tell the CS
1764 	 * where we want to execute up to this time.
1765 	 *
1766 	 * In this list the requests are in order of execution. Consecutive
1767 	 * requests from the same context are adjacent in the ringbuffer. We
1768 	 * can combine these requests into a single RING_TAIL update:
1769 	 *
1770 	 *              RING_HEAD...req1...req2
1771 	 *                                    ^- RING_TAIL
1772 	 * since to execute req2 the CS must first execute req1.
1773 	 *
1774 	 * Our goal then is to point each port to the end of a consecutive
1775 	 * sequence of requests as being the most optimal (fewest wake ups
1776 	 * and context switches) submission.
1777 	 */
1778 
1779 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1780 		struct virtual_engine *ve =
1781 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1782 		struct i915_request *rq = READ_ONCE(ve->request);
1783 
1784 		if (!rq) { /* lazily cleanup after another engine handled rq */
1785 			rb_erase_cached(rb, &execlists->virtual);
1786 			RB_CLEAR_NODE(rb);
1787 			rb = rb_first_cached(&execlists->virtual);
1788 			continue;
1789 		}
1790 
1791 		if (!virtual_matches(ve, rq, engine)) {
1792 			rb = rb_next(rb);
1793 			continue;
1794 		}
1795 
1796 		break;
1797 	}
1798 
1799 	/*
1800 	 * If the queue is higher priority than the last
1801 	 * request in the currently active context, submit afresh.
1802 	 * We will resubmit again afterwards in case we need to split
1803 	 * the active context to interject the preemption request,
1804 	 * i.e. we will retrigger preemption following the ack in case
1805 	 * of trouble.
1806 	 */
1807 	last = last_active(execlists);
1808 	if (last) {
1809 		if (need_preempt(engine, last, rb)) {
1810 			ENGINE_TRACE(engine,
1811 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1812 				     last->fence.context,
1813 				     last->fence.seqno,
1814 				     last->sched.attr.priority,
1815 				     execlists->queue_priority_hint);
1816 			record_preemption(execlists);
1817 
1818 			/*
1819 			 * Don't let the RING_HEAD advance past the breadcrumb
1820 			 * as we unwind (and until we resubmit) so that we do
1821 			 * not accidentally tell it to go backwards.
1822 			 */
1823 			ring_set_paused(engine, 1);
1824 
1825 			/*
1826 			 * Note that we have not stopped the GPU at this point,
1827 			 * so we are unwinding the incomplete requests as they
1828 			 * remain inflight and so by the time we do complete
1829 			 * the preemption, some of the unwound requests may
1830 			 * complete!
1831 			 */
1832 			__unwind_incomplete_requests(engine);
1833 
1834 			/*
1835 			 * If we need to return to the preempted context, we
1836 			 * need to skip the lite-restore and force it to
1837 			 * reload the RING_TAIL. Otherwise, the HW has a
1838 			 * tendency to ignore us rewinding the TAIL to the
1839 			 * end of an earlier request.
1840 			 */
1841 			last->context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1842 			last = NULL;
1843 		} else if (need_timeslice(engine, last) &&
1844 			   timer_expired(&engine->execlists.timer)) {
1845 			ENGINE_TRACE(engine,
1846 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
1847 				     last->fence.context,
1848 				     last->fence.seqno,
1849 				     last->sched.attr.priority,
1850 				     execlists->queue_priority_hint);
1851 
1852 			ring_set_paused(engine, 1);
1853 			defer_active(engine);
1854 
1855 			/*
1856 			 * Unlike for preemption, if we rewind and continue
1857 			 * executing the same context as previously active,
1858 			 * the order of execution will remain the same and
1859 			 * the tail will only advance. We do not need to
1860 			 * force a full context restore, as a lite-restore
1861 			 * is sufficient to resample the monotonic TAIL.
1862 			 *
1863 			 * If we switch to any other context, similarly we
1864 			 * will not rewind TAIL of current context, and
1865 			 * normal save/restore will preserve state and allow
1866 			 * us to later continue executing the same request.
1867 			 */
1868 			last = NULL;
1869 		} else {
1870 			/*
1871 			 * Otherwise if we already have a request pending
1872 			 * for execution after the current one, we can
1873 			 * just wait until the next CS event before
1874 			 * queuing more. In either case we will force a
1875 			 * lite-restore preemption event, but if we wait
1876 			 * we hopefully coalesce several updates into a single
1877 			 * submission.
1878 			 */
1879 			if (!list_is_last(&last->sched.link,
1880 					  &engine->active.requests)) {
1881 				/*
1882 				 * Even if ELSP[1] is occupied and not worthy
1883 				 * of timeslices, our queue might be.
1884 				 */
1885 				if (!execlists->timer.expires &&
1886 				    need_timeslice(engine, last))
1887 					set_timer_ms(&execlists->timer,
1888 						     timeslice(engine));
1889 
1890 				return;
1891 			}
1892 		}
1893 	}
1894 
1895 	while (rb) { /* XXX virtual is always taking precedence */
1896 		struct virtual_engine *ve =
1897 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1898 		struct i915_request *rq;
1899 
1900 		spin_lock(&ve->base.active.lock);
1901 
1902 		rq = ve->request;
1903 		if (unlikely(!rq)) { /* lost the race to a sibling */
1904 			spin_unlock(&ve->base.active.lock);
1905 			rb_erase_cached(rb, &execlists->virtual);
1906 			RB_CLEAR_NODE(rb);
1907 			rb = rb_first_cached(&execlists->virtual);
1908 			continue;
1909 		}
1910 
1911 		GEM_BUG_ON(rq != ve->request);
1912 		GEM_BUG_ON(rq->engine != &ve->base);
1913 		GEM_BUG_ON(rq->context != &ve->context);
1914 
1915 		if (rq_prio(rq) >= queue_prio(execlists)) {
1916 			if (!virtual_matches(ve, rq, engine)) {
1917 				spin_unlock(&ve->base.active.lock);
1918 				rb = rb_next(rb);
1919 				continue;
1920 			}
1921 
1922 			if (last && !can_merge_rq(last, rq)) {
1923 				spin_unlock(&ve->base.active.lock);
1924 				return; /* leave this for another */
1925 			}
1926 
1927 			ENGINE_TRACE(engine,
1928 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
1929 				     rq->fence.context,
1930 				     rq->fence.seqno,
1931 				     i915_request_completed(rq) ? "!" :
1932 				     i915_request_started(rq) ? "*" :
1933 				     "",
1934 				     yesno(engine != ve->siblings[0]));
1935 
1936 			ve->request = NULL;
1937 			ve->base.execlists.queue_priority_hint = INT_MIN;
1938 			rb_erase_cached(rb, &execlists->virtual);
1939 			RB_CLEAR_NODE(rb);
1940 
1941 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1942 			rq->engine = engine;
1943 
1944 			if (engine != ve->siblings[0]) {
1945 				u32 *regs = ve->context.lrc_reg_state;
1946 				unsigned int n;
1947 
1948 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1949 
1950 				if (!intel_engine_has_relative_mmio(engine))
1951 					virtual_update_register_offsets(regs,
1952 									engine);
1953 
1954 				if (!list_empty(&ve->context.signals))
1955 					virtual_xfer_breadcrumbs(ve, engine);
1956 
1957 				/*
1958 				 * Move the bound engine to the top of the list
1959 				 * for future execution. We then kick this
1960 				 * tasklet first before checking others, so that
1961 				 * we preferentially reuse this set of bound
1962 				 * registers.
1963 				 */
1964 				for (n = 1; n < ve->num_siblings; n++) {
1965 					if (ve->siblings[n] == engine) {
1966 						swap(ve->siblings[n],
1967 						     ve->siblings[0]);
1968 						break;
1969 					}
1970 				}
1971 
1972 				GEM_BUG_ON(ve->siblings[0] != engine);
1973 			}
1974 
1975 			if (__i915_request_submit(rq)) {
1976 				submit = true;
1977 				last = rq;
1978 			}
1979 			i915_request_put(rq);
1980 
1981 			/*
1982 			 * Hmm, we have a bunch of virtual engine requests,
1983 			 * but the first one was already completed (thanks
1984 			 * preempt-to-busy!). Keep looking at the veng queue
1985 			 * until we have no more relevant requests (i.e.
1986 			 * the normal submit queue has higher priority).
1987 			 */
1988 			if (!submit) {
1989 				spin_unlock(&ve->base.active.lock);
1990 				rb = rb_first_cached(&execlists->virtual);
1991 				continue;
1992 			}
1993 		}
1994 
1995 		spin_unlock(&ve->base.active.lock);
1996 		break;
1997 	}
1998 
1999 	while ((rb = rb_first_cached(&execlists->queue))) {
2000 		struct i915_priolist *p = to_priolist(rb);
2001 		struct i915_request *rq, *rn;
2002 		int i;
2003 
2004 		priolist_for_each_request_consume(rq, rn, p, i) {
2005 			bool merge = true;
2006 
2007 			/*
2008 			 * Can we combine this request with the current port?
2009 			 * It has to be the same context/ringbuffer and not
2010 			 * have any exceptions (e.g. GVT saying never to
2011 			 * combine contexts).
2012 			 *
2013 			 * If we can combine the requests, we can execute both
2014 			 * by updating the RING_TAIL to point to the end of the
2015 			 * second request, and so we never need to tell the
2016 			 * hardware about the first.
2017 			 */
2018 			if (last && !can_merge_rq(last, rq)) {
2019 				/*
2020 				 * If we are on the second port and cannot
2021 				 * combine this request with the last, then we
2022 				 * are done.
2023 				 */
2024 				if (port == last_port)
2025 					goto done;
2026 
2027 				/*
2028 				 * We must not populate both ELSP[] with the
2029 				 * same LRCA, i.e. we must submit 2 different
2030 				 * contexts if we submit 2 ELSP.
2031 				 */
2032 				if (last->context == rq->context)
2033 					goto done;
2034 
2035 				if (i915_request_has_sentinel(last))
2036 					goto done;
2037 
2038 				/*
2039 				 * If GVT overrides us we only ever submit
2040 				 * port[0], leaving port[1] empty. Note that we
2041 				 * also have to be careful that we don't queue
2042 				 * the same context (even though a different
2043 				 * request) to the second port.
2044 				 */
2045 				if (ctx_single_port_submission(last->context) ||
2046 				    ctx_single_port_submission(rq->context))
2047 					goto done;
2048 
2049 				merge = false;
2050 			}
2051 
2052 			if (__i915_request_submit(rq)) {
2053 				if (!merge) {
2054 					*port = execlists_schedule_in(last, port - execlists->pending);
2055 					port++;
2056 					last = NULL;
2057 				}
2058 
2059 				GEM_BUG_ON(last &&
2060 					   !can_merge_ctx(last->context,
2061 							  rq->context));
2062 
2063 				submit = true;
2064 				last = rq;
2065 			}
2066 		}
2067 
2068 		rb_erase_cached(&p->node, &execlists->queue);
2069 		i915_priolist_free(p);
2070 	}
2071 
2072 done:
2073 	/*
2074 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2075 	 *
2076 	 * We choose the priority hint such that if we add a request of greater
2077 	 * priority than this, we kick the submission tasklet to decide on
2078 	 * the right order of submitting the requests to hardware. We must
2079 	 * also be prepared to reorder requests as they are in-flight on the
2080 	 * HW. We derive the priority hint then as the first "hole" in
2081 	 * the HW submission ports and if there are no available slots,
2082 	 * the priority of the lowest executing request, i.e. last.
2083 	 *
2084 	 * When we do receive a higher priority request ready to run from the
2085 	 * user, see queue_request(), the priority hint is bumped to that
2086 	 * request triggering preemption on the next dequeue (or subsequent
2087 	 * interrupt for secondary ports).
2088 	 */
2089 	execlists->queue_priority_hint = queue_prio(execlists);
2090 
2091 	if (submit) {
2092 		*port = execlists_schedule_in(last, port - execlists->pending);
2093 		execlists->switch_priority_hint =
2094 			switch_prio(engine, *execlists->pending);
2095 
2096 		/*
2097 		 * Skip if we ended up with exactly the same set of requests,
2098 		 * e.g. trying to timeslice a pair of ordered contexts
2099 		 */
2100 		if (!memcmp(execlists->active, execlists->pending,
2101 			    (port - execlists->pending + 1) * sizeof(*port))) {
2102 			do
2103 				execlists_schedule_out(fetch_and_zero(port));
2104 			while (port-- != execlists->pending);
2105 
2106 			goto skip_submit;
2107 		}
2108 		clear_ports(port + 1, last_port - port);
2109 
2110 		execlists_submit_ports(engine);
2111 		set_preempt_timeout(engine);
2112 	} else {
2113 skip_submit:
2114 		ring_set_paused(engine, 0);
2115 	}
2116 }
2117 
2118 static void
2119 cancel_port_requests(struct intel_engine_execlists * const execlists)
2120 {
2121 	struct i915_request * const *port;
2122 
2123 	for (port = execlists->pending; *port; port++)
2124 		execlists_schedule_out(*port);
2125 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2126 
2127 	/* Mark the end of active before we overwrite *active */
2128 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2129 		execlists_schedule_out(*port);
2130 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2131 
2132 	WRITE_ONCE(execlists->active, execlists->inflight);
2133 }
2134 
2135 static inline void
2136 invalidate_csb_entries(const u32 *first, const u32 *last)
2137 {
2138 	clflush((void *)first);
2139 	clflush((void *)last);
2140 }
2141 
2142 static inline bool
2143 reset_in_progress(const struct intel_engine_execlists *execlists)
2144 {
2145 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2146 }
2147 
2148 /*
2149  * Starting with Gen12, the status has a new format:
2150  *
2151  *     bit  0:     switched to new queue
2152  *     bit  1:     reserved
2153  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2154  *                 switch detail is set to "wait on semaphore"
2155  *     bits 3-5:   engine class
2156  *     bits 6-11:  engine instance
2157  *     bits 12-14: reserved
2158  *     bits 15-25: sw context id of the lrc the GT switched to
2159  *     bits 26-31: sw counter of the lrc the GT switched to
2160  *     bits 32-35: context switch detail
2161  *                  - 0: ctx complete
2162  *                  - 1: wait on sync flip
2163  *                  - 2: wait on vblank
2164  *                  - 3: wait on scanline
2165  *                  - 4: wait on semaphore
2166  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2167  *                       WAIT_FOR_EVENT)
2168  *     bit  36:    reserved
2169  *     bits 37-43: wait detail (for switch detail 1 to 4)
2170  *     bits 44-46: reserved
2171  *     bits 47-57: sw context id of the lrc the GT switched away from
2172  *     bits 58-63: sw counter of the lrc the GT switched away from
2173  */
2174 static inline bool
2175 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2176 {
2177 	u32 lower_dw = csb[0];
2178 	u32 upper_dw = csb[1];
2179 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2180 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2181 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2182 
2183 	/*
2184 	 * The context switch detail is not guaranteed to be 5 when a preemption
2185 	 * occurs, so we can't just check for that. The check below works for
2186 	 * all the cases we care about, including preemptions of WAIT
2187 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2188 	 * would require some extra handling, but we don't support that.
2189 	 */
2190 	if (!ctx_away_valid || new_queue) {
2191 		GEM_BUG_ON(!ctx_to_valid);
2192 		return true;
2193 	}
2194 
2195 	/*
2196 	 * switch detail = 5 is covered by the case above and we do not expect a
2197 	 * context switch on an unsuccessful wait instruction since we always
2198 	 * use polling mode.
2199 	 */
2200 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2201 	return false;
2202 }
2203 
2204 static inline bool
2205 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2206 {
2207 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2208 }
2209 
2210 static void process_csb(struct intel_engine_cs *engine)
2211 {
2212 	struct intel_engine_execlists * const execlists = &engine->execlists;
2213 	const u32 * const buf = execlists->csb_status;
2214 	const u8 num_entries = execlists->csb_size;
2215 	u8 head, tail;
2216 
2217 	/*
2218 	 * As we modify our execlists state tracking we require exclusive
2219 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2220 	 * and we assume that is only inside the reset paths and so serialised.
2221 	 */
2222 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2223 		   !reset_in_progress(execlists));
2224 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2225 
2226 	/*
2227 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2228 	 * When reading from the csb_write mmio register, we have to be
2229 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2230 	 * the low 4bits. As it happens we know the next 4bits are always
2231 	 * zero and so we can simply masked off the low u8 of the register
2232 	 * and treat it identically to reading from the HWSP (without having
2233 	 * to use explicit shifting and masking, and probably bifurcating
2234 	 * the code to handle the legacy mmio read).
2235 	 */
2236 	head = execlists->csb_head;
2237 	tail = READ_ONCE(*execlists->csb_write);
2238 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2239 	if (unlikely(head == tail))
2240 		return;
2241 
2242 	/*
2243 	 * Hopefully paired with a wmb() in HW!
2244 	 *
2245 	 * We must complete the read of the write pointer before any reads
2246 	 * from the CSB, so that we do not see stale values. Without an rmb
2247 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2248 	 * we perform the READ_ONCE(*csb_write).
2249 	 */
2250 	rmb();
2251 
2252 	do {
2253 		bool promote;
2254 
2255 		if (++head == num_entries)
2256 			head = 0;
2257 
2258 		/*
2259 		 * We are flying near dragons again.
2260 		 *
2261 		 * We hold a reference to the request in execlist_port[]
2262 		 * but no more than that. We are operating in softirq
2263 		 * context and so cannot hold any mutex or sleep. That
2264 		 * prevents us stopping the requests we are processing
2265 		 * in port[] from being retired simultaneously (the
2266 		 * breadcrumb will be complete before we see the
2267 		 * context-switch). As we only hold the reference to the
2268 		 * request, any pointer chasing underneath the request
2269 		 * is subject to a potential use-after-free. Thus we
2270 		 * store all of the bookkeeping within port[] as
2271 		 * required, and avoid using unguarded pointers beneath
2272 		 * request itself. The same applies to the atomic
2273 		 * status notifier.
2274 		 */
2275 
2276 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2277 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2278 
2279 		if (INTEL_GEN(engine->i915) >= 12)
2280 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2281 		else
2282 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2283 		if (promote) {
2284 			struct i915_request * const *old = execlists->active;
2285 
2286 			/* Point active to the new ELSP; prevent overwriting */
2287 			WRITE_ONCE(execlists->active, execlists->pending);
2288 
2289 			if (!inject_preempt_hang(execlists))
2290 				ring_set_paused(engine, 0);
2291 
2292 			/* cancel old inflight, prepare for switch */
2293 			trace_ports(execlists, "preempted", old);
2294 			while (*old)
2295 				execlists_schedule_out(*old++);
2296 
2297 			/* switch pending to inflight */
2298 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2299 			WRITE_ONCE(execlists->active,
2300 				   memcpy(execlists->inflight,
2301 					  execlists->pending,
2302 					  execlists_num_ports(execlists) *
2303 					  sizeof(*execlists->pending)));
2304 
2305 			WRITE_ONCE(execlists->pending[0], NULL);
2306 		} else {
2307 			GEM_BUG_ON(!*execlists->active);
2308 
2309 			/* port0 completed, advanced to port1 */
2310 			trace_ports(execlists, "completed", execlists->active);
2311 
2312 			/*
2313 			 * We rely on the hardware being strongly
2314 			 * ordered, that the breadcrumb write is
2315 			 * coherent (visible from the CPU) before the
2316 			 * user interrupt and CSB is processed.
2317 			 */
2318 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2319 				   !reset_in_progress(execlists));
2320 			execlists_schedule_out(*execlists->active++);
2321 
2322 			GEM_BUG_ON(execlists->active - execlists->inflight >
2323 				   execlists_num_ports(execlists));
2324 		}
2325 	} while (head != tail);
2326 
2327 	execlists->csb_head = head;
2328 	set_timeslice(engine);
2329 
2330 	/*
2331 	 * Gen11 has proven to fail wrt global observation point between
2332 	 * entry and tail update, failing on the ordering and thus
2333 	 * we see an old entry in the context status buffer.
2334 	 *
2335 	 * Forcibly evict out entries for the next gpu csb update,
2336 	 * to increase the odds that we get a fresh entries with non
2337 	 * working hardware. The cost for doing so comes out mostly with
2338 	 * the wash as hardware, working or not, will need to do the
2339 	 * invalidation before.
2340 	 */
2341 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2342 }
2343 
2344 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2345 {
2346 	lockdep_assert_held(&engine->active.lock);
2347 	if (!engine->execlists.pending[0]) {
2348 		rcu_read_lock(); /* protect peeking at execlists->active */
2349 		execlists_dequeue(engine);
2350 		rcu_read_unlock();
2351 	}
2352 }
2353 
2354 static noinline void preempt_reset(struct intel_engine_cs *engine)
2355 {
2356 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2357 	unsigned long *lock = &engine->gt->reset.flags;
2358 
2359 	if (i915_modparams.reset < 3)
2360 		return;
2361 
2362 	if (test_and_set_bit(bit, lock))
2363 		return;
2364 
2365 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2366 	tasklet_disable_nosync(&engine->execlists.tasklet);
2367 
2368 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2369 		     READ_ONCE(engine->props.preempt_timeout_ms),
2370 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2371 	intel_engine_reset(engine, "preemption time out");
2372 
2373 	tasklet_enable(&engine->execlists.tasklet);
2374 	clear_and_wake_up_bit(bit, lock);
2375 }
2376 
2377 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2378 {
2379 	const struct timer_list *t = &engine->execlists.preempt;
2380 
2381 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2382 		return false;
2383 
2384 	if (!timer_expired(t))
2385 		return false;
2386 
2387 	return READ_ONCE(engine->execlists.pending[0]);
2388 }
2389 
2390 /*
2391  * Check the unread Context Status Buffers and manage the submission of new
2392  * contexts to the ELSP accordingly.
2393  */
2394 static void execlists_submission_tasklet(unsigned long data)
2395 {
2396 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2397 	bool timeout = preempt_timeout(engine);
2398 
2399 	process_csb(engine);
2400 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2401 		unsigned long flags;
2402 
2403 		spin_lock_irqsave(&engine->active.lock, flags);
2404 		__execlists_submission_tasklet(engine);
2405 		spin_unlock_irqrestore(&engine->active.lock, flags);
2406 
2407 		/* Recheck after serialising with direct-submission */
2408 		if (timeout && preempt_timeout(engine))
2409 			preempt_reset(engine);
2410 	}
2411 }
2412 
2413 static void __execlists_kick(struct intel_engine_execlists *execlists)
2414 {
2415 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2416 	tasklet_hi_schedule(&execlists->tasklet);
2417 }
2418 
2419 #define execlists_kick(t, member) \
2420 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2421 
2422 static void execlists_timeslice(struct timer_list *timer)
2423 {
2424 	execlists_kick(timer, timer);
2425 }
2426 
2427 static void execlists_preempt(struct timer_list *timer)
2428 {
2429 	execlists_kick(timer, preempt);
2430 }
2431 
2432 static void queue_request(struct intel_engine_cs *engine,
2433 			  struct i915_sched_node *node,
2434 			  int prio)
2435 {
2436 	GEM_BUG_ON(!list_empty(&node->link));
2437 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2438 }
2439 
2440 static void __submit_queue_imm(struct intel_engine_cs *engine)
2441 {
2442 	struct intel_engine_execlists * const execlists = &engine->execlists;
2443 
2444 	if (reset_in_progress(execlists))
2445 		return; /* defer until we restart the engine following reset */
2446 
2447 	if (execlists->tasklet.func == execlists_submission_tasklet)
2448 		__execlists_submission_tasklet(engine);
2449 	else
2450 		tasklet_hi_schedule(&execlists->tasklet);
2451 }
2452 
2453 static void submit_queue(struct intel_engine_cs *engine,
2454 			 const struct i915_request *rq)
2455 {
2456 	struct intel_engine_execlists *execlists = &engine->execlists;
2457 
2458 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2459 		return;
2460 
2461 	execlists->queue_priority_hint = rq_prio(rq);
2462 	__submit_queue_imm(engine);
2463 }
2464 
2465 static void execlists_submit_request(struct i915_request *request)
2466 {
2467 	struct intel_engine_cs *engine = request->engine;
2468 	unsigned long flags;
2469 
2470 	/* Will be called from irq-context when using foreign fences. */
2471 	spin_lock_irqsave(&engine->active.lock, flags);
2472 
2473 	queue_request(engine, &request->sched, rq_prio(request));
2474 
2475 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2476 	GEM_BUG_ON(list_empty(&request->sched.link));
2477 
2478 	submit_queue(engine, request);
2479 
2480 	spin_unlock_irqrestore(&engine->active.lock, flags);
2481 }
2482 
2483 static void __execlists_context_fini(struct intel_context *ce)
2484 {
2485 	intel_ring_put(ce->ring);
2486 	i915_vma_put(ce->state);
2487 }
2488 
2489 static void execlists_context_destroy(struct kref *kref)
2490 {
2491 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2492 
2493 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2494 	GEM_BUG_ON(intel_context_is_pinned(ce));
2495 
2496 	if (ce->state)
2497 		__execlists_context_fini(ce);
2498 
2499 	intel_context_fini(ce);
2500 	intel_context_free(ce);
2501 }
2502 
2503 static void
2504 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2505 {
2506 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2507 		return;
2508 
2509 	vaddr += engine->context_size;
2510 
2511 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2512 }
2513 
2514 static void
2515 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2516 {
2517 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2518 		return;
2519 
2520 	vaddr += engine->context_size;
2521 
2522 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2523 		dev_err_once(engine->i915->drm.dev,
2524 			     "%s context redzone overwritten!\n",
2525 			     engine->name);
2526 }
2527 
2528 static void execlists_context_unpin(struct intel_context *ce)
2529 {
2530 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2531 		      ce->engine);
2532 
2533 	i915_gem_object_unpin_map(ce->state->obj);
2534 	intel_ring_reset(ce->ring, ce->ring->tail);
2535 }
2536 
2537 static void
2538 __execlists_update_reg_state(const struct intel_context *ce,
2539 			     const struct intel_engine_cs *engine)
2540 {
2541 	struct intel_ring *ring = ce->ring;
2542 	u32 *regs = ce->lrc_reg_state;
2543 
2544 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2545 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2546 
2547 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2548 	regs[CTX_RING_HEAD] = ring->head;
2549 	regs[CTX_RING_TAIL] = ring->tail;
2550 
2551 	/* RPCS */
2552 	if (engine->class == RENDER_CLASS) {
2553 		regs[CTX_R_PWR_CLK_STATE] =
2554 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2555 
2556 		i915_oa_init_reg_state(ce, engine);
2557 	}
2558 }
2559 
2560 static int
2561 __execlists_context_pin(struct intel_context *ce,
2562 			struct intel_engine_cs *engine)
2563 {
2564 	void *vaddr;
2565 
2566 	GEM_BUG_ON(!ce->state);
2567 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2568 
2569 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2570 					i915_coherent_map_type(engine->i915) |
2571 					I915_MAP_OVERRIDE);
2572 	if (IS_ERR(vaddr))
2573 		return PTR_ERR(vaddr);
2574 
2575 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2576 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2577 	__execlists_update_reg_state(ce, engine);
2578 
2579 	return 0;
2580 }
2581 
2582 static int execlists_context_pin(struct intel_context *ce)
2583 {
2584 	return __execlists_context_pin(ce, ce->engine);
2585 }
2586 
2587 static int execlists_context_alloc(struct intel_context *ce)
2588 {
2589 	return __execlists_context_alloc(ce, ce->engine);
2590 }
2591 
2592 static void execlists_context_reset(struct intel_context *ce)
2593 {
2594 	CE_TRACE(ce, "reset\n");
2595 	GEM_BUG_ON(!intel_context_is_pinned(ce));
2596 
2597 	/*
2598 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2599 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2600 	 * that stored in context. As we only write new commands from
2601 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2602 	 * starts reading from its RING_HEAD from the context, it may try to
2603 	 * execute that junk and die.
2604 	 *
2605 	 * The contexts that are stilled pinned on resume belong to the
2606 	 * kernel, and are local to each engine. All other contexts will
2607 	 * have their head/tail sanitized upon pinning before use, so they
2608 	 * will never see garbage,
2609 	 *
2610 	 * So to avoid that we reset the context images upon resume. For
2611 	 * simplicity, we just zero everything out.
2612 	 */
2613 	intel_ring_reset(ce->ring, ce->ring->emit);
2614 
2615 	/* Scrub away the garbage */
2616 	execlists_init_reg_state(ce->lrc_reg_state,
2617 				 ce, ce->engine, ce->ring, true);
2618 	__execlists_update_reg_state(ce, ce->engine);
2619 
2620 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2621 }
2622 
2623 static const struct intel_context_ops execlists_context_ops = {
2624 	.alloc = execlists_context_alloc,
2625 
2626 	.pin = execlists_context_pin,
2627 	.unpin = execlists_context_unpin,
2628 
2629 	.enter = intel_context_enter_engine,
2630 	.exit = intel_context_exit_engine,
2631 
2632 	.reset = execlists_context_reset,
2633 	.destroy = execlists_context_destroy,
2634 };
2635 
2636 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2637 {
2638 	u32 *cs;
2639 
2640 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2641 
2642 	cs = intel_ring_begin(rq, 6);
2643 	if (IS_ERR(cs))
2644 		return PTR_ERR(cs);
2645 
2646 	/*
2647 	 * Check if we have been preempted before we even get started.
2648 	 *
2649 	 * After this point i915_request_started() reports true, even if
2650 	 * we get preempted and so are no longer running.
2651 	 */
2652 	*cs++ = MI_ARB_CHECK;
2653 	*cs++ = MI_NOOP;
2654 
2655 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2656 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2657 	*cs++ = 0;
2658 	*cs++ = rq->fence.seqno - 1;
2659 
2660 	intel_ring_advance(rq, cs);
2661 
2662 	/* Record the updated position of the request's payload */
2663 	rq->infix = intel_ring_offset(rq, cs);
2664 
2665 	return 0;
2666 }
2667 
2668 static int execlists_request_alloc(struct i915_request *request)
2669 {
2670 	int ret;
2671 
2672 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
2673 
2674 	/*
2675 	 * Flush enough space to reduce the likelihood of waiting after
2676 	 * we start building the request - in which case we will just
2677 	 * have to repeat work.
2678 	 */
2679 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2680 
2681 	/*
2682 	 * Note that after this point, we have committed to using
2683 	 * this request as it is being used to both track the
2684 	 * state of engine initialisation and liveness of the
2685 	 * golden renderstate above. Think twice before you try
2686 	 * to cancel/unwind this request now.
2687 	 */
2688 
2689 	/* Unconditionally invalidate GPU caches and TLBs. */
2690 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2691 	if (ret)
2692 		return ret;
2693 
2694 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2695 	return 0;
2696 }
2697 
2698 /*
2699  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2700  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2701  * but there is a slight complication as this is applied in WA batch where the
2702  * values are only initialized once so we cannot take register value at the
2703  * beginning and reuse it further; hence we save its value to memory, upload a
2704  * constant value with bit21 set and then we restore it back with the saved value.
2705  * To simplify the WA, a constant value is formed by using the default value
2706  * of this register. This shouldn't be a problem because we are only modifying
2707  * it for a short period and this batch in non-premptible. We can ofcourse
2708  * use additional instructions that read the actual value of the register
2709  * at that time and set our bit of interest but it makes the WA complicated.
2710  *
2711  * This WA is also required for Gen9 so extracting as a function avoids
2712  * code duplication.
2713  */
2714 static u32 *
2715 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2716 {
2717 	/* NB no one else is allowed to scribble over scratch + 256! */
2718 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2719 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2720 	*batch++ = intel_gt_scratch_offset(engine->gt,
2721 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2722 	*batch++ = 0;
2723 
2724 	*batch++ = MI_LOAD_REGISTER_IMM(1);
2725 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2726 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2727 
2728 	batch = gen8_emit_pipe_control(batch,
2729 				       PIPE_CONTROL_CS_STALL |
2730 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
2731 				       0);
2732 
2733 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2734 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2735 	*batch++ = intel_gt_scratch_offset(engine->gt,
2736 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2737 	*batch++ = 0;
2738 
2739 	return batch;
2740 }
2741 
2742 /*
2743  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2744  * initialized at the beginning and shared across all contexts but this field
2745  * helps us to have multiple batches at different offsets and select them based
2746  * on a criteria. At the moment this batch always start at the beginning of the page
2747  * and at this point we don't have multiple wa_ctx batch buffers.
2748  *
2749  * The number of WA applied are not known at the beginning; we use this field
2750  * to return the no of DWORDS written.
2751  *
2752  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2753  * so it adds NOOPs as padding to make it cacheline aligned.
2754  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
2755  * makes a complete batch buffer.
2756  */
2757 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2758 {
2759 	/* WaDisableCtxRestoreArbitration:bdw,chv */
2760 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2761 
2762 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2763 	if (IS_BROADWELL(engine->i915))
2764 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2765 
2766 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2767 	/* Actual scratch location is at 128 bytes offset */
2768 	batch = gen8_emit_pipe_control(batch,
2769 				       PIPE_CONTROL_FLUSH_L3 |
2770 				       PIPE_CONTROL_STORE_DATA_INDEX |
2771 				       PIPE_CONTROL_CS_STALL |
2772 				       PIPE_CONTROL_QW_WRITE,
2773 				       LRC_PPHWSP_SCRATCH_ADDR);
2774 
2775 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2776 
2777 	/* Pad to end of cacheline */
2778 	while ((unsigned long)batch % CACHELINE_BYTES)
2779 		*batch++ = MI_NOOP;
2780 
2781 	/*
2782 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2783 	 * execution depends on the length specified in terms of cache lines
2784 	 * in the register CTX_RCS_INDIRECT_CTX
2785 	 */
2786 
2787 	return batch;
2788 }
2789 
2790 struct lri {
2791 	i915_reg_t reg;
2792 	u32 value;
2793 };
2794 
2795 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2796 {
2797 	GEM_BUG_ON(!count || count > 63);
2798 
2799 	*batch++ = MI_LOAD_REGISTER_IMM(count);
2800 	do {
2801 		*batch++ = i915_mmio_reg_offset(lri->reg);
2802 		*batch++ = lri->value;
2803 	} while (lri++, --count);
2804 	*batch++ = MI_NOOP;
2805 
2806 	return batch;
2807 }
2808 
2809 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2810 {
2811 	static const struct lri lri[] = {
2812 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2813 		{
2814 			COMMON_SLICE_CHICKEN2,
2815 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2816 				       0),
2817 		},
2818 
2819 		/* BSpec: 11391 */
2820 		{
2821 			FF_SLICE_CHICKEN,
2822 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2823 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2824 		},
2825 
2826 		/* BSpec: 11299 */
2827 		{
2828 			_3D_CHICKEN3,
2829 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2830 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2831 		}
2832 	};
2833 
2834 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2835 
2836 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2837 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2838 
2839 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
2840 	batch = gen8_emit_pipe_control(batch,
2841 				       PIPE_CONTROL_FLUSH_L3 |
2842 				       PIPE_CONTROL_STORE_DATA_INDEX |
2843 				       PIPE_CONTROL_CS_STALL |
2844 				       PIPE_CONTROL_QW_WRITE,
2845 				       LRC_PPHWSP_SCRATCH_ADDR);
2846 
2847 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2848 
2849 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
2850 	if (HAS_POOLED_EU(engine->i915)) {
2851 		/*
2852 		 * EU pool configuration is setup along with golden context
2853 		 * during context initialization. This value depends on
2854 		 * device type (2x6 or 3x6) and needs to be updated based
2855 		 * on which subslice is disabled especially for 2x6
2856 		 * devices, however it is safe to load default
2857 		 * configuration of 3x6 device instead of masking off
2858 		 * corresponding bits because HW ignores bits of a disabled
2859 		 * subslice and drops down to appropriate config. Please
2860 		 * see render_state_setup() in i915_gem_render_state.c for
2861 		 * possible configurations, to avoid duplication they are
2862 		 * not shown here again.
2863 		 */
2864 		*batch++ = GEN9_MEDIA_POOL_STATE;
2865 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
2866 		*batch++ = 0x00777000;
2867 		*batch++ = 0;
2868 		*batch++ = 0;
2869 		*batch++ = 0;
2870 	}
2871 
2872 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2873 
2874 	/* Pad to end of cacheline */
2875 	while ((unsigned long)batch % CACHELINE_BYTES)
2876 		*batch++ = MI_NOOP;
2877 
2878 	return batch;
2879 }
2880 
2881 static u32 *
2882 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2883 {
2884 	int i;
2885 
2886 	/*
2887 	 * WaPipeControlBefore3DStateSamplePattern: cnl
2888 	 *
2889 	 * Ensure the engine is idle prior to programming a
2890 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2891 	 */
2892 	batch = gen8_emit_pipe_control(batch,
2893 				       PIPE_CONTROL_CS_STALL,
2894 				       0);
2895 	/*
2896 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2897 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2898 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2899 	 * confusing. Since gen8_emit_pipe_control() already advances the
2900 	 * batch by 6 dwords, we advance the other 10 here, completing a
2901 	 * cacheline. It's not clear if the workaround requires this padding
2902 	 * before other commands, or if it's just the regular padding we would
2903 	 * already have for the workaround bb, so leave it here for now.
2904 	 */
2905 	for (i = 0; i < 10; i++)
2906 		*batch++ = MI_NOOP;
2907 
2908 	/* Pad to end of cacheline */
2909 	while ((unsigned long)batch % CACHELINE_BYTES)
2910 		*batch++ = MI_NOOP;
2911 
2912 	return batch;
2913 }
2914 
2915 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2916 
2917 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2918 {
2919 	struct drm_i915_gem_object *obj;
2920 	struct i915_vma *vma;
2921 	int err;
2922 
2923 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2924 	if (IS_ERR(obj))
2925 		return PTR_ERR(obj);
2926 
2927 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2928 	if (IS_ERR(vma)) {
2929 		err = PTR_ERR(vma);
2930 		goto err;
2931 	}
2932 
2933 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2934 	if (err)
2935 		goto err;
2936 
2937 	engine->wa_ctx.vma = vma;
2938 	return 0;
2939 
2940 err:
2941 	i915_gem_object_put(obj);
2942 	return err;
2943 }
2944 
2945 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2946 {
2947 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2948 }
2949 
2950 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2951 
2952 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2953 {
2954 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2955 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2956 					    &wa_ctx->per_ctx };
2957 	wa_bb_func_t wa_bb_fn[2];
2958 	struct page *page;
2959 	void *batch, *batch_ptr;
2960 	unsigned int i;
2961 	int ret;
2962 
2963 	if (engine->class != RENDER_CLASS)
2964 		return 0;
2965 
2966 	switch (INTEL_GEN(engine->i915)) {
2967 	case 12:
2968 	case 11:
2969 		return 0;
2970 	case 10:
2971 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
2972 		wa_bb_fn[1] = NULL;
2973 		break;
2974 	case 9:
2975 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
2976 		wa_bb_fn[1] = NULL;
2977 		break;
2978 	case 8:
2979 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
2980 		wa_bb_fn[1] = NULL;
2981 		break;
2982 	default:
2983 		MISSING_CASE(INTEL_GEN(engine->i915));
2984 		return 0;
2985 	}
2986 
2987 	ret = lrc_setup_wa_ctx(engine);
2988 	if (ret) {
2989 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2990 		return ret;
2991 	}
2992 
2993 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2994 	batch = batch_ptr = kmap_atomic(page);
2995 
2996 	/*
2997 	 * Emit the two workaround batch buffers, recording the offset from the
2998 	 * start of the workaround batch buffer object for each and their
2999 	 * respective sizes.
3000 	 */
3001 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3002 		wa_bb[i]->offset = batch_ptr - batch;
3003 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3004 						  CACHELINE_BYTES))) {
3005 			ret = -EINVAL;
3006 			break;
3007 		}
3008 		if (wa_bb_fn[i])
3009 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3010 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3011 	}
3012 
3013 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3014 
3015 	kunmap_atomic(batch);
3016 	if (ret)
3017 		lrc_destroy_wa_ctx(engine);
3018 
3019 	return ret;
3020 }
3021 
3022 static void enable_execlists(struct intel_engine_cs *engine)
3023 {
3024 	u32 mode;
3025 
3026 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3027 
3028 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3029 
3030 	if (INTEL_GEN(engine->i915) >= 11)
3031 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3032 	else
3033 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3034 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3035 
3036 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3037 
3038 	ENGINE_WRITE_FW(engine,
3039 			RING_HWS_PGA,
3040 			i915_ggtt_offset(engine->status_page.vma));
3041 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3042 
3043 	engine->context_tag = 0;
3044 }
3045 
3046 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3047 {
3048 	bool unexpected = false;
3049 
3050 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3051 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3052 		unexpected = true;
3053 	}
3054 
3055 	return unexpected;
3056 }
3057 
3058 static int execlists_resume(struct intel_engine_cs *engine)
3059 {
3060 	intel_engine_apply_workarounds(engine);
3061 	intel_engine_apply_whitelist(engine);
3062 
3063 	intel_mocs_init_engine(engine);
3064 
3065 	intel_engine_reset_breadcrumbs(engine);
3066 
3067 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3068 		struct drm_printer p = drm_debug_printer(__func__);
3069 
3070 		intel_engine_dump(engine, &p, NULL);
3071 	}
3072 
3073 	enable_execlists(engine);
3074 
3075 	return 0;
3076 }
3077 
3078 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3079 {
3080 	struct intel_engine_execlists * const execlists = &engine->execlists;
3081 	unsigned long flags;
3082 
3083 	ENGINE_TRACE(engine, "depth<-%d\n",
3084 		     atomic_read(&execlists->tasklet.count));
3085 
3086 	/*
3087 	 * Prevent request submission to the hardware until we have
3088 	 * completed the reset in i915_gem_reset_finish(). If a request
3089 	 * is completed by one engine, it may then queue a request
3090 	 * to a second via its execlists->tasklet *just* as we are
3091 	 * calling engine->resume() and also writing the ELSP.
3092 	 * Turning off the execlists->tasklet until the reset is over
3093 	 * prevents the race.
3094 	 */
3095 	__tasklet_disable_sync_once(&execlists->tasklet);
3096 	GEM_BUG_ON(!reset_in_progress(execlists));
3097 
3098 	/* And flush any current direct submission. */
3099 	spin_lock_irqsave(&engine->active.lock, flags);
3100 	spin_unlock_irqrestore(&engine->active.lock, flags);
3101 
3102 	/*
3103 	 * We stop engines, otherwise we might get failed reset and a
3104 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
3105 	 * from system hang if batchbuffer is progressing when
3106 	 * the reset is issued, regardless of READY_TO_RESET ack.
3107 	 * Thus assume it is best to stop engines on all gens
3108 	 * where we have a gpu reset.
3109 	 *
3110 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3111 	 *
3112 	 * FIXME: Wa for more modern gens needs to be validated
3113 	 */
3114 	intel_engine_stop_cs(engine);
3115 }
3116 
3117 static void reset_csb_pointers(struct intel_engine_cs *engine)
3118 {
3119 	struct intel_engine_execlists * const execlists = &engine->execlists;
3120 	const unsigned int reset_value = execlists->csb_size - 1;
3121 
3122 	ring_set_paused(engine, 0);
3123 
3124 	/*
3125 	 * After a reset, the HW starts writing into CSB entry [0]. We
3126 	 * therefore have to set our HEAD pointer back one entry so that
3127 	 * the *first* entry we check is entry 0. To complicate this further,
3128 	 * as we don't wait for the first interrupt after reset, we have to
3129 	 * fake the HW write to point back to the last entry so that our
3130 	 * inline comparison of our cached head position against the last HW
3131 	 * write works even before the first interrupt.
3132 	 */
3133 	execlists->csb_head = reset_value;
3134 	WRITE_ONCE(*execlists->csb_write, reset_value);
3135 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3136 
3137 	/*
3138 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3139 	 * Bludgeon them with a mmio update to be sure.
3140 	 */
3141 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3142 		     reset_value << 8 | reset_value);
3143 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3144 
3145 	invalidate_csb_entries(&execlists->csb_status[0],
3146 			       &execlists->csb_status[reset_value]);
3147 }
3148 
3149 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3150 {
3151 	int x;
3152 
3153 	x = lrc_ring_mi_mode(engine);
3154 	if (x != -1) {
3155 		regs[x + 1] &= ~STOP_RING;
3156 		regs[x + 1] |= STOP_RING << 16;
3157 	}
3158 }
3159 
3160 static void __execlists_reset_reg_state(const struct intel_context *ce,
3161 					const struct intel_engine_cs *engine)
3162 {
3163 	u32 *regs = ce->lrc_reg_state;
3164 
3165 	__reset_stop_ring(regs, engine);
3166 }
3167 
3168 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3169 {
3170 	struct intel_engine_execlists * const execlists = &engine->execlists;
3171 	struct intel_context *ce;
3172 	struct i915_request *rq;
3173 
3174 	mb(); /* paranoia: read the CSB pointers from after the reset */
3175 	clflush(execlists->csb_write);
3176 	mb();
3177 
3178 	process_csb(engine); /* drain preemption events */
3179 
3180 	/* Following the reset, we need to reload the CSB read/write pointers */
3181 	reset_csb_pointers(engine);
3182 
3183 	/*
3184 	 * Save the currently executing context, even if we completed
3185 	 * its request, it was still running at the time of the
3186 	 * reset and will have been clobbered.
3187 	 */
3188 	rq = execlists_active(execlists);
3189 	if (!rq)
3190 		goto unwind;
3191 
3192 	/* We still have requests in-flight; the engine should be active */
3193 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3194 
3195 	ce = rq->context;
3196 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3197 
3198 	if (i915_request_completed(rq)) {
3199 		/* Idle context; tidy up the ring so we can restart afresh */
3200 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3201 		goto out_replay;
3202 	}
3203 
3204 	/* Context has requests still in-flight; it should not be idle! */
3205 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3206 	rq = active_request(ce->timeline, rq);
3207 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3208 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3209 
3210 	/*
3211 	 * If this request hasn't started yet, e.g. it is waiting on a
3212 	 * semaphore, we need to avoid skipping the request or else we
3213 	 * break the signaling chain. However, if the context is corrupt
3214 	 * the request will not restart and we will be stuck with a wedged
3215 	 * device. It is quite often the case that if we issue a reset
3216 	 * while the GPU is loading the context image, that the context
3217 	 * image becomes corrupt.
3218 	 *
3219 	 * Otherwise, if we have not started yet, the request should replay
3220 	 * perfectly and we do not need to flag the result as being erroneous.
3221 	 */
3222 	if (!i915_request_started(rq))
3223 		goto out_replay;
3224 
3225 	/*
3226 	 * If the request was innocent, we leave the request in the ELSP
3227 	 * and will try to replay it on restarting. The context image may
3228 	 * have been corrupted by the reset, in which case we may have
3229 	 * to service a new GPU hang, but more likely we can continue on
3230 	 * without impact.
3231 	 *
3232 	 * If the request was guilty, we presume the context is corrupt
3233 	 * and have to at least restore the RING register in the context
3234 	 * image back to the expected values to skip over the guilty request.
3235 	 */
3236 	__i915_request_reset(rq, stalled);
3237 	if (!stalled)
3238 		goto out_replay;
3239 
3240 	/*
3241 	 * We want a simple context + ring to execute the breadcrumb update.
3242 	 * We cannot rely on the context being intact across the GPU hang,
3243 	 * so clear it and rebuild just what we need for the breadcrumb.
3244 	 * All pending requests for this context will be zapped, and any
3245 	 * future request will be after userspace has had the opportunity
3246 	 * to recreate its own state.
3247 	 */
3248 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3249 	restore_default_state(ce, engine);
3250 
3251 out_replay:
3252 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3253 		     ce->ring->head, ce->ring->tail);
3254 	intel_ring_update_space(ce->ring);
3255 	__execlists_reset_reg_state(ce, engine);
3256 	__execlists_update_reg_state(ce, engine);
3257 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3258 
3259 unwind:
3260 	/* Push back any incomplete requests for replay after the reset. */
3261 	cancel_port_requests(execlists);
3262 	__unwind_incomplete_requests(engine);
3263 }
3264 
3265 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3266 {
3267 	unsigned long flags;
3268 
3269 	ENGINE_TRACE(engine, "\n");
3270 
3271 	spin_lock_irqsave(&engine->active.lock, flags);
3272 
3273 	__execlists_reset(engine, stalled);
3274 
3275 	spin_unlock_irqrestore(&engine->active.lock, flags);
3276 }
3277 
3278 static void nop_submission_tasklet(unsigned long data)
3279 {
3280 	/* The driver is wedged; don't process any more events. */
3281 }
3282 
3283 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3284 {
3285 	struct intel_engine_execlists * const execlists = &engine->execlists;
3286 	struct i915_request *rq, *rn;
3287 	struct rb_node *rb;
3288 	unsigned long flags;
3289 
3290 	ENGINE_TRACE(engine, "\n");
3291 
3292 	/*
3293 	 * Before we call engine->cancel_requests(), we should have exclusive
3294 	 * access to the submission state. This is arranged for us by the
3295 	 * caller disabling the interrupt generation, the tasklet and other
3296 	 * threads that may then access the same state, giving us a free hand
3297 	 * to reset state. However, we still need to let lockdep be aware that
3298 	 * we know this state may be accessed in hardirq context, so we
3299 	 * disable the irq around this manipulation and we want to keep
3300 	 * the spinlock focused on its duties and not accidentally conflate
3301 	 * coverage to the submission's irq state. (Similarly, although we
3302 	 * shouldn't need to disable irq around the manipulation of the
3303 	 * submission's irq state, we also wish to remind ourselves that
3304 	 * it is irq state.)
3305 	 */
3306 	spin_lock_irqsave(&engine->active.lock, flags);
3307 
3308 	__execlists_reset(engine, true);
3309 
3310 	/* Mark all executing requests as skipped. */
3311 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3312 		mark_eio(rq);
3313 
3314 	/* Flush the queued requests to the timeline list (for retiring). */
3315 	while ((rb = rb_first_cached(&execlists->queue))) {
3316 		struct i915_priolist *p = to_priolist(rb);
3317 		int i;
3318 
3319 		priolist_for_each_request_consume(rq, rn, p, i) {
3320 			mark_eio(rq);
3321 			__i915_request_submit(rq);
3322 		}
3323 
3324 		rb_erase_cached(&p->node, &execlists->queue);
3325 		i915_priolist_free(p);
3326 	}
3327 
3328 	/* Cancel all attached virtual engines */
3329 	while ((rb = rb_first_cached(&execlists->virtual))) {
3330 		struct virtual_engine *ve =
3331 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3332 
3333 		rb_erase_cached(rb, &execlists->virtual);
3334 		RB_CLEAR_NODE(rb);
3335 
3336 		spin_lock(&ve->base.active.lock);
3337 		rq = fetch_and_zero(&ve->request);
3338 		if (rq) {
3339 			mark_eio(rq);
3340 
3341 			rq->engine = engine;
3342 			__i915_request_submit(rq);
3343 			i915_request_put(rq);
3344 
3345 			ve->base.execlists.queue_priority_hint = INT_MIN;
3346 		}
3347 		spin_unlock(&ve->base.active.lock);
3348 	}
3349 
3350 	/* Remaining _unready_ requests will be nop'ed when submitted */
3351 
3352 	execlists->queue_priority_hint = INT_MIN;
3353 	execlists->queue = RB_ROOT_CACHED;
3354 
3355 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3356 	execlists->tasklet.func = nop_submission_tasklet;
3357 
3358 	spin_unlock_irqrestore(&engine->active.lock, flags);
3359 }
3360 
3361 static void execlists_reset_finish(struct intel_engine_cs *engine)
3362 {
3363 	struct intel_engine_execlists * const execlists = &engine->execlists;
3364 
3365 	/*
3366 	 * After a GPU reset, we may have requests to replay. Do so now while
3367 	 * we still have the forcewake to be sure that the GPU is not allowed
3368 	 * to sleep before we restart and reload a context.
3369 	 */
3370 	GEM_BUG_ON(!reset_in_progress(execlists));
3371 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3372 		execlists->tasklet.func(execlists->tasklet.data);
3373 
3374 	if (__tasklet_enable(&execlists->tasklet))
3375 		/* And kick in case we missed a new request submission. */
3376 		tasklet_hi_schedule(&execlists->tasklet);
3377 	ENGINE_TRACE(engine, "depth->%d\n",
3378 		     atomic_read(&execlists->tasklet.count));
3379 }
3380 
3381 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3382 				    u64 offset, u32 len,
3383 				    const unsigned int flags)
3384 {
3385 	u32 *cs;
3386 
3387 	cs = intel_ring_begin(rq, 4);
3388 	if (IS_ERR(cs))
3389 		return PTR_ERR(cs);
3390 
3391 	/*
3392 	 * WaDisableCtxRestoreArbitration:bdw,chv
3393 	 *
3394 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3395 	 * particular all the gen that do not need the w/a at all!), if we
3396 	 * took care to make sure that on every switch into this context
3397 	 * (both ordinary and for preemption) that arbitrartion was enabled
3398 	 * we would be fine.  However, for gen8 there is another w/a that
3399 	 * requires us to not preempt inside GPGPU execution, so we keep
3400 	 * arbitration disabled for gen8 batches. Arbitration will be
3401 	 * re-enabled before we close the request
3402 	 * (engine->emit_fini_breadcrumb).
3403 	 */
3404 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3405 
3406 	/* FIXME(BDW+): Address space and security selectors. */
3407 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3408 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3409 	*cs++ = lower_32_bits(offset);
3410 	*cs++ = upper_32_bits(offset);
3411 
3412 	intel_ring_advance(rq, cs);
3413 
3414 	return 0;
3415 }
3416 
3417 static int gen8_emit_bb_start(struct i915_request *rq,
3418 			      u64 offset, u32 len,
3419 			      const unsigned int flags)
3420 {
3421 	u32 *cs;
3422 
3423 	cs = intel_ring_begin(rq, 6);
3424 	if (IS_ERR(cs))
3425 		return PTR_ERR(cs);
3426 
3427 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3428 
3429 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3430 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3431 	*cs++ = lower_32_bits(offset);
3432 	*cs++ = upper_32_bits(offset);
3433 
3434 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3435 	*cs++ = MI_NOOP;
3436 
3437 	intel_ring_advance(rq, cs);
3438 
3439 	return 0;
3440 }
3441 
3442 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3443 {
3444 	ENGINE_WRITE(engine, RING_IMR,
3445 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3446 	ENGINE_POSTING_READ(engine, RING_IMR);
3447 }
3448 
3449 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3450 {
3451 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3452 }
3453 
3454 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3455 {
3456 	u32 cmd, *cs;
3457 
3458 	cs = intel_ring_begin(request, 4);
3459 	if (IS_ERR(cs))
3460 		return PTR_ERR(cs);
3461 
3462 	cmd = MI_FLUSH_DW + 1;
3463 
3464 	/* We always require a command barrier so that subsequent
3465 	 * commands, such as breadcrumb interrupts, are strictly ordered
3466 	 * wrt the contents of the write cache being flushed to memory
3467 	 * (and thus being coherent from the CPU).
3468 	 */
3469 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3470 
3471 	if (mode & EMIT_INVALIDATE) {
3472 		cmd |= MI_INVALIDATE_TLB;
3473 		if (request->engine->class == VIDEO_DECODE_CLASS)
3474 			cmd |= MI_INVALIDATE_BSD;
3475 	}
3476 
3477 	*cs++ = cmd;
3478 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3479 	*cs++ = 0; /* upper addr */
3480 	*cs++ = 0; /* value */
3481 	intel_ring_advance(request, cs);
3482 
3483 	return 0;
3484 }
3485 
3486 static int gen8_emit_flush_render(struct i915_request *request,
3487 				  u32 mode)
3488 {
3489 	bool vf_flush_wa = false, dc_flush_wa = false;
3490 	u32 *cs, flags = 0;
3491 	int len;
3492 
3493 	flags |= PIPE_CONTROL_CS_STALL;
3494 
3495 	if (mode & EMIT_FLUSH) {
3496 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3497 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3498 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3499 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3500 	}
3501 
3502 	if (mode & EMIT_INVALIDATE) {
3503 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3504 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3505 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3506 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3507 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3508 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3509 		flags |= PIPE_CONTROL_QW_WRITE;
3510 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3511 
3512 		/*
3513 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3514 		 * pipe control.
3515 		 */
3516 		if (IS_GEN(request->i915, 9))
3517 			vf_flush_wa = true;
3518 
3519 		/* WaForGAMHang:kbl */
3520 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3521 			dc_flush_wa = true;
3522 	}
3523 
3524 	len = 6;
3525 
3526 	if (vf_flush_wa)
3527 		len += 6;
3528 
3529 	if (dc_flush_wa)
3530 		len += 12;
3531 
3532 	cs = intel_ring_begin(request, len);
3533 	if (IS_ERR(cs))
3534 		return PTR_ERR(cs);
3535 
3536 	if (vf_flush_wa)
3537 		cs = gen8_emit_pipe_control(cs, 0, 0);
3538 
3539 	if (dc_flush_wa)
3540 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3541 					    0);
3542 
3543 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3544 
3545 	if (dc_flush_wa)
3546 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3547 
3548 	intel_ring_advance(request, cs);
3549 
3550 	return 0;
3551 }
3552 
3553 static int gen11_emit_flush_render(struct i915_request *request,
3554 				   u32 mode)
3555 {
3556 	if (mode & EMIT_FLUSH) {
3557 		u32 *cs;
3558 		u32 flags = 0;
3559 
3560 		flags |= PIPE_CONTROL_CS_STALL;
3561 
3562 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3563 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3564 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3565 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3566 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3567 		flags |= PIPE_CONTROL_QW_WRITE;
3568 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3569 
3570 		cs = intel_ring_begin(request, 6);
3571 		if (IS_ERR(cs))
3572 			return PTR_ERR(cs);
3573 
3574 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3575 		intel_ring_advance(request, cs);
3576 	}
3577 
3578 	if (mode & EMIT_INVALIDATE) {
3579 		u32 *cs;
3580 		u32 flags = 0;
3581 
3582 		flags |= PIPE_CONTROL_CS_STALL;
3583 
3584 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3585 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3586 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3587 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3588 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3589 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3590 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3591 		flags |= PIPE_CONTROL_QW_WRITE;
3592 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3593 
3594 		cs = intel_ring_begin(request, 6);
3595 		if (IS_ERR(cs))
3596 			return PTR_ERR(cs);
3597 
3598 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3599 		intel_ring_advance(request, cs);
3600 	}
3601 
3602 	return 0;
3603 }
3604 
3605 static u32 preparser_disable(bool state)
3606 {
3607 	return MI_ARB_CHECK | 1 << 8 | state;
3608 }
3609 
3610 static int gen12_emit_flush_render(struct i915_request *request,
3611 				   u32 mode)
3612 {
3613 	if (mode & EMIT_FLUSH) {
3614 		u32 flags = 0;
3615 		u32 *cs;
3616 
3617 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3618 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3619 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3620 		/* Wa_1409600907:tgl */
3621 		flags |= PIPE_CONTROL_DEPTH_STALL;
3622 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3623 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3624 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3625 
3626 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3627 		flags |= PIPE_CONTROL_QW_WRITE;
3628 
3629 		flags |= PIPE_CONTROL_CS_STALL;
3630 
3631 		cs = intel_ring_begin(request, 6);
3632 		if (IS_ERR(cs))
3633 			return PTR_ERR(cs);
3634 
3635 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3636 		intel_ring_advance(request, cs);
3637 	}
3638 
3639 	if (mode & EMIT_INVALIDATE) {
3640 		u32 flags = 0;
3641 		u32 *cs;
3642 
3643 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3644 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3645 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3646 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3647 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3648 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3649 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3650 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3651 
3652 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3653 		flags |= PIPE_CONTROL_QW_WRITE;
3654 
3655 		flags |= PIPE_CONTROL_CS_STALL;
3656 
3657 		cs = intel_ring_begin(request, 8);
3658 		if (IS_ERR(cs))
3659 			return PTR_ERR(cs);
3660 
3661 		/*
3662 		 * Prevent the pre-parser from skipping past the TLB
3663 		 * invalidate and loading a stale page for the batch
3664 		 * buffer / request payload.
3665 		 */
3666 		*cs++ = preparser_disable(true);
3667 
3668 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3669 
3670 		*cs++ = preparser_disable(false);
3671 		intel_ring_advance(request, cs);
3672 
3673 		/*
3674 		 * Wa_1604544889:tgl
3675 		 */
3676 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3677 			flags = 0;
3678 			flags |= PIPE_CONTROL_CS_STALL;
3679 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3680 
3681 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3682 			flags |= PIPE_CONTROL_QW_WRITE;
3683 
3684 			cs = intel_ring_begin(request, 6);
3685 			if (IS_ERR(cs))
3686 				return PTR_ERR(cs);
3687 
3688 			cs = gen8_emit_pipe_control(cs, flags,
3689 						    LRC_PPHWSP_SCRATCH_ADDR);
3690 			intel_ring_advance(request, cs);
3691 		}
3692 	}
3693 
3694 	return 0;
3695 }
3696 
3697 /*
3698  * Reserve space for 2 NOOPs at the end of each request to be
3699  * used as a workaround for not being allowed to do lite
3700  * restore with HEAD==TAIL (WaIdleLiteRestore).
3701  */
3702 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3703 {
3704 	/* Ensure there's always at least one preemption point per-request. */
3705 	*cs++ = MI_ARB_CHECK;
3706 	*cs++ = MI_NOOP;
3707 	request->wa_tail = intel_ring_offset(request, cs);
3708 
3709 	return cs;
3710 }
3711 
3712 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3713 {
3714 	*cs++ = MI_SEMAPHORE_WAIT |
3715 		MI_SEMAPHORE_GLOBAL_GTT |
3716 		MI_SEMAPHORE_POLL |
3717 		MI_SEMAPHORE_SAD_EQ_SDD;
3718 	*cs++ = 0;
3719 	*cs++ = intel_hws_preempt_address(request->engine);
3720 	*cs++ = 0;
3721 
3722 	return cs;
3723 }
3724 
3725 static __always_inline u32*
3726 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3727 				 u32 *cs)
3728 {
3729 	*cs++ = MI_USER_INTERRUPT;
3730 
3731 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3732 	if (intel_engine_has_semaphores(request->engine))
3733 		cs = emit_preempt_busywait(request, cs);
3734 
3735 	request->tail = intel_ring_offset(request, cs);
3736 	assert_ring_tail_valid(request->ring, request->tail);
3737 
3738 	return gen8_emit_wa_tail(request, cs);
3739 }
3740 
3741 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3742 {
3743 	cs = gen8_emit_ggtt_write(cs,
3744 				  request->fence.seqno,
3745 				  i915_request_active_timeline(request)->hwsp_offset,
3746 				  0);
3747 
3748 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3749 }
3750 
3751 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3752 {
3753 	cs = gen8_emit_pipe_control(cs,
3754 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3755 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3756 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
3757 				    0);
3758 
3759 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3760 	cs = gen8_emit_ggtt_write_rcs(cs,
3761 				      request->fence.seqno,
3762 				      i915_request_active_timeline(request)->hwsp_offset,
3763 				      PIPE_CONTROL_FLUSH_ENABLE |
3764 				      PIPE_CONTROL_CS_STALL);
3765 
3766 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3767 }
3768 
3769 static u32 *
3770 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3771 {
3772 	cs = gen8_emit_ggtt_write_rcs(cs,
3773 				      request->fence.seqno,
3774 				      i915_request_active_timeline(request)->hwsp_offset,
3775 				      PIPE_CONTROL_CS_STALL |
3776 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3777 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3778 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3779 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3780 				      PIPE_CONTROL_FLUSH_ENABLE);
3781 
3782 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3783 }
3784 
3785 /*
3786  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3787  * flush and will continue pre-fetching the instructions after it before the
3788  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3789  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
3790  * of the next request before the memory has been flushed, we're guaranteed that
3791  * we won't access the batch itself too early.
3792  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3793  * so, if the current request is modifying an instruction in the next request on
3794  * the same intel_context, we might pre-fetch and then execute the pre-update
3795  * instruction. To avoid this, the users of self-modifying code should either
3796  * disable the parser around the code emitting the memory writes, via a new flag
3797  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3798  * the in-kernel use-cases we've opted to use a separate context, see
3799  * reloc_gpu() as an example.
3800  * All the above applies only to the instructions themselves. Non-inline data
3801  * used by the instructions is not pre-fetched.
3802  */
3803 
3804 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3805 {
3806 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3807 		MI_SEMAPHORE_GLOBAL_GTT |
3808 		MI_SEMAPHORE_POLL |
3809 		MI_SEMAPHORE_SAD_EQ_SDD;
3810 	*cs++ = 0;
3811 	*cs++ = intel_hws_preempt_address(request->engine);
3812 	*cs++ = 0;
3813 	*cs++ = 0;
3814 	*cs++ = MI_NOOP;
3815 
3816 	return cs;
3817 }
3818 
3819 static __always_inline u32*
3820 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3821 {
3822 	*cs++ = MI_USER_INTERRUPT;
3823 
3824 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3825 	if (intel_engine_has_semaphores(request->engine))
3826 		cs = gen12_emit_preempt_busywait(request, cs);
3827 
3828 	request->tail = intel_ring_offset(request, cs);
3829 	assert_ring_tail_valid(request->ring, request->tail);
3830 
3831 	return gen8_emit_wa_tail(request, cs);
3832 }
3833 
3834 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3835 {
3836 	cs = gen8_emit_ggtt_write(cs,
3837 				  request->fence.seqno,
3838 				  i915_request_active_timeline(request)->hwsp_offset,
3839 				  0);
3840 
3841 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3842 }
3843 
3844 static u32 *
3845 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3846 {
3847 	cs = gen8_emit_ggtt_write_rcs(cs,
3848 				      request->fence.seqno,
3849 				      i915_request_active_timeline(request)->hwsp_offset,
3850 				      PIPE_CONTROL_CS_STALL |
3851 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3852 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3853 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3854 				      /* Wa_1409600907:tgl */
3855 				      PIPE_CONTROL_DEPTH_STALL |
3856 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3857 				      PIPE_CONTROL_FLUSH_ENABLE |
3858 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3859 
3860 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3861 }
3862 
3863 static void execlists_park(struct intel_engine_cs *engine)
3864 {
3865 	cancel_timer(&engine->execlists.timer);
3866 	cancel_timer(&engine->execlists.preempt);
3867 }
3868 
3869 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3870 {
3871 	engine->submit_request = execlists_submit_request;
3872 	engine->schedule = i915_schedule;
3873 	engine->execlists.tasklet.func = execlists_submission_tasklet;
3874 
3875 	engine->reset.prepare = execlists_reset_prepare;
3876 	engine->reset.rewind = execlists_reset_rewind;
3877 	engine->reset.cancel = execlists_reset_cancel;
3878 	engine->reset.finish = execlists_reset_finish;
3879 
3880 	engine->park = execlists_park;
3881 	engine->unpark = NULL;
3882 
3883 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3884 	if (!intel_vgpu_active(engine->i915)) {
3885 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3886 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3887 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3888 	}
3889 
3890 	if (INTEL_GEN(engine->i915) >= 12)
3891 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3892 
3893 	if (intel_engine_has_preemption(engine))
3894 		engine->emit_bb_start = gen8_emit_bb_start;
3895 	else
3896 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
3897 }
3898 
3899 static void execlists_shutdown(struct intel_engine_cs *engine)
3900 {
3901 	/* Synchronise with residual timers and any softirq they raise */
3902 	del_timer_sync(&engine->execlists.timer);
3903 	del_timer_sync(&engine->execlists.preempt);
3904 	tasklet_kill(&engine->execlists.tasklet);
3905 }
3906 
3907 static void execlists_release(struct intel_engine_cs *engine)
3908 {
3909 	execlists_shutdown(engine);
3910 
3911 	intel_engine_cleanup_common(engine);
3912 	lrc_destroy_wa_ctx(engine);
3913 }
3914 
3915 static void
3916 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3917 {
3918 	/* Default vfuncs which can be overriden by each engine. */
3919 
3920 	engine->resume = execlists_resume;
3921 
3922 	engine->cops = &execlists_context_ops;
3923 	engine->request_alloc = execlists_request_alloc;
3924 
3925 	engine->emit_flush = gen8_emit_flush;
3926 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3927 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3928 	if (INTEL_GEN(engine->i915) >= 12)
3929 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3930 
3931 	engine->set_default_submission = intel_execlists_set_default_submission;
3932 
3933 	if (INTEL_GEN(engine->i915) < 11) {
3934 		engine->irq_enable = gen8_logical_ring_enable_irq;
3935 		engine->irq_disable = gen8_logical_ring_disable_irq;
3936 	} else {
3937 		/*
3938 		 * TODO: On Gen11 interrupt masks need to be clear
3939 		 * to allow C6 entry. Keep interrupts enabled at
3940 		 * and take the hit of generating extra interrupts
3941 		 * until a more refined solution exists.
3942 		 */
3943 	}
3944 }
3945 
3946 static inline void
3947 logical_ring_default_irqs(struct intel_engine_cs *engine)
3948 {
3949 	unsigned int shift = 0;
3950 
3951 	if (INTEL_GEN(engine->i915) < 11) {
3952 		const u8 irq_shifts[] = {
3953 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
3954 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
3955 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3956 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3957 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
3958 		};
3959 
3960 		shift = irq_shifts[engine->id];
3961 	}
3962 
3963 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3964 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3965 }
3966 
3967 static void rcs_submission_override(struct intel_engine_cs *engine)
3968 {
3969 	switch (INTEL_GEN(engine->i915)) {
3970 	case 12:
3971 		engine->emit_flush = gen12_emit_flush_render;
3972 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3973 		break;
3974 	case 11:
3975 		engine->emit_flush = gen11_emit_flush_render;
3976 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3977 		break;
3978 	default:
3979 		engine->emit_flush = gen8_emit_flush_render;
3980 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3981 		break;
3982 	}
3983 }
3984 
3985 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3986 {
3987 	struct intel_engine_execlists * const execlists = &engine->execlists;
3988 	struct drm_i915_private *i915 = engine->i915;
3989 	struct intel_uncore *uncore = engine->uncore;
3990 	u32 base = engine->mmio_base;
3991 
3992 	tasklet_init(&engine->execlists.tasklet,
3993 		     execlists_submission_tasklet, (unsigned long)engine);
3994 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3995 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3996 
3997 	logical_ring_default_vfuncs(engine);
3998 	logical_ring_default_irqs(engine);
3999 
4000 	if (engine->class == RENDER_CLASS)
4001 		rcs_submission_override(engine);
4002 
4003 	if (intel_init_workaround_bb(engine))
4004 		/*
4005 		 * We continue even if we fail to initialize WA batch
4006 		 * because we only expect rare glitches but nothing
4007 		 * critical to prevent us from using GPU
4008 		 */
4009 		DRM_ERROR("WA batch buffer initialization failed\n");
4010 
4011 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4012 		execlists->submit_reg = uncore->regs +
4013 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4014 		execlists->ctrl_reg = uncore->regs +
4015 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4016 	} else {
4017 		execlists->submit_reg = uncore->regs +
4018 			i915_mmio_reg_offset(RING_ELSP(base));
4019 	}
4020 
4021 	execlists->csb_status =
4022 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4023 
4024 	execlists->csb_write =
4025 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4026 
4027 	if (INTEL_GEN(i915) < 11)
4028 		execlists->csb_size = GEN8_CSB_ENTRIES;
4029 	else
4030 		execlists->csb_size = GEN11_CSB_ENTRIES;
4031 
4032 	reset_csb_pointers(engine);
4033 
4034 	/* Finally, take ownership and responsibility for cleanup! */
4035 	engine->release = execlists_release;
4036 
4037 	return 0;
4038 }
4039 
4040 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4041 {
4042 	u32 indirect_ctx_offset;
4043 
4044 	switch (INTEL_GEN(engine->i915)) {
4045 	default:
4046 		MISSING_CASE(INTEL_GEN(engine->i915));
4047 		/* fall through */
4048 	case 12:
4049 		indirect_ctx_offset =
4050 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4051 		break;
4052 	case 11:
4053 		indirect_ctx_offset =
4054 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4055 		break;
4056 	case 10:
4057 		indirect_ctx_offset =
4058 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4059 		break;
4060 	case 9:
4061 		indirect_ctx_offset =
4062 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4063 		break;
4064 	case 8:
4065 		indirect_ctx_offset =
4066 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4067 		break;
4068 	}
4069 
4070 	return indirect_ctx_offset;
4071 }
4072 
4073 
4074 static void init_common_reg_state(u32 * const regs,
4075 				  const struct intel_engine_cs *engine,
4076 				  const struct intel_ring *ring,
4077 				  bool inhibit)
4078 {
4079 	u32 ctl;
4080 
4081 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4082 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4083 	if (inhibit)
4084 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4085 	if (INTEL_GEN(engine->i915) < 11)
4086 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4087 					   CTX_CTRL_RS_CTX_ENABLE);
4088 	regs[CTX_CONTEXT_CONTROL] = ctl;
4089 
4090 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4091 }
4092 
4093 static void init_wa_bb_reg_state(u32 * const regs,
4094 				 const struct intel_engine_cs *engine,
4095 				 u32 pos_bb_per_ctx)
4096 {
4097 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4098 
4099 	if (wa_ctx->per_ctx.size) {
4100 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4101 
4102 		regs[pos_bb_per_ctx] =
4103 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4104 	}
4105 
4106 	if (wa_ctx->indirect_ctx.size) {
4107 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4108 
4109 		regs[pos_bb_per_ctx + 2] =
4110 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4111 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4112 
4113 		regs[pos_bb_per_ctx + 4] =
4114 			intel_lr_indirect_ctx_offset(engine) << 6;
4115 	}
4116 }
4117 
4118 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4119 {
4120 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4121 		/* 64b PPGTT (48bit canonical)
4122 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4123 		 * other PDP Descriptors are ignored.
4124 		 */
4125 		ASSIGN_CTX_PML4(ppgtt, regs);
4126 	} else {
4127 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4128 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4129 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4130 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4131 	}
4132 }
4133 
4134 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4135 {
4136 	if (i915_is_ggtt(vm))
4137 		return i915_vm_to_ggtt(vm)->alias;
4138 	else
4139 		return i915_vm_to_ppgtt(vm);
4140 }
4141 
4142 static void execlists_init_reg_state(u32 *regs,
4143 				     const struct intel_context *ce,
4144 				     const struct intel_engine_cs *engine,
4145 				     const struct intel_ring *ring,
4146 				     bool inhibit)
4147 {
4148 	/*
4149 	 * A context is actually a big batch buffer with several
4150 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4151 	 * values we are setting here are only for the first context restore:
4152 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4153 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4154 	 * we are not initializing here).
4155 	 *
4156 	 * Must keep consistent with virtual_update_register_offsets().
4157 	 */
4158 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4159 
4160 	init_common_reg_state(regs, engine, ring, inhibit);
4161 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4162 
4163 	init_wa_bb_reg_state(regs, engine,
4164 			     INTEL_GEN(engine->i915) >= 12 ?
4165 			     GEN12_CTX_BB_PER_CTX_PTR :
4166 			     CTX_BB_PER_CTX_PTR);
4167 
4168 	__reset_stop_ring(regs, engine);
4169 }
4170 
4171 static int
4172 populate_lr_context(struct intel_context *ce,
4173 		    struct drm_i915_gem_object *ctx_obj,
4174 		    struct intel_engine_cs *engine,
4175 		    struct intel_ring *ring)
4176 {
4177 	bool inhibit = true;
4178 	void *vaddr;
4179 	int ret;
4180 
4181 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4182 	if (IS_ERR(vaddr)) {
4183 		ret = PTR_ERR(vaddr);
4184 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4185 		return ret;
4186 	}
4187 
4188 	set_redzone(vaddr, engine);
4189 
4190 	if (engine->default_state) {
4191 		void *defaults;
4192 
4193 		defaults = i915_gem_object_pin_map(engine->default_state,
4194 						   I915_MAP_WB);
4195 		if (IS_ERR(defaults)) {
4196 			ret = PTR_ERR(defaults);
4197 			goto err_unpin_ctx;
4198 		}
4199 
4200 		memcpy(vaddr, defaults, engine->context_size);
4201 		i915_gem_object_unpin_map(engine->default_state);
4202 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4203 		inhibit = false;
4204 	}
4205 
4206 	/* The second page of the context object contains some fields which must
4207 	 * be set up prior to the first execution. */
4208 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4209 				 ce, engine, ring, inhibit);
4210 
4211 	ret = 0;
4212 err_unpin_ctx:
4213 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4214 	i915_gem_object_unpin_map(ctx_obj);
4215 	return ret;
4216 }
4217 
4218 static int __execlists_context_alloc(struct intel_context *ce,
4219 				     struct intel_engine_cs *engine)
4220 {
4221 	struct drm_i915_gem_object *ctx_obj;
4222 	struct intel_ring *ring;
4223 	struct i915_vma *vma;
4224 	u32 context_size;
4225 	int ret;
4226 
4227 	GEM_BUG_ON(ce->state);
4228 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4229 
4230 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4231 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4232 
4233 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4234 	if (IS_ERR(ctx_obj))
4235 		return PTR_ERR(ctx_obj);
4236 
4237 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4238 	if (IS_ERR(vma)) {
4239 		ret = PTR_ERR(vma);
4240 		goto error_deref_obj;
4241 	}
4242 
4243 	if (!ce->timeline) {
4244 		struct intel_timeline *tl;
4245 
4246 		tl = intel_timeline_create(engine->gt, NULL);
4247 		if (IS_ERR(tl)) {
4248 			ret = PTR_ERR(tl);
4249 			goto error_deref_obj;
4250 		}
4251 
4252 		ce->timeline = tl;
4253 	}
4254 
4255 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4256 	if (IS_ERR(ring)) {
4257 		ret = PTR_ERR(ring);
4258 		goto error_deref_obj;
4259 	}
4260 
4261 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4262 	if (ret) {
4263 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4264 		goto error_ring_free;
4265 	}
4266 
4267 	ce->ring = ring;
4268 	ce->state = vma;
4269 
4270 	return 0;
4271 
4272 error_ring_free:
4273 	intel_ring_put(ring);
4274 error_deref_obj:
4275 	i915_gem_object_put(ctx_obj);
4276 	return ret;
4277 }
4278 
4279 static struct list_head *virtual_queue(struct virtual_engine *ve)
4280 {
4281 	return &ve->base.execlists.default_priolist.requests[0];
4282 }
4283 
4284 static void virtual_context_destroy(struct kref *kref)
4285 {
4286 	struct virtual_engine *ve =
4287 		container_of(kref, typeof(*ve), context.ref);
4288 	unsigned int n;
4289 
4290 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4291 	GEM_BUG_ON(ve->request);
4292 	GEM_BUG_ON(ve->context.inflight);
4293 
4294 	for (n = 0; n < ve->num_siblings; n++) {
4295 		struct intel_engine_cs *sibling = ve->siblings[n];
4296 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4297 		unsigned long flags;
4298 
4299 		if (RB_EMPTY_NODE(node))
4300 			continue;
4301 
4302 		spin_lock_irqsave(&sibling->active.lock, flags);
4303 
4304 		/* Detachment is lazily performed in the execlists tasklet */
4305 		if (!RB_EMPTY_NODE(node))
4306 			rb_erase_cached(node, &sibling->execlists.virtual);
4307 
4308 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4309 	}
4310 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4311 
4312 	if (ve->context.state)
4313 		__execlists_context_fini(&ve->context);
4314 	intel_context_fini(&ve->context);
4315 
4316 	kfree(ve->bonds);
4317 	kfree(ve);
4318 }
4319 
4320 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4321 {
4322 	int swp;
4323 
4324 	/*
4325 	 * Pick a random sibling on starting to help spread the load around.
4326 	 *
4327 	 * New contexts are typically created with exactly the same order
4328 	 * of siblings, and often started in batches. Due to the way we iterate
4329 	 * the array of sibling when submitting requests, sibling[0] is
4330 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4331 	 * randomised across the system, we also help spread the load by the
4332 	 * first engine we inspect being different each time.
4333 	 *
4334 	 * NB This does not force us to execute on this engine, it will just
4335 	 * typically be the first we inspect for submission.
4336 	 */
4337 	swp = prandom_u32_max(ve->num_siblings);
4338 	if (!swp)
4339 		return;
4340 
4341 	swap(ve->siblings[swp], ve->siblings[0]);
4342 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4343 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4344 						ve->siblings[0]);
4345 }
4346 
4347 static int virtual_context_alloc(struct intel_context *ce)
4348 {
4349 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4350 
4351 	return __execlists_context_alloc(ce, ve->siblings[0]);
4352 }
4353 
4354 static int virtual_context_pin(struct intel_context *ce)
4355 {
4356 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4357 	int err;
4358 
4359 	/* Note: we must use a real engine class for setting up reg state */
4360 	err = __execlists_context_pin(ce, ve->siblings[0]);
4361 	if (err)
4362 		return err;
4363 
4364 	virtual_engine_initial_hint(ve);
4365 	return 0;
4366 }
4367 
4368 static void virtual_context_enter(struct intel_context *ce)
4369 {
4370 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4371 	unsigned int n;
4372 
4373 	for (n = 0; n < ve->num_siblings; n++)
4374 		intel_engine_pm_get(ve->siblings[n]);
4375 
4376 	intel_timeline_enter(ce->timeline);
4377 }
4378 
4379 static void virtual_context_exit(struct intel_context *ce)
4380 {
4381 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4382 	unsigned int n;
4383 
4384 	intel_timeline_exit(ce->timeline);
4385 
4386 	for (n = 0; n < ve->num_siblings; n++)
4387 		intel_engine_pm_put(ve->siblings[n]);
4388 }
4389 
4390 static const struct intel_context_ops virtual_context_ops = {
4391 	.alloc = virtual_context_alloc,
4392 
4393 	.pin = virtual_context_pin,
4394 	.unpin = execlists_context_unpin,
4395 
4396 	.enter = virtual_context_enter,
4397 	.exit = virtual_context_exit,
4398 
4399 	.destroy = virtual_context_destroy,
4400 };
4401 
4402 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4403 {
4404 	struct i915_request *rq;
4405 	intel_engine_mask_t mask;
4406 
4407 	rq = READ_ONCE(ve->request);
4408 	if (!rq)
4409 		return 0;
4410 
4411 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4412 	mask = rq->execution_mask;
4413 	if (unlikely(!mask)) {
4414 		/* Invalid selection, submit to a random engine in error */
4415 		i915_request_skip(rq, -ENODEV);
4416 		mask = ve->siblings[0]->mask;
4417 	}
4418 
4419 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4420 		     rq->fence.context, rq->fence.seqno,
4421 		     mask, ve->base.execlists.queue_priority_hint);
4422 
4423 	return mask;
4424 }
4425 
4426 static void virtual_submission_tasklet(unsigned long data)
4427 {
4428 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4429 	const int prio = ve->base.execlists.queue_priority_hint;
4430 	intel_engine_mask_t mask;
4431 	unsigned int n;
4432 
4433 	rcu_read_lock();
4434 	mask = virtual_submission_mask(ve);
4435 	rcu_read_unlock();
4436 	if (unlikely(!mask))
4437 		return;
4438 
4439 	local_irq_disable();
4440 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4441 		struct intel_engine_cs *sibling = ve->siblings[n];
4442 		struct ve_node * const node = &ve->nodes[sibling->id];
4443 		struct rb_node **parent, *rb;
4444 		bool first;
4445 
4446 		if (unlikely(!(mask & sibling->mask))) {
4447 			if (!RB_EMPTY_NODE(&node->rb)) {
4448 				spin_lock(&sibling->active.lock);
4449 				rb_erase_cached(&node->rb,
4450 						&sibling->execlists.virtual);
4451 				RB_CLEAR_NODE(&node->rb);
4452 				spin_unlock(&sibling->active.lock);
4453 			}
4454 			continue;
4455 		}
4456 
4457 		spin_lock(&sibling->active.lock);
4458 
4459 		if (!RB_EMPTY_NODE(&node->rb)) {
4460 			/*
4461 			 * Cheat and avoid rebalancing the tree if we can
4462 			 * reuse this node in situ.
4463 			 */
4464 			first = rb_first_cached(&sibling->execlists.virtual) ==
4465 				&node->rb;
4466 			if (prio == node->prio || (prio > node->prio && first))
4467 				goto submit_engine;
4468 
4469 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4470 		}
4471 
4472 		rb = NULL;
4473 		first = true;
4474 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4475 		while (*parent) {
4476 			struct ve_node *other;
4477 
4478 			rb = *parent;
4479 			other = rb_entry(rb, typeof(*other), rb);
4480 			if (prio > other->prio) {
4481 				parent = &rb->rb_left;
4482 			} else {
4483 				parent = &rb->rb_right;
4484 				first = false;
4485 			}
4486 		}
4487 
4488 		rb_link_node(&node->rb, rb, parent);
4489 		rb_insert_color_cached(&node->rb,
4490 				       &sibling->execlists.virtual,
4491 				       first);
4492 
4493 submit_engine:
4494 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4495 		node->prio = prio;
4496 		if (first && prio > sibling->execlists.queue_priority_hint) {
4497 			sibling->execlists.queue_priority_hint = prio;
4498 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4499 		}
4500 
4501 		spin_unlock(&sibling->active.lock);
4502 	}
4503 	local_irq_enable();
4504 }
4505 
4506 static void virtual_submit_request(struct i915_request *rq)
4507 {
4508 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4509 	struct i915_request *old;
4510 	unsigned long flags;
4511 
4512 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4513 		     rq->fence.context,
4514 		     rq->fence.seqno);
4515 
4516 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4517 
4518 	spin_lock_irqsave(&ve->base.active.lock, flags);
4519 
4520 	old = ve->request;
4521 	if (old) { /* background completion event from preempt-to-busy */
4522 		GEM_BUG_ON(!i915_request_completed(old));
4523 		__i915_request_submit(old);
4524 		i915_request_put(old);
4525 	}
4526 
4527 	if (i915_request_completed(rq)) {
4528 		__i915_request_submit(rq);
4529 
4530 		ve->base.execlists.queue_priority_hint = INT_MIN;
4531 		ve->request = NULL;
4532 	} else {
4533 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4534 		ve->request = i915_request_get(rq);
4535 
4536 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4537 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4538 
4539 		tasklet_schedule(&ve->base.execlists.tasklet);
4540 	}
4541 
4542 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4543 }
4544 
4545 static struct ve_bond *
4546 virtual_find_bond(struct virtual_engine *ve,
4547 		  const struct intel_engine_cs *master)
4548 {
4549 	int i;
4550 
4551 	for (i = 0; i < ve->num_bonds; i++) {
4552 		if (ve->bonds[i].master == master)
4553 			return &ve->bonds[i];
4554 	}
4555 
4556 	return NULL;
4557 }
4558 
4559 static void
4560 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4561 {
4562 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4563 	intel_engine_mask_t allowed, exec;
4564 	struct ve_bond *bond;
4565 
4566 	allowed = ~to_request(signal)->engine->mask;
4567 
4568 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4569 	if (bond)
4570 		allowed &= bond->sibling_mask;
4571 
4572 	/* Restrict the bonded request to run on only the available engines */
4573 	exec = READ_ONCE(rq->execution_mask);
4574 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4575 		;
4576 
4577 	/* Prevent the master from being re-run on the bonded engines */
4578 	to_request(signal)->execution_mask &= ~allowed;
4579 }
4580 
4581 struct intel_context *
4582 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4583 			       unsigned int count)
4584 {
4585 	struct virtual_engine *ve;
4586 	unsigned int n;
4587 	int err;
4588 
4589 	if (count == 0)
4590 		return ERR_PTR(-EINVAL);
4591 
4592 	if (count == 1)
4593 		return intel_context_create(siblings[0]);
4594 
4595 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4596 	if (!ve)
4597 		return ERR_PTR(-ENOMEM);
4598 
4599 	ve->base.i915 = siblings[0]->i915;
4600 	ve->base.gt = siblings[0]->gt;
4601 	ve->base.uncore = siblings[0]->uncore;
4602 	ve->base.id = -1;
4603 
4604 	ve->base.class = OTHER_CLASS;
4605 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4606 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4607 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4608 
4609 	/*
4610 	 * The decision on whether to submit a request using semaphores
4611 	 * depends on the saturated state of the engine. We only compute
4612 	 * this during HW submission of the request, and we need for this
4613 	 * state to be globally applied to all requests being submitted
4614 	 * to this engine. Virtual engines encompass more than one physical
4615 	 * engine and so we cannot accurately tell in advance if one of those
4616 	 * engines is already saturated and so cannot afford to use a semaphore
4617 	 * and be pessimized in priority for doing so -- if we are the only
4618 	 * context using semaphores after all other clients have stopped, we
4619 	 * will be starved on the saturated system. Such a global switch for
4620 	 * semaphores is less than ideal, but alas is the current compromise.
4621 	 */
4622 	ve->base.saturated = ALL_ENGINES;
4623 
4624 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4625 
4626 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4627 	intel_engine_init_breadcrumbs(&ve->base);
4628 	intel_engine_init_execlists(&ve->base);
4629 
4630 	ve->base.cops = &virtual_context_ops;
4631 	ve->base.request_alloc = execlists_request_alloc;
4632 
4633 	ve->base.schedule = i915_schedule;
4634 	ve->base.submit_request = virtual_submit_request;
4635 	ve->base.bond_execute = virtual_bond_execute;
4636 
4637 	INIT_LIST_HEAD(virtual_queue(ve));
4638 	ve->base.execlists.queue_priority_hint = INT_MIN;
4639 	tasklet_init(&ve->base.execlists.tasklet,
4640 		     virtual_submission_tasklet,
4641 		     (unsigned long)ve);
4642 
4643 	intel_context_init(&ve->context, &ve->base);
4644 
4645 	for (n = 0; n < count; n++) {
4646 		struct intel_engine_cs *sibling = siblings[n];
4647 
4648 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4649 		if (sibling->mask & ve->base.mask) {
4650 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4651 				  sibling->name);
4652 			err = -EINVAL;
4653 			goto err_put;
4654 		}
4655 
4656 		/*
4657 		 * The virtual engine implementation is tightly coupled to
4658 		 * the execlists backend -- we push out request directly
4659 		 * into a tree inside each physical engine. We could support
4660 		 * layering if we handle cloning of the requests and
4661 		 * submitting a copy into each backend.
4662 		 */
4663 		if (sibling->execlists.tasklet.func !=
4664 		    execlists_submission_tasklet) {
4665 			err = -ENODEV;
4666 			goto err_put;
4667 		}
4668 
4669 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4670 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4671 
4672 		ve->siblings[ve->num_siblings++] = sibling;
4673 		ve->base.mask |= sibling->mask;
4674 
4675 		/*
4676 		 * All physical engines must be compatible for their emission
4677 		 * functions (as we build the instructions during request
4678 		 * construction and do not alter them before submission
4679 		 * on the physical engine). We use the engine class as a guide
4680 		 * here, although that could be refined.
4681 		 */
4682 		if (ve->base.class != OTHER_CLASS) {
4683 			if (ve->base.class != sibling->class) {
4684 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4685 					  sibling->class, ve->base.class);
4686 				err = -EINVAL;
4687 				goto err_put;
4688 			}
4689 			continue;
4690 		}
4691 
4692 		ve->base.class = sibling->class;
4693 		ve->base.uabi_class = sibling->uabi_class;
4694 		snprintf(ve->base.name, sizeof(ve->base.name),
4695 			 "v%dx%d", ve->base.class, count);
4696 		ve->base.context_size = sibling->context_size;
4697 
4698 		ve->base.emit_bb_start = sibling->emit_bb_start;
4699 		ve->base.emit_flush = sibling->emit_flush;
4700 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4701 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4702 		ve->base.emit_fini_breadcrumb_dw =
4703 			sibling->emit_fini_breadcrumb_dw;
4704 
4705 		ve->base.flags = sibling->flags;
4706 	}
4707 
4708 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4709 
4710 	return &ve->context;
4711 
4712 err_put:
4713 	intel_context_put(&ve->context);
4714 	return ERR_PTR(err);
4715 }
4716 
4717 struct intel_context *
4718 intel_execlists_clone_virtual(struct intel_engine_cs *src)
4719 {
4720 	struct virtual_engine *se = to_virtual_engine(src);
4721 	struct intel_context *dst;
4722 
4723 	dst = intel_execlists_create_virtual(se->siblings,
4724 					     se->num_siblings);
4725 	if (IS_ERR(dst))
4726 		return dst;
4727 
4728 	if (se->num_bonds) {
4729 		struct virtual_engine *de = to_virtual_engine(dst->engine);
4730 
4731 		de->bonds = kmemdup(se->bonds,
4732 				    sizeof(*se->bonds) * se->num_bonds,
4733 				    GFP_KERNEL);
4734 		if (!de->bonds) {
4735 			intel_context_put(dst);
4736 			return ERR_PTR(-ENOMEM);
4737 		}
4738 
4739 		de->num_bonds = se->num_bonds;
4740 	}
4741 
4742 	return dst;
4743 }
4744 
4745 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4746 				     const struct intel_engine_cs *master,
4747 				     const struct intel_engine_cs *sibling)
4748 {
4749 	struct virtual_engine *ve = to_virtual_engine(engine);
4750 	struct ve_bond *bond;
4751 	int n;
4752 
4753 	/* Sanity check the sibling is part of the virtual engine */
4754 	for (n = 0; n < ve->num_siblings; n++)
4755 		if (sibling == ve->siblings[n])
4756 			break;
4757 	if (n == ve->num_siblings)
4758 		return -EINVAL;
4759 
4760 	bond = virtual_find_bond(ve, master);
4761 	if (bond) {
4762 		bond->sibling_mask |= sibling->mask;
4763 		return 0;
4764 	}
4765 
4766 	bond = krealloc(ve->bonds,
4767 			sizeof(*bond) * (ve->num_bonds + 1),
4768 			GFP_KERNEL);
4769 	if (!bond)
4770 		return -ENOMEM;
4771 
4772 	bond[ve->num_bonds].master = master;
4773 	bond[ve->num_bonds].sibling_mask = sibling->mask;
4774 
4775 	ve->bonds = bond;
4776 	ve->num_bonds++;
4777 
4778 	return 0;
4779 }
4780 
4781 struct intel_engine_cs *
4782 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4783 				 unsigned int sibling)
4784 {
4785 	struct virtual_engine *ve = to_virtual_engine(engine);
4786 
4787 	if (sibling >= ve->num_siblings)
4788 		return NULL;
4789 
4790 	return ve->siblings[sibling];
4791 }
4792 
4793 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4794 				   struct drm_printer *m,
4795 				   void (*show_request)(struct drm_printer *m,
4796 							struct i915_request *rq,
4797 							const char *prefix),
4798 				   unsigned int max)
4799 {
4800 	const struct intel_engine_execlists *execlists = &engine->execlists;
4801 	struct i915_request *rq, *last;
4802 	unsigned long flags;
4803 	unsigned int count;
4804 	struct rb_node *rb;
4805 
4806 	spin_lock_irqsave(&engine->active.lock, flags);
4807 
4808 	last = NULL;
4809 	count = 0;
4810 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
4811 		if (count++ < max - 1)
4812 			show_request(m, rq, "\t\tE ");
4813 		else
4814 			last = rq;
4815 	}
4816 	if (last) {
4817 		if (count > max) {
4818 			drm_printf(m,
4819 				   "\t\t...skipping %d executing requests...\n",
4820 				   count - max);
4821 		}
4822 		show_request(m, last, "\t\tE ");
4823 	}
4824 
4825 	last = NULL;
4826 	count = 0;
4827 	if (execlists->queue_priority_hint != INT_MIN)
4828 		drm_printf(m, "\t\tQueue priority hint: %d\n",
4829 			   execlists->queue_priority_hint);
4830 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4831 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4832 		int i;
4833 
4834 		priolist_for_each_request(rq, p, i) {
4835 			if (count++ < max - 1)
4836 				show_request(m, rq, "\t\tQ ");
4837 			else
4838 				last = rq;
4839 		}
4840 	}
4841 	if (last) {
4842 		if (count > max) {
4843 			drm_printf(m,
4844 				   "\t\t...skipping %d queued requests...\n",
4845 				   count - max);
4846 		}
4847 		show_request(m, last, "\t\tQ ");
4848 	}
4849 
4850 	last = NULL;
4851 	count = 0;
4852 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4853 		struct virtual_engine *ve =
4854 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4855 		struct i915_request *rq = READ_ONCE(ve->request);
4856 
4857 		if (rq) {
4858 			if (count++ < max - 1)
4859 				show_request(m, rq, "\t\tV ");
4860 			else
4861 				last = rq;
4862 		}
4863 	}
4864 	if (last) {
4865 		if (count > max) {
4866 			drm_printf(m,
4867 				   "\t\t...skipping %d virtual requests...\n",
4868 				   count - max);
4869 		}
4870 		show_request(m, last, "\t\tV ");
4871 	}
4872 
4873 	spin_unlock_irqrestore(&engine->active.lock, flags);
4874 }
4875 
4876 void intel_lr_context_reset(struct intel_engine_cs *engine,
4877 			    struct intel_context *ce,
4878 			    u32 head,
4879 			    bool scrub)
4880 {
4881 	GEM_BUG_ON(!intel_context_is_pinned(ce));
4882 
4883 	/*
4884 	 * We want a simple context + ring to execute the breadcrumb update.
4885 	 * We cannot rely on the context being intact across the GPU hang,
4886 	 * so clear it and rebuild just what we need for the breadcrumb.
4887 	 * All pending requests for this context will be zapped, and any
4888 	 * future request will be after userspace has had the opportunity
4889 	 * to recreate its own state.
4890 	 */
4891 	if (scrub)
4892 		restore_default_state(ce, engine);
4893 
4894 	/* Rerun the request; its payload has been neutered (if guilty). */
4895 	ce->ring->head = head;
4896 	intel_ring_update_space(ce->ring);
4897 
4898 	__execlists_update_reg_state(ce, engine);
4899 }
4900 
4901 bool
4902 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4903 {
4904 	return engine->set_default_submission ==
4905 	       intel_execlists_set_default_submission;
4906 }
4907 
4908 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4909 #include "selftest_lrc.c"
4910 #endif
4911