xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 890f0b0d)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
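 * As a purely illustrative sketch (pseudo-code only, not the literal
 * submission path implemented below) of the pairing rule described above:
 *
 *	elsp[0] = pop(queue);
 *	while (!empty(queue) && same_context(peek(queue), elsp[0]))
 *		elsp[0] = pop(queue);	(coalesced into one RING_TAIL update)
 *	elsp[1] = empty(queue) ? NULL : pop(queue);
 *	submit_to_elsp(elsp[0], elsp[1]);
 *
 * where pop(), peek(), same_context() and submit_to_elsp() are hypothetical
 * helpers used only for exposition.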
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
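/*
 * WA_TAIL_DWORDS reserves room for the pair of MI_NOOPs appended after each
 * request (gen8_emit_wa_tail) so that the RING_TAIL can always be advanced
 * on resubmission; see the WaIdleLiteRestore note in
 * execlists_update_context().
 */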
179 #define WA_TAIL_DWORDS 2
180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon on our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[0];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
243 static void mark_eio(struct i915_request *rq)
244 {
245 	if (i915_request_completed(rq))
246 		return;
247 
248 	GEM_BUG_ON(i915_request_signaled(rq));
249 
250 	dma_fence_set_error(&rq->fence, -EIO);
251 	i915_request_mark_complete(rq);
252 }
253 
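/*
 * Walk back along the timeline from @rq and return the oldest request that
 * has not yet completed, i.e. the point from which execution has to resume
 * when this context is reset (see reset_active()).
 */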
254 static struct i915_request *
255 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
256 {
257 	struct i915_request *active = rq;
258 
259 	rcu_read_lock();
260 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
261 		if (i915_request_completed(rq))
262 			break;
263 
264 		active = rq;
265 	}
266 	rcu_read_unlock();
267 
268 	return active;
269 }
270 
271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
272 {
273 	return (i915_ggtt_offset(engine->status_page.vma) +
274 		I915_GEM_HWS_PREEMPT_ADDR);
275 }
276 
277 static inline void
278 ring_set_paused(const struct intel_engine_cs *engine, int state)
279 {
280 	/*
281 	 * We inspect HWS_PREEMPT with a semaphore inside
282 	 * engine->emit_fini_breadcrumb. If the dword is true,
283 	 * the ring is paused as the semaphore will busywait
284 	 * until the dword is false.
285 	 */
286 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
287 	if (state)
288 		wmb();
289 }
290 
291 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
292 {
293 	return rb_entry(rb, struct i915_priolist, node);
294 }
295 
296 static inline int rq_prio(const struct i915_request *rq)
297 {
298 	return rq->sched.attr.priority;
299 }
300 
301 static int effective_prio(const struct i915_request *rq)
302 {
303 	int prio = rq_prio(rq);
304 
305 	/*
306 	 * If this request is special and must not be interrupted at any
307 	 * cost, so be it. Note we are only checking the most recent request
308 	 * in the context and so may be masking an earlier vip request. It
309 	 * is hoped that under the conditions where nopreempt is used, this
310 	 * will not matter (i.e. all requests to that context will be
311 	 * nopreempt for as long as desired).
312 	 */
313 	if (i915_request_has_nopreempt(rq))
314 		prio = I915_PRIORITY_UNPREEMPTABLE;
315 
316 	/*
317 	 * On unwinding the active request, we give it a priority bump
318 	 * if it has completed waiting on any semaphore. If we know that
319 	 * the request has already started, we can prevent an unwanted
320 	 * preempt-to-idle cycle by taking that into account now.
321 	 */
322 	if (__i915_request_has_started(rq))
323 		prio |= I915_PRIORITY_NOSEMAPHORE;
324 
325 	/* Restrict mere WAIT boosts from triggering preemption */
326 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
327 	return prio | __NO_PREEMPTION;
328 }
329 
330 static int queue_prio(const struct intel_engine_execlists *execlists)
331 {
332 	struct i915_priolist *p;
333 	struct rb_node *rb;
334 
335 	rb = rb_first_cached(&execlists->queue);
336 	if (!rb)
337 		return INT_MIN;
338 
339 	/*
340 	 * As the priolist[] is inverted, with the highest priority in [0],
341 	 * we have to flip the index value to recover the priority.
342 	 */
343 	p = to_priolist(rb);
344 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
345 }
346 
347 static inline bool need_preempt(const struct intel_engine_cs *engine,
348 				const struct i915_request *rq,
349 				struct rb_node *rb)
350 {
351 	int last_prio;
352 
353 	if (!intel_engine_has_semaphores(engine))
354 		return false;
355 
356 	/*
357 	 * Check if the current priority hint merits a preemption attempt.
358 	 *
359 	 * We record the highest value priority we saw during rescheduling
360 	 * prior to this dequeue, therefore we know that if it is strictly
361 	 * less than the current tail of ELSP[0], we do not need to force
362 	 * a preempt-to-idle cycle.
363 	 *
364 	 * However, the priority hint is a mere hint that we may need to
365 	 * preempt. If that hint is stale or we may be trying to preempt
366 	 * ourselves, ignore the request.
367 	 *
368 	 * More naturally we would write
369 	 *      prio >= max(0, last);
370 	 * except that we wish to prevent triggering preemption at the same
371 	 * priority level: the task that is running should remain running
372 	 * to preserve FIFO ordering of dependencies.
373 	 */
374 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
375 	if (engine->execlists.queue_priority_hint <= last_prio)
376 		return false;
377 
378 	/*
379 	 * Check against the first request in ELSP[1]; it will, thanks to the
380 	 * power of PI, be the highest priority of that context.
381 	 */
382 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
383 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
384 		return true;
385 
386 	if (rb) {
387 		struct virtual_engine *ve =
388 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
389 		bool preempt = false;
390 
391 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
392 			struct i915_request *next;
393 
394 			rcu_read_lock();
395 			next = READ_ONCE(ve->request);
396 			if (next)
397 				preempt = rq_prio(next) > last_prio;
398 			rcu_read_unlock();
399 		}
400 
401 		if (preempt)
402 			return preempt;
403 	}
404 
405 	/*
406 	 * If the inflight context did not trigger the preemption, then maybe
407 	 * it was the set of queued requests? Pick the highest priority in
408 	 * the queue (the first active priolist) and see if it deserves to be
409 	 * running instead of ELSP[0].
410 	 *
411 	 * The highest priority request in the queue cannot be either
412 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it were the same
413 	 * context, its priority would not exceed ELSP[0] aka last_prio.
414 	 */
415 	return queue_prio(&engine->execlists) > last_prio;
416 }
417 
418 __maybe_unused static inline bool
419 assert_priority_queue(const struct i915_request *prev,
420 		      const struct i915_request *next)
421 {
422 	/*
423 	 * Without preemption, the prev may refer to the still active element
424 	 * which we refuse to let go.
425 	 *
426 	 * Even with preemption, there are times when we think it is better not
427 	 * to preempt and leave an ostensibly lower priority request in flight.
428 	 */
429 	if (i915_request_is_active(prev))
430 		return true;
431 
432 	return rq_prio(prev) >= rq_prio(next);
433 }
434 
435 /*
436  * The context descriptor encodes various attributes of a context,
437  * including its GTT address and some flags. Because it's fairly
438  * expensive to calculate, we'll just do it once and cache the result,
439  * which remains valid until the context is unpinned.
440  *
441  * This is what a descriptor looks like, from LSB to MSB::
442  *
443  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
444  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
445  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
446  *      bits 53-54:    mbz, reserved for use by hardware
447  *      bits 55-63:    group ID, currently unused and set to 0
448  *
449  * Starting from Gen11, the upper dword of the descriptor has a new format:
450  *
451  *      bits 32-36:    reserved
452  *      bits 37-47:    SW context ID
453  *      bits 48-53:    engine instance
454  *      bit 54:        mbz, reserved for use by hardware
455  *      bits 55-60:    SW counter
456  *      bits 61-63:    engine class
457  *
458  * engine info, SW context ID and SW counter need to form a unique number
459  * (Context ID) per lrc.
460  */
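/*
 * A purely illustrative example (values invented for exposition): on a Gen8
 * part using a 64b (4-level) PPGTT, with the context state object at GGTT
 * offset 0x00800000, lrc_descriptor() below composes
 *
 *	desc = (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT) |
 *	       GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE | GEN8_CTX_L3LLC_COHERENT |
 *	       0x00800000;			(the LRCA, bits 12-31)
 *
 * The ctx ID in the upper dword is filled in later, at schedule-in time
 * (see __execlists_schedule_in()).
 */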
461 static u64
462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
463 {
464 	u64 desc;
465 
466 	desc = INTEL_LEGACY_32B_CONTEXT;
467 	if (i915_vm_is_4lvl(ce->vm))
468 		desc = INTEL_LEGACY_64B_CONTEXT;
469 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
470 
471 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
472 	if (IS_GEN(engine->i915, 8))
473 		desc |= GEN8_CTX_L3LLC_COHERENT;
474 
475 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
476 	/*
477 	 * The following 32 bits are copied into the OA reports (dword 2).
478 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
479 	 * anything below.
480 	 */
481 	if (INTEL_GEN(engine->i915) >= 11) {
482 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
483 								/* bits 48-53 */
484 
485 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
486 								/* bits 61-63 */
487 	}
488 
489 	return desc;
490 }
491 
492 static inline unsigned int dword_in_page(void *addr)
493 {
494 	return offset_in_page(addr) / sizeof(u32);
495 }
496 
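/*
 * set_offsets() expands a compact byte stream (built from the NOP/LRI/REG/
 * REG16/END macros below) describing the register layout of the context
 * image:
 *
 *  - a byte with BIT(7) set skips that many dwords (filling them with
 *    MI_NOOP when clearing);
 *  - otherwise the byte is an LRI header: bits 0-5 hold the register count
 *    and bits 6-7 the flags (e.g. POSTED selects MI_LRI_FORCE_POSTED);
 *  - each register offset follows as one or two bytes of 7-bit chunks
 *    (BIT(7) marking a continuation), encoding the offset divided by 4;
 *  - a zero byte terminates the stream and is followed by the total size in
 *    dwords of the state, used when clearing to pad past the tail with
 *    MI_NOOP.
 */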
497 static void set_offsets(u32 *regs,
498 			const u8 *data,
499 			const struct intel_engine_cs *engine,
500 			bool clear)
501 #define NOP(x) (BIT(7) | (x))
502 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
503 #define POSTED BIT(0)
504 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
505 #define REG16(x) \
506 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
507 	(((x) >> 2) & 0x7f)
508 #define END(x) 0, (x)
509 {
510 	const u32 base = engine->mmio_base;
511 
512 	while (*data) {
513 		u8 count, flags;
514 
515 		if (*data & BIT(7)) { /* skip */
516 			count = *data++ & ~BIT(7);
517 			if (clear)
518 				memset32(regs, MI_NOOP, count);
519 			regs += count;
520 			continue;
521 		}
522 
523 		count = *data & 0x3f;
524 		flags = *data >> 6;
525 		data++;
526 
527 		*regs = MI_LOAD_REGISTER_IMM(count);
528 		if (flags & POSTED)
529 			*regs |= MI_LRI_FORCE_POSTED;
530 		if (INTEL_GEN(engine->i915) >= 11)
531 			*regs |= MI_LRI_CS_MMIO;
532 		regs++;
533 
534 		GEM_BUG_ON(!count);
535 		do {
536 			u32 offset = 0;
537 			u8 v;
538 
539 			do {
540 				v = *data++;
541 				offset <<= 7;
542 				offset |= v & ~BIT(7);
543 			} while (v & BIT(7));
544 
545 			regs[0] = base + (offset << 2);
546 			if (clear)
547 				regs[1] = 0;
548 			regs += 2;
549 		} while (--count);
550 	}
551 
552 	if (clear) {
553 		u8 count = *++data;
554 
555 		/* Clear past the tail for HW access */
556 		GEM_BUG_ON(dword_in_page(regs) > count);
557 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
558 
559 		/* Close the batch; used mainly by live_lrc_layout() */
560 		*regs = MI_BATCH_BUFFER_END;
561 		if (INTEL_GEN(engine->i915) >= 10)
562 			*regs |= BIT(0);
563 	}
564 }
565 
566 static const u8 gen8_xcs_offsets[] = {
567 	NOP(1),
568 	LRI(11, 0),
569 	REG16(0x244),
570 	REG(0x034),
571 	REG(0x030),
572 	REG(0x038),
573 	REG(0x03c),
574 	REG(0x168),
575 	REG(0x140),
576 	REG(0x110),
577 	REG(0x11c),
578 	REG(0x114),
579 	REG(0x118),
580 
581 	NOP(9),
582 	LRI(9, 0),
583 	REG16(0x3a8),
584 	REG16(0x28c),
585 	REG16(0x288),
586 	REG16(0x284),
587 	REG16(0x280),
588 	REG16(0x27c),
589 	REG16(0x278),
590 	REG16(0x274),
591 	REG16(0x270),
592 
593 	NOP(13),
594 	LRI(2, 0),
595 	REG16(0x200),
596 	REG(0x028),
597 
598 	END(80)
599 };
600 
601 static const u8 gen9_xcs_offsets[] = {
602 	NOP(1),
603 	LRI(14, POSTED),
604 	REG16(0x244),
605 	REG(0x034),
606 	REG(0x030),
607 	REG(0x038),
608 	REG(0x03c),
609 	REG(0x168),
610 	REG(0x140),
611 	REG(0x110),
612 	REG(0x11c),
613 	REG(0x114),
614 	REG(0x118),
615 	REG(0x1c0),
616 	REG(0x1c4),
617 	REG(0x1c8),
618 
619 	NOP(3),
620 	LRI(9, POSTED),
621 	REG16(0x3a8),
622 	REG16(0x28c),
623 	REG16(0x288),
624 	REG16(0x284),
625 	REG16(0x280),
626 	REG16(0x27c),
627 	REG16(0x278),
628 	REG16(0x274),
629 	REG16(0x270),
630 
631 	NOP(13),
632 	LRI(1, POSTED),
633 	REG16(0x200),
634 
635 	NOP(13),
636 	LRI(44, POSTED),
637 	REG(0x028),
638 	REG(0x09c),
639 	REG(0x0c0),
640 	REG(0x178),
641 	REG(0x17c),
642 	REG16(0x358),
643 	REG(0x170),
644 	REG(0x150),
645 	REG(0x154),
646 	REG(0x158),
647 	REG16(0x41c),
648 	REG16(0x600),
649 	REG16(0x604),
650 	REG16(0x608),
651 	REG16(0x60c),
652 	REG16(0x610),
653 	REG16(0x614),
654 	REG16(0x618),
655 	REG16(0x61c),
656 	REG16(0x620),
657 	REG16(0x624),
658 	REG16(0x628),
659 	REG16(0x62c),
660 	REG16(0x630),
661 	REG16(0x634),
662 	REG16(0x638),
663 	REG16(0x63c),
664 	REG16(0x640),
665 	REG16(0x644),
666 	REG16(0x648),
667 	REG16(0x64c),
668 	REG16(0x650),
669 	REG16(0x654),
670 	REG16(0x658),
671 	REG16(0x65c),
672 	REG16(0x660),
673 	REG16(0x664),
674 	REG16(0x668),
675 	REG16(0x66c),
676 	REG16(0x670),
677 	REG16(0x674),
678 	REG16(0x678),
679 	REG16(0x67c),
680 	REG(0x068),
681 
682 	END(176)
683 };
684 
685 static const u8 gen12_xcs_offsets[] = {
686 	NOP(1),
687 	LRI(13, POSTED),
688 	REG16(0x244),
689 	REG(0x034),
690 	REG(0x030),
691 	REG(0x038),
692 	REG(0x03c),
693 	REG(0x168),
694 	REG(0x140),
695 	REG(0x110),
696 	REG(0x1c0),
697 	REG(0x1c4),
698 	REG(0x1c8),
699 	REG(0x180),
700 	REG16(0x2b4),
701 
702 	NOP(5),
703 	LRI(9, POSTED),
704 	REG16(0x3a8),
705 	REG16(0x28c),
706 	REG16(0x288),
707 	REG16(0x284),
708 	REG16(0x280),
709 	REG16(0x27c),
710 	REG16(0x278),
711 	REG16(0x274),
712 	REG16(0x270),
713 
714 	END(80)
715 };
716 
717 static const u8 gen8_rcs_offsets[] = {
718 	NOP(1),
719 	LRI(14, POSTED),
720 	REG16(0x244),
721 	REG(0x034),
722 	REG(0x030),
723 	REG(0x038),
724 	REG(0x03c),
725 	REG(0x168),
726 	REG(0x140),
727 	REG(0x110),
728 	REG(0x11c),
729 	REG(0x114),
730 	REG(0x118),
731 	REG(0x1c0),
732 	REG(0x1c4),
733 	REG(0x1c8),
734 
735 	NOP(3),
736 	LRI(9, POSTED),
737 	REG16(0x3a8),
738 	REG16(0x28c),
739 	REG16(0x288),
740 	REG16(0x284),
741 	REG16(0x280),
742 	REG16(0x27c),
743 	REG16(0x278),
744 	REG16(0x274),
745 	REG16(0x270),
746 
747 	NOP(13),
748 	LRI(1, 0),
749 	REG(0x0c8),
750 
751 	END(80)
752 };
753 
754 static const u8 gen9_rcs_offsets[] = {
755 	NOP(1),
756 	LRI(14, POSTED),
757 	REG16(0x244),
758 	REG(0x34),
759 	REG(0x30),
760 	REG(0x38),
761 	REG(0x3c),
762 	REG(0x168),
763 	REG(0x140),
764 	REG(0x110),
765 	REG(0x11c),
766 	REG(0x114),
767 	REG(0x118),
768 	REG(0x1c0),
769 	REG(0x1c4),
770 	REG(0x1c8),
771 
772 	NOP(3),
773 	LRI(9, POSTED),
774 	REG16(0x3a8),
775 	REG16(0x28c),
776 	REG16(0x288),
777 	REG16(0x284),
778 	REG16(0x280),
779 	REG16(0x27c),
780 	REG16(0x278),
781 	REG16(0x274),
782 	REG16(0x270),
783 
784 	NOP(13),
785 	LRI(1, 0),
786 	REG(0xc8),
787 
788 	NOP(13),
789 	LRI(44, POSTED),
790 	REG(0x28),
791 	REG(0x9c),
792 	REG(0xc0),
793 	REG(0x178),
794 	REG(0x17c),
795 	REG16(0x358),
796 	REG(0x170),
797 	REG(0x150),
798 	REG(0x154),
799 	REG(0x158),
800 	REG16(0x41c),
801 	REG16(0x600),
802 	REG16(0x604),
803 	REG16(0x608),
804 	REG16(0x60c),
805 	REG16(0x610),
806 	REG16(0x614),
807 	REG16(0x618),
808 	REG16(0x61c),
809 	REG16(0x620),
810 	REG16(0x624),
811 	REG16(0x628),
812 	REG16(0x62c),
813 	REG16(0x630),
814 	REG16(0x634),
815 	REG16(0x638),
816 	REG16(0x63c),
817 	REG16(0x640),
818 	REG16(0x644),
819 	REG16(0x648),
820 	REG16(0x64c),
821 	REG16(0x650),
822 	REG16(0x654),
823 	REG16(0x658),
824 	REG16(0x65c),
825 	REG16(0x660),
826 	REG16(0x664),
827 	REG16(0x668),
828 	REG16(0x66c),
829 	REG16(0x670),
830 	REG16(0x674),
831 	REG16(0x678),
832 	REG16(0x67c),
833 	REG(0x68),
834 
835 	END(176)
836 };
837 
838 static const u8 gen11_rcs_offsets[] = {
839 	NOP(1),
840 	LRI(15, POSTED),
841 	REG16(0x244),
842 	REG(0x034),
843 	REG(0x030),
844 	REG(0x038),
845 	REG(0x03c),
846 	REG(0x168),
847 	REG(0x140),
848 	REG(0x110),
849 	REG(0x11c),
850 	REG(0x114),
851 	REG(0x118),
852 	REG(0x1c0),
853 	REG(0x1c4),
854 	REG(0x1c8),
855 	REG(0x180),
856 
857 	NOP(1),
858 	LRI(9, POSTED),
859 	REG16(0x3a8),
860 	REG16(0x28c),
861 	REG16(0x288),
862 	REG16(0x284),
863 	REG16(0x280),
864 	REG16(0x27c),
865 	REG16(0x278),
866 	REG16(0x274),
867 	REG16(0x270),
868 
869 	LRI(1, POSTED),
870 	REG(0x1b0),
871 
872 	NOP(10),
873 	LRI(1, 0),
874 	REG(0x0c8),
875 
876 	END(80)
877 };
878 
879 static const u8 gen12_rcs_offsets[] = {
880 	NOP(1),
881 	LRI(13, POSTED),
882 	REG16(0x244),
883 	REG(0x034),
884 	REG(0x030),
885 	REG(0x038),
886 	REG(0x03c),
887 	REG(0x168),
888 	REG(0x140),
889 	REG(0x110),
890 	REG(0x1c0),
891 	REG(0x1c4),
892 	REG(0x1c8),
893 	REG(0x180),
894 	REG16(0x2b4),
895 
896 	NOP(5),
897 	LRI(9, POSTED),
898 	REG16(0x3a8),
899 	REG16(0x28c),
900 	REG16(0x288),
901 	REG16(0x284),
902 	REG16(0x280),
903 	REG16(0x27c),
904 	REG16(0x278),
905 	REG16(0x274),
906 	REG16(0x270),
907 
908 	LRI(3, POSTED),
909 	REG(0x1b0),
910 	REG16(0x5a8),
911 	REG16(0x5ac),
912 
913 	NOP(6),
914 	LRI(1, 0),
915 	REG(0x0c8),
916 
917 	END(80)
918 };
919 
920 #undef END
921 #undef REG16
922 #undef REG
923 #undef LRI
924 #undef NOP
925 
926 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
927 {
928 	/*
929 	 * The gen12+ lists only have the registers we program in the basic
930 	 * default state. We rely on the context image using relative
931 	 * addressing to automatically fix up the register state between the
932 	 * physical engines for a virtual engine.
933 	 */
934 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
935 		   !intel_engine_has_relative_mmio(engine));
936 
937 	if (engine->class == RENDER_CLASS) {
938 		if (INTEL_GEN(engine->i915) >= 12)
939 			return gen12_rcs_offsets;
940 		else if (INTEL_GEN(engine->i915) >= 11)
941 			return gen11_rcs_offsets;
942 		else if (INTEL_GEN(engine->i915) >= 9)
943 			return gen9_rcs_offsets;
944 		else
945 			return gen8_rcs_offsets;
946 	} else {
947 		if (INTEL_GEN(engine->i915) >= 12)
948 			return gen12_xcs_offsets;
949 		else if (INTEL_GEN(engine->i915) >= 9)
950 			return gen9_xcs_offsets;
951 		else
952 			return gen8_xcs_offsets;
953 	}
954 }
955 
956 static struct i915_request *
957 __unwind_incomplete_requests(struct intel_engine_cs *engine)
958 {
959 	struct i915_request *rq, *rn, *active = NULL;
960 	struct list_head *uninitialized_var(pl);
961 	int prio = I915_PRIORITY_INVALID;
962 
963 	lockdep_assert_held(&engine->active.lock);
964 
965 	list_for_each_entry_safe_reverse(rq, rn,
966 					 &engine->active.requests,
967 					 sched.link) {
968 		if (i915_request_completed(rq))
969 			continue; /* XXX */
970 
971 		__i915_request_unsubmit(rq);
972 
973 		/*
974 		 * Push the request back into the queue for later resubmission.
975 		 * If this request is not native to this physical engine (i.e.
976 		 * it came from a virtual source), push it back onto the virtual
977 		 * engine so that it can be moved across onto another physical
978 		 * engine as load dictates.
979 		 */
980 		if (likely(rq->execution_mask == engine->mask)) {
981 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
982 			if (rq_prio(rq) != prio) {
983 				prio = rq_prio(rq);
984 				pl = i915_sched_lookup_priolist(engine, prio);
985 			}
986 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
987 
988 			list_move(&rq->sched.link, pl);
989 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
990 
991 			active = rq;
992 		} else {
993 			struct intel_engine_cs *owner = rq->context->engine;
994 
995 			/*
996 			 * Decouple the virtual breadcrumb before moving it
997 			 * back to the virtual engine -- we don't want the
998 			 * request to complete in the background and try
999 			 * and cancel the breadcrumb on the virtual engine
1000 			 * (instead of the old engine where it is linked)!
1001 			 */
1002 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1003 				     &rq->fence.flags)) {
1004 				spin_lock_nested(&rq->lock,
1005 						 SINGLE_DEPTH_NESTING);
1006 				i915_request_cancel_breadcrumb(rq);
1007 				spin_unlock(&rq->lock);
1008 			}
1009 			rq->engine = owner;
1010 			owner->submit_request(rq);
1011 			active = NULL;
1012 		}
1013 	}
1014 
1015 	return active;
1016 }
1017 
1018 struct i915_request *
1019 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1020 {
1021 	struct intel_engine_cs *engine =
1022 		container_of(execlists, typeof(*engine), execlists);
1023 
1024 	return __unwind_incomplete_requests(engine);
1025 }
1026 
1027 static inline void
1028 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1029 {
1030 	/*
1031 	 * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1032 	 * the compiler should eliminate this function as dead code.
1033 	 */
1034 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1035 		return;
1036 
1037 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1038 				   status, rq);
1039 }
1040 
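/*
 * Engine busyness accounting: intel_engine_context_in/out() bracket the time
 * during which at least one context is executing on the engine. The counters
 * are protected by a seqlock and are used by the i915 PMU's engine busy-time
 * reporting.
 */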
1041 static void intel_engine_context_in(struct intel_engine_cs *engine)
1042 {
1043 	unsigned long flags;
1044 
1045 	if (READ_ONCE(engine->stats.enabled) == 0)
1046 		return;
1047 
1048 	write_seqlock_irqsave(&engine->stats.lock, flags);
1049 
1050 	if (engine->stats.enabled > 0) {
1051 		if (engine->stats.active++ == 0)
1052 			engine->stats.start = ktime_get();
1053 		GEM_BUG_ON(engine->stats.active == 0);
1054 	}
1055 
1056 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1057 }
1058 
1059 static void intel_engine_context_out(struct intel_engine_cs *engine)
1060 {
1061 	unsigned long flags;
1062 
1063 	if (READ_ONCE(engine->stats.enabled) == 0)
1064 		return;
1065 
1066 	write_seqlock_irqsave(&engine->stats.lock, flags);
1067 
1068 	if (engine->stats.enabled > 0) {
1069 		ktime_t last;
1070 
1071 		if (engine->stats.active && --engine->stats.active == 0) {
1072 			/*
1073 			 * Decrement the active context count and, in case the GPU
1074 			 * is now idle, add the elapsed time to the running total.
1075 			 */
1076 			last = ktime_sub(ktime_get(), engine->stats.start);
1077 
1078 			engine->stats.total = ktime_add(engine->stats.total,
1079 							last);
1080 		} else if (engine->stats.active == 0) {
1081 			/*
1082 			 * After turning on engine stats, context out might be
1083 			 * the first event, in which case we account from the
1084 			 * time stats gathering was turned on.
1085 			 */
1086 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1087 
1088 			engine->stats.total = ktime_add(engine->stats.total,
1089 							last);
1090 		}
1091 	}
1092 
1093 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1094 }
1095 
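/*
 * Return the index into the LRC register state of the RING_MI_MODE
 * (offset, value) pair, or -1 if MI_MODE is not part of the saved context
 * image for this engine.
 */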
1096 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1097 {
1098 	if (INTEL_GEN(engine->i915) >= 12)
1099 		return 0x60;
1100 	else if (INTEL_GEN(engine->i915) >= 9)
1101 		return 0x54;
1102 	else if (engine->class == RENDER_CLASS)
1103 		return 0x58;
1104 	else
1105 		return -1;
1106 }
1107 
1108 static void
1109 execlists_check_context(const struct intel_context *ce,
1110 			const struct intel_engine_cs *engine)
1111 {
1112 	const struct intel_ring *ring = ce->ring;
1113 	u32 *regs = ce->lrc_reg_state;
1114 	bool valid = true;
1115 	int x;
1116 
1117 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1118 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1119 		       engine->name,
1120 		       regs[CTX_RING_START],
1121 		       i915_ggtt_offset(ring->vma));
1122 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1123 		valid = false;
1124 	}
1125 
1126 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1127 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1128 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1129 		       engine->name,
1130 		       regs[CTX_RING_CTL],
1131 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1132 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1133 		valid = false;
1134 	}
1135 
1136 	x = lrc_ring_mi_mode(engine);
1137 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1138 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1139 		       engine->name, regs[x + 1]);
1140 		regs[x + 1] &= ~STOP_RING;
1141 		regs[x + 1] |= STOP_RING << 16;
1142 		valid = false;
1143 	}
1144 
1145 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1146 }
1147 
1148 static void restore_default_state(struct intel_context *ce,
1149 				  struct intel_engine_cs *engine)
1150 {
1151 	u32 *regs = ce->lrc_reg_state;
1152 
1153 	if (engine->pinned_default_state)
1154 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1155 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1156 		       engine->context_size - PAGE_SIZE);
1157 
1158 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1159 }
1160 
1161 static void reset_active(struct i915_request *rq,
1162 			 struct intel_engine_cs *engine)
1163 {
1164 	struct intel_context * const ce = rq->context;
1165 	u32 head;
1166 
1167 	/*
1168 	 * The executing context has been cancelled. We want to prevent
1169 	 * further execution along this context and propagate the error on
1170 	 * to anything depending on its results.
1171 	 *
1172 	 * In __i915_request_submit(), we apply the -EIO and remove the
1173 	 * requests' payloads for any banned requests. But first, we must
1174 	 * rewind the context back to the start of the incomplete request so
1175 	 * that we do not jump back into the middle of the batch.
1176 	 *
1177 	 * We preserve the breadcrumbs and semaphores of the incomplete
1178 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1179 	 * remain correctly ordered. And we defer to __i915_request_submit()
1180 	 * so that all asynchronous waits are correctly handled.
1181 	 */
1182 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1183 		     rq->fence.context, rq->fence.seqno);
1184 
1185 	/* On resubmission of the active request, payload will be scrubbed */
1186 	if (i915_request_completed(rq))
1187 		head = rq->tail;
1188 	else
1189 		head = active_request(ce->timeline, rq)->head;
1190 	head = intel_ring_wrap(ce->ring, head);
1191 
1192 	/* Scrub the context image to prevent replaying the previous batch */
1193 	restore_default_state(ce, engine);
1194 	__execlists_update_reg_state(ce, engine, head);
1195 
1196 	/* We've switched away, so this should be a no-op, but intent matters */
1197 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1198 }
1199 
1200 static inline struct intel_engine_cs *
1201 __execlists_schedule_in(struct i915_request *rq)
1202 {
1203 	struct intel_engine_cs * const engine = rq->engine;
1204 	struct intel_context * const ce = rq->context;
1205 
1206 	intel_context_get(ce);
1207 
1208 	if (unlikely(intel_context_is_banned(ce)))
1209 		reset_active(rq, engine);
1210 
1211 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1212 		execlists_check_context(ce, engine);
1213 
1214 	if (ce->tag) {
1215 		/* Use a fixed tag for OA and friends */
1216 		ce->lrc_desc |= (u64)ce->tag << 32;
1217 	} else {
1218 		/* We don't need a strict matching tag, just different values */
1219 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1220 		ce->lrc_desc |=
1221 			(u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1222 			GEN11_SW_CTX_ID_SHIFT;
1223 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1224 	}
1225 
1226 	__intel_gt_pm_get(engine->gt);
1227 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1228 	intel_engine_context_in(engine);
1229 
1230 	return engine;
1231 }
1232 
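/*
 * ce->inflight records the physical engine the context is currently resident
 * on, with a small submission count packed into the low pointer bits (see
 * ptr_inc/ptr_dec/ptr_unmask_bits below). Only the first schedule_in and the
 * matching last schedule_out do the heavyweight work.
 */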
1233 static inline struct i915_request *
1234 execlists_schedule_in(struct i915_request *rq, int idx)
1235 {
1236 	struct intel_context * const ce = rq->context;
1237 	struct intel_engine_cs *old;
1238 
1239 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1240 	trace_i915_request_in(rq, idx);
1241 
1242 	old = READ_ONCE(ce->inflight);
1243 	do {
1244 		if (!old) {
1245 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1246 			break;
1247 		}
1248 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1249 
1250 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1251 	return i915_request_get(rq);
1252 }
1253 
1254 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1255 {
1256 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1257 	struct i915_request *next = READ_ONCE(ve->request);
1258 
1259 	if (next && next->execution_mask & ~rq->execution_mask)
1260 		tasklet_schedule(&ve->base.execlists.tasklet);
1261 }
1262 
1263 static inline void
1264 __execlists_schedule_out(struct i915_request *rq,
1265 			 struct intel_engine_cs * const engine)
1266 {
1267 	struct intel_context * const ce = rq->context;
1268 
1269 	/*
1270 	 * NB process_csb() is not under the engine->active.lock and hence
1271 	 * schedule_out can race with schedule_in, meaning that we should
1272 	 * refrain from doing non-trivial work here.
1273 	 */
1274 
1275 	/*
1276 	 * If we have just completed this context, the engine may now be
1277 	 * idle and we want to re-enter powersaving.
1278 	 */
1279 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1280 	    i915_request_completed(rq))
1281 		intel_engine_add_retire(engine, ce->timeline);
1282 
1283 	intel_engine_context_out(engine);
1284 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1285 	intel_gt_pm_put_async(engine->gt);
1286 
1287 	/*
1288 	 * If this is part of a virtual engine, its next request may
1289 	 * have been blocked waiting for access to the active context.
1290 	 * We have to kick all the siblings again in case we need to
1291 	 * switch (e.g. the next request is not runnable on this
1292 	 * engine). Hopefully, we will already have submitted the next
1293 	 * request before the tasklet runs and do not need to rebuild
1294 	 * each virtual tree and kick everyone again.
1295 	 */
1296 	if (ce->engine != engine)
1297 		kick_siblings(rq, ce);
1298 
1299 	intel_context_put(ce);
1300 }
1301 
1302 static inline void
1303 execlists_schedule_out(struct i915_request *rq)
1304 {
1305 	struct intel_context * const ce = rq->context;
1306 	struct intel_engine_cs *cur, *old;
1307 
1308 	trace_i915_request_out(rq);
1309 
1310 	old = READ_ONCE(ce->inflight);
1311 	do
1312 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1313 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1314 	if (!cur)
1315 		__execlists_schedule_out(rq, old);
1316 
1317 	i915_request_put(rq);
1318 }
1319 
1320 static u64 execlists_update_context(struct i915_request *rq)
1321 {
1322 	struct intel_context *ce = rq->context;
1323 	u64 desc = ce->lrc_desc;
1324 	u32 tail, prev;
1325 
1326 	/*
1327 	 * WaIdleLiteRestore:bdw,skl
1328 	 *
1329 	 * We should never submit the context with the same RING_TAIL twice
1330 	 * just in case we submit an empty ring, which confuses the HW.
1331 	 *
1332 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1333 	 * the normal request to be able to always advance the RING_TAIL on
1334 	 * subsequent resubmissions (for lite restore). Should that fail us,
1335 	 * and we try and submit the same tail again, force the context
1336 	 * reload.
1337 	 *
1338 	 * If we need to return to a preempted context, we need to skip the
1339 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1340 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1341 	 * an earlier request.
1342 	 */
1343 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1344 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
1345 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1346 		desc |= CTX_DESC_FORCE_RESTORE;
1347 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1348 	rq->tail = rq->wa_tail;
1349 
1350 	/*
1351 	 * Make sure the context image is complete before we submit it to HW.
1352 	 *
1353 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1354 	 * an uncached write such as our mmio register access, but the empirical
1355 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1356 	 * may not be visible to the HW prior to the completion of the UC
1357 	 * register write and that we may begin execution from the context
1358 	 * before its image is complete leading to invalid PD chasing.
1359 	 */
1360 	wmb();
1361 
1362 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1363 	return desc;
1364 }
1365 
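/*
 * Write one 64b context descriptor to the submission port. With an ExecList
 * Submission Queue (execlists->ctrl_reg set, Gen11+) the two dwords land in
 * the ELSQ slot for @port and are loaded later via EL_CTRL_LOAD; on older
 * parts the upper dword is written first and the final lower-dword write to
 * the ELSP triggers the submission.
 */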
1366 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1367 {
1368 	if (execlists->ctrl_reg) {
1369 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1370 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1371 	} else {
1372 		writel(upper_32_bits(desc), execlists->submit_reg);
1373 		writel(lower_32_bits(desc), execlists->submit_reg);
1374 	}
1375 }
1376 
1377 static __maybe_unused void
1378 trace_ports(const struct intel_engine_execlists *execlists,
1379 	    const char *msg,
1380 	    struct i915_request * const *ports)
1381 {
1382 	const struct intel_engine_cs *engine =
1383 		container_of(execlists, typeof(*engine), execlists);
1384 
1385 	if (!ports[0])
1386 		return;
1387 
1388 	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1389 		     ports[0]->fence.context,
1390 		     ports[0]->fence.seqno,
1391 		     i915_request_completed(ports[0]) ? "!" :
1392 		     i915_request_started(ports[0]) ? "*" :
1393 		     "",
1394 		     ports[1] ? ports[1]->fence.context : 0,
1395 		     ports[1] ? ports[1]->fence.seqno : 0);
1396 }
1397 
1398 static __maybe_unused bool
1399 assert_pending_valid(const struct intel_engine_execlists *execlists,
1400 		     const char *msg)
1401 {
1402 	struct i915_request * const *port, *rq;
1403 	struct intel_context *ce = NULL;
1404 
1405 	trace_ports(execlists, msg, execlists->pending);
1406 
1407 	if (!execlists->pending[0]) {
1408 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1409 		return false;
1410 	}
1411 
1412 	if (execlists->pending[execlists_num_ports(execlists)]) {
1413 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1414 			      execlists_num_ports(execlists));
1415 		return false;
1416 	}
1417 
1418 	for (port = execlists->pending; (rq = *port); port++) {
1419 		unsigned long flags;
1420 		bool ok = true;
1421 
1422 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1423 		GEM_BUG_ON(!i915_request_is_active(rq));
1424 
1425 		if (ce == rq->context) {
1426 			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1427 				      ce->timeline->fence_context,
1428 				      port - execlists->pending);
1429 			return false;
1430 		}
1431 		ce = rq->context;
1432 
1433 		/* Hold tightly onto the lock to prevent concurrent retires! */
1434 		if (!spin_trylock_irqsave(&rq->lock, flags))
1435 			continue;
1436 
1437 		if (i915_request_completed(rq))
1438 			goto unlock;
1439 
1440 		if (i915_active_is_idle(&ce->active) &&
1441 		    !intel_context_is_barrier(ce)) {
1442 			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1443 				      ce->timeline->fence_context,
1444 				      port - execlists->pending);
1445 			ok = false;
1446 			goto unlock;
1447 		}
1448 
1449 		if (!i915_vma_is_pinned(ce->state)) {
1450 			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1451 				      ce->timeline->fence_context,
1452 				      port - execlists->pending);
1453 			ok = false;
1454 			goto unlock;
1455 		}
1456 
1457 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1458 			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1459 				      ce->timeline->fence_context,
1460 				      port - execlists->pending);
1461 			ok = false;
1462 			goto unlock;
1463 		}
1464 
1465 unlock:
1466 		spin_unlock_irqrestore(&rq->lock, flags);
1467 		if (!ok)
1468 			return false;
1469 	}
1470 
1471 	return ce;
1472 }
1473 
1474 static void execlists_submit_ports(struct intel_engine_cs *engine)
1475 {
1476 	struct intel_engine_execlists *execlists = &engine->execlists;
1477 	unsigned int n;
1478 
1479 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1480 
1481 	/*
1482 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1483 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1484 	 * not be relinquished until the device is idle (see
1485 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1486 	 * that all ELSP are drained i.e. we have processed the CSB,
1487 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1488 	 */
1489 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1490 
1491 	/*
1492 	 * ELSQ note: the submit queue is not cleared after being submitted
1493 	 * to the HW so we need to make sure we always clean it up. This is
1494 	 * currently ensured by the fact that we always write the same number
1495 	 * of elsq entries, keep this in mind before changing the loop below.
1496 	 */
1497 	for (n = execlists_num_ports(execlists); n--; ) {
1498 		struct i915_request *rq = execlists->pending[n];
1499 
1500 		write_desc(execlists,
1501 			   rq ? execlists_update_context(rq) : 0,
1502 			   n);
1503 	}
1504 
1505 	/* we need to manually load the submit queue */
1506 	if (execlists->ctrl_reg)
1507 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1508 }
1509 
1510 static bool ctx_single_port_submission(const struct intel_context *ce)
1511 {
1512 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1513 		intel_context_force_single_submission(ce));
1514 }
1515 
1516 static bool can_merge_ctx(const struct intel_context *prev,
1517 			  const struct intel_context *next)
1518 {
1519 	if (prev != next)
1520 		return false;
1521 
1522 	if (ctx_single_port_submission(prev))
1523 		return false;
1524 
1525 	return true;
1526 }
1527 
1528 static bool can_merge_rq(const struct i915_request *prev,
1529 			 const struct i915_request *next)
1530 {
1531 	GEM_BUG_ON(prev == next);
1532 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1533 
1534 	/*
1535 	 * We do not submit known completed requests. Therefore if the next
1536 	 * request is already completed, we can pretend to merge it in
1537 	 * with the previous context (and we will skip updating the ELSP
1538 	 * and tracking). Thus hopefully keeping the ELSP full with active
1539 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1540 	 * us.
1541 	 */
1542 	if (i915_request_completed(next))
1543 		return true;
1544 
1545 	if (unlikely((prev->fence.flags ^ next->fence.flags) &
1546 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1547 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1548 		return false;
1549 
1550 	if (!can_merge_ctx(prev->context, next->context))
1551 		return false;
1552 
1553 	return true;
1554 }
1555 
1556 static void virtual_update_register_offsets(u32 *regs,
1557 					    struct intel_engine_cs *engine)
1558 {
1559 	set_offsets(regs, reg_offsets(engine), engine, false);
1560 }
1561 
1562 static bool virtual_matches(const struct virtual_engine *ve,
1563 			    const struct i915_request *rq,
1564 			    const struct intel_engine_cs *engine)
1565 {
1566 	const struct intel_engine_cs *inflight;
1567 
1568 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1569 		return false;
1570 
1571 	/*
1572 	 * We track when the HW has completed saving the context image
1573 	 * (i.e. when we have seen the final CS event switching out of
1574 	 * the context) and must not overwrite the context image before
1575 	 * then. This restricts us to only using the active engine
1576 	 * while the previous virtualized request is inflight (so
1577 	 * we reuse the register offsets). This is a very small
1578 	 * hysteresis on the greedy selection algorithm.
1579 	 */
1580 	inflight = intel_context_inflight(&ve->context);
1581 	if (inflight && inflight != engine)
1582 		return false;
1583 
1584 	return true;
1585 }
1586 
1587 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1588 				     struct intel_engine_cs *engine)
1589 {
1590 	struct intel_engine_cs *old = ve->siblings[0];
1591 
1592 	/* All unattached (rq->engine == old) must already be completed */
1593 
1594 	spin_lock(&old->breadcrumbs.irq_lock);
1595 	if (!list_empty(&ve->context.signal_link)) {
1596 		list_move_tail(&ve->context.signal_link,
1597 			       &engine->breadcrumbs.signalers);
1598 		intel_engine_signal_breadcrumbs(engine);
1599 	}
1600 	spin_unlock(&old->breadcrumbs.irq_lock);
1601 }
1602 
1603 #define for_each_waiter(p__, rq__) \
1604 	list_for_each_entry_lockless(p__, \
1605 				     &(rq__)->sched.waiters_list, \
1606 				     wait_link)
1607 
1608 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1609 {
1610 	LIST_HEAD(list);
1611 
1612 	/*
1613 	 * We want to move the interrupted request to the back of
1614 	 * the round-robin list (i.e. its priority level), but
1615 	 * in doing so, we must then move all requests that were in
1616 	 * flight and were waiting for the interrupted request to
1617 	 * be run after it again.
1618 	 */
1619 	do {
1620 		struct i915_dependency *p;
1621 
1622 		GEM_BUG_ON(i915_request_is_active(rq));
1623 		list_move_tail(&rq->sched.link, pl);
1624 
1625 		for_each_waiter(p, rq) {
1626 			struct i915_request *w =
1627 				container_of(p->waiter, typeof(*w), sched);
1628 
1629 			/* Leave semaphores spinning on the other engines */
1630 			if (w->engine != rq->engine)
1631 				continue;
1632 
1633 			/* No waiter should start before its signaler */
1634 			GEM_BUG_ON(i915_request_started(w) &&
1635 				   !i915_request_completed(rq));
1636 
1637 			GEM_BUG_ON(i915_request_is_active(w));
1638 			if (!i915_request_is_ready(w))
1639 				continue;
1640 
1641 			if (rq_prio(w) < rq_prio(rq))
1642 				continue;
1643 
1644 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1645 			list_move_tail(&w->sched.link, &list);
1646 		}
1647 
1648 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1649 	} while (rq);
1650 }
1651 
1652 static void defer_active(struct intel_engine_cs *engine)
1653 {
1654 	struct i915_request *rq;
1655 
1656 	rq = __unwind_incomplete_requests(engine);
1657 	if (!rq)
1658 		return;
1659 
1660 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1661 }
1662 
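/*
 * Timeslicing: if another request of equal or higher priority is ready,
 * either in the priority queue or queued behind the active request on this
 * engine, only let the active context run until its slice expires and then
 * rotate it to the back of its priority level (see defer_active()).
 */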
1663 static bool
1664 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1665 {
1666 	int hint;
1667 
1668 	if (!intel_engine_has_timeslices(engine))
1669 		return false;
1670 
1671 	hint = engine->execlists.queue_priority_hint;
1672 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1673 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1674 
1675 	return hint >= effective_prio(rq);
1676 }
1677 
1678 static int
1679 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1680 {
1681 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1682 		return INT_MIN;
1683 
1684 	return rq_prio(list_next_entry(rq, sched.link));
1685 }
1686 
1687 static inline unsigned long
1688 timeslice(const struct intel_engine_cs *engine)
1689 {
1690 	return READ_ONCE(engine->props.timeslice_duration_ms);
1691 }
1692 
1693 static unsigned long
1694 active_timeslice(const struct intel_engine_cs *engine)
1695 {
1696 	const struct i915_request *rq = *engine->execlists.active;
1697 
1698 	if (!rq || i915_request_completed(rq))
1699 		return 0;
1700 
1701 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1702 		return 0;
1703 
1704 	return timeslice(engine);
1705 }
1706 
1707 static void set_timeslice(struct intel_engine_cs *engine)
1708 {
1709 	if (!intel_engine_has_timeslices(engine))
1710 		return;
1711 
1712 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1713 }
1714 
1715 static void start_timeslice(struct intel_engine_cs *engine)
1716 {
1717 	struct intel_engine_execlists *execlists = &engine->execlists;
1718 
1719 	execlists->switch_priority_hint = execlists->queue_priority_hint;
1720 
1721 	if (timer_pending(&execlists->timer))
1722 		return;
1723 
1724 	set_timer_ms(&execlists->timer, timeslice(engine));
1725 }
1726 
1727 static void record_preemption(struct intel_engine_execlists *execlists)
1728 {
1729 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1730 }
1731 
1732 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1733 					    const struct i915_request *rq)
1734 {
1735 	if (!rq)
1736 		return 0;
1737 
1738 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1739 	if (unlikely(intel_context_is_banned(rq->context)))
1740 		return 1;
1741 
1742 	return READ_ONCE(engine->props.preempt_timeout_ms);
1743 }
1744 
1745 static void set_preempt_timeout(struct intel_engine_cs *engine,
1746 				const struct i915_request *rq)
1747 {
1748 	if (!intel_engine_has_preempt_reset(engine))
1749 		return;
1750 
1751 	set_timer_ms(&engine->execlists.preempt,
1752 		     active_preempt_timeout(engine, rq));
1753 }
1754 
1755 static inline void clear_ports(struct i915_request **ports, int count)
1756 {
1757 	memset_p((void **)ports, NULL, count);
1758 }
1759 
1760 static void execlists_dequeue(struct intel_engine_cs *engine)
1761 {
1762 	struct intel_engine_execlists * const execlists = &engine->execlists;
1763 	struct i915_request **port = execlists->pending;
1764 	struct i915_request ** const last_port = port + execlists->port_mask;
1765 	struct i915_request * const *active;
1766 	struct i915_request *last;
1767 	struct rb_node *rb;
1768 	bool submit = false;
1769 
1770 	/*
1771 	 * Hardware submission is through 2 ports. Conceptually each port
1772 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1773 	 * static for a context, and unique to each, so we only execute
1774 	 * requests belonging to a single context from each ring. RING_HEAD
1775 	 * is maintained by the CS in the context image; it marks the place
1776 	 * where it got up to last time, and through RING_TAIL we tell the CS
1777 	 * where we want to execute up to this time.
1778 	 *
1779 	 * In this list the requests are in order of execution. Consecutive
1780 	 * requests from the same context are adjacent in the ringbuffer. We
1781 	 * can combine these requests into a single RING_TAIL update:
1782 	 *
1783 	 *              RING_HEAD...req1...req2
1784 	 *                                    ^- RING_TAIL
1785 	 * since to execute req2 the CS must first execute req1.
1786 	 *
1787 	 * Our goal then is to point each port at the end of a consecutive
1788 	 * sequence of requests, as that gives the most optimal (fewest wake
1789 	 * ups and context switches) submission.
1790 	 */
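	/*
	 * For example (illustrative only): with requests A1, A2 queued from
	 * context A, followed by B1 from context B, we aim to submit
	 *
	 *     ELSP[0] = A2 (its RING_TAIL covers both A1 and A2)
	 *     ELSP[1] = B1
	 *
	 * so that a single ELSP write executes everything outstanding.
	 */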
1791 
1792 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1793 		struct virtual_engine *ve =
1794 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1795 		struct i915_request *rq = READ_ONCE(ve->request);
1796 
1797 		if (!rq) { /* lazily cleanup after another engine handled rq */
1798 			rb_erase_cached(rb, &execlists->virtual);
1799 			RB_CLEAR_NODE(rb);
1800 			rb = rb_first_cached(&execlists->virtual);
1801 			continue;
1802 		}
1803 
1804 		if (!virtual_matches(ve, rq, engine)) {
1805 			rb = rb_next(rb);
1806 			continue;
1807 		}
1808 
1809 		break;
1810 	}
1811 
1812 	/*
1813 	 * If the queue is higher priority than the last
1814 	 * request in the currently active context, submit afresh.
1815 	 * We will resubmit again afterwards in case we need to split
1816 	 * the active context to interject the preemption request,
1817 	 * i.e. we will retrigger preemption following the ack in case
1818 	 * of trouble.
1819 	 */
1820 	active = READ_ONCE(execlists->active);
1821 	while ((last = *active) && i915_request_completed(last))
1822 		active++;
1823 
1824 	if (last) {
1825 		if (need_preempt(engine, last, rb)) {
1826 			ENGINE_TRACE(engine,
1827 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1828 				     last->fence.context,
1829 				     last->fence.seqno,
1830 				     last->sched.attr.priority,
1831 				     execlists->queue_priority_hint);
1832 			record_preemption(execlists);
1833 
1834 			/*
1835 			 * Don't let the RING_HEAD advance past the breadcrumb
1836 			 * as we unwind (and until we resubmit) so that we do
1837 			 * not accidentally tell it to go backwards.
1838 			 */
1839 			ring_set_paused(engine, 1);
1840 
1841 			/*
1842 			 * Note that we have not stopped the GPU at this point,
1843 			 * so we are unwinding the incomplete requests as they
1844 			 * remain inflight and so by the time we do complete
1845 			 * the preemption, some of the unwound requests may
1846 			 * complete!
1847 			 */
1848 			__unwind_incomplete_requests(engine);
1849 
1850 			last = NULL;
1851 		} else if (need_timeslice(engine, last) &&
1852 			   timer_expired(&engine->execlists.timer)) {
1853 			ENGINE_TRACE(engine,
1854 				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
1855 				     last->fence.context,
1856 				     last->fence.seqno,
1857 				     last->sched.attr.priority,
1858 				     execlists->queue_priority_hint);
1859 
1860 			ring_set_paused(engine, 1);
1861 			defer_active(engine);
1862 
1863 			/*
1864 			 * Unlike for preemption, if we rewind and continue
1865 			 * executing the same context as previously active,
1866 			 * the order of execution will remain the same and
1867 			 * the tail will only advance. We do not need to
1868 			 * force a full context restore, as a lite-restore
1869 			 * is sufficient to resample the monotonic TAIL.
1870 			 *
1871 			 * If we switch to any other context, similarly we
1872 			 * will not rewind TAIL of current context, and
1873 			 * normal save/restore will preserve state and allow
1874 			 * us to later continue executing the same request.
1875 			 */
1876 			last = NULL;
1877 		} else {
1878 			/*
1879 			 * Otherwise if we already have a request pending
1880 			 * for execution after the current one, we can
1881 			 * just wait until the next CS event before
1882 			 * queuing more. In either case we will force a
1883 			 * lite-restore preemption event, but if we wait
1884 			 * we hopefully coalesce several updates into a single
1885 			 * submission.
1886 			 */
1887 			if (!list_is_last(&last->sched.link,
1888 					  &engine->active.requests)) {
1889 				/*
1890 				 * Even if ELSP[1] is occupied and not worthy
1891 				 * of timeslices, our queue might be.
1892 				 */
1893 				start_timeslice(engine);
1894 				return;
1895 			}
1896 		}
1897 	}
1898 
1899 	while (rb) { /* XXX virtual is always taking precedence */
1900 		struct virtual_engine *ve =
1901 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1902 		struct i915_request *rq;
1903 
1904 		spin_lock(&ve->base.active.lock);
1905 
1906 		rq = ve->request;
1907 		if (unlikely(!rq)) { /* lost the race to a sibling */
1908 			spin_unlock(&ve->base.active.lock);
1909 			rb_erase_cached(rb, &execlists->virtual);
1910 			RB_CLEAR_NODE(rb);
1911 			rb = rb_first_cached(&execlists->virtual);
1912 			continue;
1913 		}
1914 
1915 		GEM_BUG_ON(rq != ve->request);
1916 		GEM_BUG_ON(rq->engine != &ve->base);
1917 		GEM_BUG_ON(rq->context != &ve->context);
1918 
1919 		if (rq_prio(rq) >= queue_prio(execlists)) {
1920 			if (!virtual_matches(ve, rq, engine)) {
1921 				spin_unlock(&ve->base.active.lock);
1922 				rb = rb_next(rb);
1923 				continue;
1924 			}
1925 
1926 			if (last && !can_merge_rq(last, rq)) {
1927 				spin_unlock(&ve->base.active.lock);
1928 				start_timeslice(engine);
1929 				return; /* leave this for another sibling */
1930 			}
1931 
1932 			ENGINE_TRACE(engine,
1933 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
1934 				     rq->fence.context,
1935 				     rq->fence.seqno,
1936 				     i915_request_completed(rq) ? "!" :
1937 				     i915_request_started(rq) ? "*" :
1938 				     "",
1939 				     yesno(engine != ve->siblings[0]));
1940 
1941 			ve->request = NULL;
1942 			ve->base.execlists.queue_priority_hint = INT_MIN;
1943 			rb_erase_cached(rb, &execlists->virtual);
1944 			RB_CLEAR_NODE(rb);
1945 
1946 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1947 			rq->engine = engine;
1948 
1949 			if (engine != ve->siblings[0]) {
1950 				u32 *regs = ve->context.lrc_reg_state;
1951 				unsigned int n;
1952 
1953 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1954 
1955 				if (!intel_engine_has_relative_mmio(engine))
1956 					virtual_update_register_offsets(regs,
1957 									engine);
1958 
1959 				if (!list_empty(&ve->context.signals))
1960 					virtual_xfer_breadcrumbs(ve, engine);
1961 
1962 				/*
1963 				 * Move the bound engine to the top of the list
1964 				 * for future execution. We then kick this
1965 				 * tasklet first before checking others, so that
1966 				 * we preferentially reuse this set of bound
1967 				 * registers.
1968 				 */
1969 				for (n = 1; n < ve->num_siblings; n++) {
1970 					if (ve->siblings[n] == engine) {
1971 						swap(ve->siblings[n],
1972 						     ve->siblings[0]);
1973 						break;
1974 					}
1975 				}
1976 
1977 				GEM_BUG_ON(ve->siblings[0] != engine);
1978 			}
1979 
1980 			if (__i915_request_submit(rq)) {
1981 				submit = true;
1982 				last = rq;
1983 			}
1984 			i915_request_put(rq);
1985 
1986 			/*
1987 			 * Hmm, we have a bunch of virtual engine requests,
1988 			 * but the first one was already completed (thanks
1989 			 * preempt-to-busy!). Keep looking at the virtual engine
1990 			 * queue until we have no more relevant requests (i.e.
1991 			 * the normal submit queue has higher priority).
1992 			 */
1993 			if (!submit) {
1994 				spin_unlock(&ve->base.active.lock);
1995 				rb = rb_first_cached(&execlists->virtual);
1996 				continue;
1997 			}
1998 		}
1999 
2000 		spin_unlock(&ve->base.active.lock);
2001 		break;
2002 	}
2003 
2004 	while ((rb = rb_first_cached(&execlists->queue))) {
2005 		struct i915_priolist *p = to_priolist(rb);
2006 		struct i915_request *rq, *rn;
2007 		int i;
2008 
2009 		priolist_for_each_request_consume(rq, rn, p, i) {
2010 			bool merge = true;
2011 
2012 			/*
2013 			 * Can we combine this request with the current port?
2014 			 * It has to be the same context/ringbuffer and not
2015 			 * have any exceptions (e.g. GVT saying never to
2016 			 * combine contexts).
2017 			 *
2018 			 * If we can combine the requests, we can execute both
2019 			 * by updating the RING_TAIL to point to the end of the
2020 			 * second request, and so we never need to tell the
2021 			 * hardware about the first.
2022 			 */
2023 			if (last && !can_merge_rq(last, rq)) {
2024 				/*
2025 				 * If we are on the second port and cannot
2026 				 * combine this request with the last, then we
2027 				 * are done.
2028 				 */
2029 				if (port == last_port)
2030 					goto done;
2031 
2032 				/*
2033 				 * We must not populate both ELSP[] with the
2034 				 * same LRCA, i.e. we must submit 2 different
2035 				 * contexts if we submit 2 ELSP.
2036 				 */
2037 				if (last->context == rq->context)
2038 					goto done;
2039 
2040 				if (i915_request_has_sentinel(last))
2041 					goto done;
2042 
2043 				/*
2044 				 * If GVT overrides us we only ever submit
2045 				 * port[0], leaving port[1] empty. Note that we
2046 				 * also have to be careful that we don't queue
2047 				 * the same context (even though a different
2048 				 * request) to the second port.
2049 				 */
2050 				if (ctx_single_port_submission(last->context) ||
2051 				    ctx_single_port_submission(rq->context))
2052 					goto done;
2053 
2054 				merge = false;
2055 			}
2056 
2057 			if (__i915_request_submit(rq)) {
2058 				if (!merge) {
2059 					*port = execlists_schedule_in(last, port - execlists->pending);
2060 					port++;
2061 					last = NULL;
2062 				}
2063 
2064 				GEM_BUG_ON(last &&
2065 					   !can_merge_ctx(last->context,
2066 							  rq->context));
2067 
2068 				submit = true;
2069 				last = rq;
2070 			}
2071 		}
2072 
2073 		rb_erase_cached(&p->node, &execlists->queue);
2074 		i915_priolist_free(p);
2075 	}
2076 
2077 done:
2078 	/*
2079 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2080 	 *
2081 	 * We choose the priority hint such that if we add a request of greater
2082 	 * priority than this, we kick the submission tasklet to decide on
2083 	 * the right order of submitting the requests to hardware. We must
2084 	 * also be prepared to reorder requests as they are in-flight on the
2085 	 * HW. We derive the priority hint then as the first "hole" in
2086 	 * the HW submission ports and if there are no available slots,
2087 	 * the priority of the lowest executing request, i.e. last.
2088 	 *
2089 	 * When we do receive a higher priority request ready to run from the
2090 	 * user, see queue_request(), the priority hint is bumped to that
2091 	 * request triggering preemption on the next dequeue (or subsequent
2092 	 * interrupt for secondary ports).
2093 	 */
2094 	execlists->queue_priority_hint = queue_prio(execlists);
2095 
2096 	if (submit) {
2097 		*port = execlists_schedule_in(last, port - execlists->pending);
2098 		execlists->switch_priority_hint =
2099 			switch_prio(engine, *execlists->pending);
2100 
2101 		/*
2102 		 * Skip if we ended up with exactly the same set of requests,
2103 		 * e.g. trying to timeslice a pair of ordered contexts
2104 		 */
2105 		if (!memcmp(active, execlists->pending,
2106 			    (port - execlists->pending + 1) * sizeof(*port))) {
2107 			do
2108 				execlists_schedule_out(fetch_and_zero(port));
2109 			while (port-- != execlists->pending);
2110 
2111 			goto skip_submit;
2112 		}
2113 		clear_ports(port + 1, last_port - port);
2114 
2115 		execlists_submit_ports(engine);
2116 		set_preempt_timeout(engine, *active);
2117 	} else {
2118 skip_submit:
2119 		ring_set_paused(engine, 0);
2120 	}
2121 }
2122 
2123 static void
2124 cancel_port_requests(struct intel_engine_execlists * const execlists)
2125 {
2126 	struct i915_request * const *port;
2127 
2128 	for (port = execlists->pending; *port; port++)
2129 		execlists_schedule_out(*port);
2130 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2131 
2132 	/* Mark the end of active before we overwrite *active */
2133 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2134 		execlists_schedule_out(*port);
2135 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2136 
2137 	WRITE_ONCE(execlists->active, execlists->inflight);
2138 }
2139 
2140 static inline void
2141 invalidate_csb_entries(const u32 *first, const u32 *last)
2142 {
2143 	clflush((void *)first);
2144 	clflush((void *)last);
2145 }
2146 
2147 static inline bool
2148 reset_in_progress(const struct intel_engine_execlists *execlists)
2149 {
2150 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2151 }
2152 
2153 /*
2154  * Starting with Gen12, the status has a new format:
2155  *
2156  *     bit  0:     switched to new queue
2157  *     bit  1:     reserved
2158  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2159  *                 switch detail is set to "wait on semaphore"
2160  *     bits 3-5:   engine class
2161  *     bits 6-11:  engine instance
2162  *     bits 12-14: reserved
2163  *     bits 15-25: sw context id of the lrc the GT switched to
2164  *     bits 26-31: sw counter of the lrc the GT switched to
2165  *     bits 32-35: context switch detail
2166  *                  - 0: ctx complete
2167  *                  - 1: wait on sync flip
2168  *                  - 2: wait on vblank
2169  *                  - 3: wait on scanline
2170  *                  - 4: wait on semaphore
2171  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2172  *                       WAIT_FOR_EVENT)
2173  *     bit  36:    reserved
2174  *     bits 37-43: wait detail (for switch detail 1 to 4)
2175  *     bits 44-46: reserved
2176  *     bits 47-57: sw context id of the lrc the GT switched away from
2177  *     bits 58-63: sw counter of the lrc the GT switched away from
2178  */
2179 static inline bool
2180 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2181 {
2182 	u32 lower_dw = csb[0];
2183 	u32 upper_dw = csb[1];
2184 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2185 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2186 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2187 
2188 	/*
2189 	 * The context switch detail is not guaranteed to be 5 when a preemption
2190 	 * occurs, so we can't just check for that. The check below works for
2191 	 * all the cases we care about, including preemptions of WAIT
2192 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2193 	 * would require some extra handling, but we don't support that.
2194 	 */
2195 	if (!ctx_away_valid || new_queue) {
2196 		GEM_BUG_ON(!ctx_to_valid);
2197 		return true;
2198 	}
2199 
2200 	/*
2201 	 * switch detail = 5 is covered by the case above and we do not expect a
2202 	 * context switch on an unsuccessful wait instruction since we always
2203 	 * use polling mode.
2204 	 */
2205 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2206 	return false;
2207 }
2208 
2209 static inline bool
2210 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2211 {
2212 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2213 }
2214 
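/*
 * Drain the context status buffer (CSB), a circular buffer of num_entries
 * events written by the CS. We walk from our cached read pointer towards the
 * HW write pointer, roughly (illustrative sketch only):
 *
 *	while (head != tail) {
 *		head = (head + 1) % num_entries;
 *		handle_event(buf[2 * head + 0], buf[2 * head + 1]);
 *	}
 *
 * promoting execlists->pending[] to execlists->inflight[] on a promotion
 * event and retiring the head of inflight[] on a completion event.
 */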
2215 static void process_csb(struct intel_engine_cs *engine)
2216 {
2217 	struct intel_engine_execlists * const execlists = &engine->execlists;
2218 	const u32 * const buf = execlists->csb_status;
2219 	const u8 num_entries = execlists->csb_size;
2220 	u8 head, tail;
2221 
2222 	/*
2223 	 * As we modify our execlists state tracking we require exclusive
2224 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2225 	 * and we assume that is only inside the reset paths and so serialised.
2226 	 */
2227 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2228 		   !reset_in_progress(execlists));
2229 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2230 
2231 	/*
2232 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2233 	 * When reading from the csb_write mmio register, we have to be
2234 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2235 	 * the low 4bits. As it happens we know the next 4bits are always
2236 	 * zero and so we can simply mask off the low u8 of the register
2237 	 * and treat it identically to reading from the HWSP (without having
2238 	 * to use explicit shifting and masking, and probably bifurcating
2239 	 * the code to handle the legacy mmio read).
2240 	 */
2241 	head = execlists->csb_head;
2242 	tail = READ_ONCE(*execlists->csb_write);
2243 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2244 	if (unlikely(head == tail))
2245 		return;
2246 
2247 	/*
2248 	 * Hopefully paired with a wmb() in HW!
2249 	 *
2250 	 * We must complete the read of the write pointer before any reads
2251 	 * from the CSB, so that we do not see stale values. Without an rmb
2252 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2253 	 * we perform the READ_ONCE(*csb_write).
2254 	 */
2255 	rmb();
2256 
2257 	do {
2258 		bool promote;
2259 
2260 		if (++head == num_entries)
2261 			head = 0;
2262 
2263 		/*
2264 		 * We are flying near dragons again.
2265 		 *
2266 		 * We hold a reference to the request in execlist_port[]
2267 		 * but no more than that. We are operating in softirq
2268 		 * context and so cannot hold any mutex or sleep. That
2269 		 * prevents us stopping the requests we are processing
2270 		 * in port[] from being retired simultaneously (the
2271 		 * breadcrumb will be complete before we see the
2272 		 * context-switch). As we only hold the reference to the
2273 		 * request, any pointer chasing underneath the request
2274 		 * is subject to a potential use-after-free. Thus we
2275 		 * store all of the bookkeeping within port[] as
2276 		 * required, and avoid using unguarded pointers beneath
2277 		 * request itself. The same applies to the atomic
2278 		 * status notifier.
2279 		 */
2280 
2281 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2282 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2283 
2284 		if (INTEL_GEN(engine->i915) >= 12)
2285 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2286 		else
2287 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2288 		if (promote) {
2289 			struct i915_request * const *old = execlists->active;
2290 
2291 			/* Point active to the new ELSP; prevent overwriting */
2292 			WRITE_ONCE(execlists->active, execlists->pending);
2293 
2294 			if (!inject_preempt_hang(execlists))
2295 				ring_set_paused(engine, 0);
2296 
2297 			/* cancel old inflight, prepare for switch */
2298 			trace_ports(execlists, "preempted", old);
2299 			while (*old)
2300 				execlists_schedule_out(*old++);
2301 
2302 			/* switch pending to inflight */
2303 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2304 			WRITE_ONCE(execlists->active,
2305 				   memcpy(execlists->inflight,
2306 					  execlists->pending,
2307 					  execlists_num_ports(execlists) *
2308 					  sizeof(*execlists->pending)));
2309 
2310 			WRITE_ONCE(execlists->pending[0], NULL);
2311 		} else {
2312 			GEM_BUG_ON(!*execlists->active);
2313 
2314 			/* port0 completed, advanced to port1 */
2315 			trace_ports(execlists, "completed", execlists->active);
2316 
2317 			/*
2318 			 * We rely on the hardware being strongly
2319 			 * ordered, that the breadcrumb write is
2320 			 * coherent (visible from the CPU) before the
2321 			 * user interrupt and CSB is processed.
2322 			 */
2323 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2324 				   !reset_in_progress(execlists));
2325 			execlists_schedule_out(*execlists->active++);
2326 
2327 			GEM_BUG_ON(execlists->active - execlists->inflight >
2328 				   execlists_num_ports(execlists));
2329 		}
2330 	} while (head != tail);
2331 
2332 	execlists->csb_head = head;
2333 	set_timeslice(engine);
2334 
2335 	/*
2336 	 * Gen11 has proven to fail wrt global observation point between
2337 	 * entry and tail update, failing on the ordering and thus
2338 	 * we see an old entry in the context status buffer.
2339 	 *
2340 	 * Forcibly evict the entries for the next gpu csb update,
2341 	 * to increase the odds that we get fresh entries even with
2342 	 * non-working hardware. The cost of doing so comes out mostly in
2343 	 * the wash, as hardware, working or not, will need to do the
2344 	 * invalidation beforehand.
2345 	 */
2346 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2347 }
2348 
2349 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2350 {
2351 	lockdep_assert_held(&engine->active.lock);
2352 	if (!engine->execlists.pending[0]) {
2353 		rcu_read_lock(); /* protect peeking at execlists->active */
2354 		execlists_dequeue(engine);
2355 		rcu_read_unlock();
2356 	}
2357 }
2358 
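/*
 * Suspend a request: move it (and any of its ready waiters on this engine)
 * off the submission queues and onto engine->active.hold so that it cannot
 * be resubmitted to HW, e.g. while its error state is being captured.
 */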
2359 static void __execlists_hold(struct i915_request *rq)
2360 {
2361 	LIST_HEAD(list);
2362 
2363 	do {
2364 		struct i915_dependency *p;
2365 
2366 		if (i915_request_is_active(rq))
2367 			__i915_request_unsubmit(rq);
2368 
2369 		RQ_TRACE(rq, "on hold\n");
2370 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2371 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2372 		i915_request_set_hold(rq);
2373 
2374 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2375 			struct i915_request *w =
2376 				container_of(p->waiter, typeof(*w), sched);
2377 
2378 			/* Leave semaphores spinning on the other engines */
2379 			if (w->engine != rq->engine)
2380 				continue;
2381 
2382 			if (!i915_request_is_ready(w))
2383 				continue;
2384 
2385 			if (i915_request_completed(w))
2386 				continue;
2387 
2388 			if (i915_request_on_hold(w))
2389 				continue;
2390 
2391 			list_move_tail(&w->sched.link, &list);
2392 		}
2393 
2394 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2395 	} while (rq);
2396 }
2397 
2398 static bool execlists_hold(struct intel_engine_cs *engine,
2399 			   struct i915_request *rq)
2400 {
2401 	spin_lock_irq(&engine->active.lock);
2402 
2403 	if (i915_request_completed(rq)) { /* too late! */
2404 		rq = NULL;
2405 		goto unlock;
2406 	}
2407 
2408 	if (rq->engine != engine) { /* preempted virtual engine */
2409 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2410 
2411 		/*
2412 		 * intel_context_inflight() is only protected by virtue
2413 		 * of process_csb() being called only by the tasklet (or
2414 		 * directly from inside reset while the tasklet is suspended).
2415 		 * Assert that neither of those are allowed to run while we
2416 		 * poke at the request queues.
2417 		 */
2418 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2419 
2420 		/*
2421 		 * An unsubmitted request along a virtual engine will
2422 		 * remain on the active (this) engine until we are able
2423 		 * to process the context switch away (and so mark the
2424 		 * context as no longer in flight). That cannot have happened
2425 		 * yet, otherwise we would not be hanging!
2426 		 */
2427 		spin_lock(&ve->base.active.lock);
2428 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2429 		GEM_BUG_ON(ve->request != rq);
2430 		ve->request = NULL;
2431 		spin_unlock(&ve->base.active.lock);
2432 		i915_request_put(rq);
2433 
2434 		rq->engine = engine;
2435 	}
2436 
2437 	/*
2438 	 * Transfer this request onto the hold queue to prevent it
2439 	 * being resubmitted to HW (and potentially completed) before we have
2440 	 * released it. Since we may have already submitted following
2441 	 * requests, we need to remove those as well.
2442 	 */
2443 	GEM_BUG_ON(i915_request_on_hold(rq));
2444 	GEM_BUG_ON(rq->engine != engine);
2445 	__execlists_hold(rq);
2446 
2447 unlock:
2448 	spin_unlock_irq(&engine->active.lock);
2449 	return rq;
2450 }
2451 
2452 static bool hold_request(const struct i915_request *rq)
2453 {
2454 	struct i915_dependency *p;
2455 
2456 	/*
2457 	 * If one of our ancestors is on hold, we must also be on hold,
2458 	 * otherwise we will bypass it and execute before it.
2459 	 */
2460 	list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2461 		const struct i915_request *s =
2462 			container_of(p->signaler, typeof(*s), sched);
2463 
2464 		if (s->engine != rq->engine)
2465 			continue;
2466 
2467 		if (i915_request_on_hold(s))
2468 			return true;
2469 	}
2470 
2471 	return false;
2472 }
2473 
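/*
 * Inverse of __execlists_hold(): return the request, and any of its waiters
 * no longer blocked by another held ancestor, to the priority queue so they
 * may be resubmitted.
 */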
2474 static void __execlists_unhold(struct i915_request *rq)
2475 {
2476 	LIST_HEAD(list);
2477 
2478 	do {
2479 		struct i915_dependency *p;
2480 
2481 		GEM_BUG_ON(!i915_request_on_hold(rq));
2482 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2483 
2484 		i915_request_clear_hold(rq);
2485 		list_move_tail(&rq->sched.link,
2486 			       i915_sched_lookup_priolist(rq->engine,
2487 							  rq_prio(rq)));
2488 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2489 		RQ_TRACE(rq, "hold release\n");
2490 
2491 		/* Also release any children on this engine that are ready */
2492 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2493 			struct i915_request *w =
2494 				container_of(p->waiter, typeof(*w), sched);
2495 
2496 			if (w->engine != rq->engine)
2497 				continue;
2498 
2499 			if (!i915_request_on_hold(w))
2500 				continue;
2501 
2502 			/* Check that no other parents are also on hold */
2503 			if (hold_request(w))
2504 				continue;
2505 
2506 			list_move_tail(&w->sched.link, &list);
2507 		}
2508 
2509 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2510 	} while (rq);
2511 }
2512 
2513 static void execlists_unhold(struct intel_engine_cs *engine,
2514 			     struct i915_request *rq)
2515 {
2516 	spin_lock_irq(&engine->active.lock);
2517 
2518 	/*
2519 	 * Move this request back to the priority queue, and all of its
2520 	 * children and grandchildren that were suspended along with it.
2521 	 */
2522 	__execlists_unhold(rq);
2523 
2524 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2525 		engine->execlists.queue_priority_hint = rq_prio(rq);
2526 		tasklet_hi_schedule(&engine->execlists.tasklet);
2527 	}
2528 
2529 	spin_unlock_irq(&engine->active.lock);
2530 }
2531 
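/*
 * Error capture for a failed preemption is split in two: execlists_capture()
 * runs in softirq context and must only take a quick register snapshot, while
 * the slow compression of the request's objects is deferred to a worker
 * (execlists_capture_work), with the guilty request parked on the hold list
 * in the meantime.
 */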
2532 struct execlists_capture {
2533 	struct work_struct work;
2534 	struct i915_request *rq;
2535 	struct i915_gpu_coredump *error;
2536 };
2537 
2538 static void execlists_capture_work(struct work_struct *work)
2539 {
2540 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2541 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2542 	struct intel_engine_cs *engine = cap->rq->engine;
2543 	struct intel_gt_coredump *gt = cap->error->gt;
2544 	struct intel_engine_capture_vma *vma;
2545 
2546 	/* Compress all the objects attached to the request, slow! */
2547 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2548 	if (vma) {
2549 		struct i915_vma_compress *compress =
2550 			i915_vma_capture_prepare(gt);
2551 
2552 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2553 		i915_vma_capture_finish(gt, compress);
2554 	}
2555 
2556 	gt->simulated = gt->engine->simulated;
2557 	cap->error->simulated = gt->simulated;
2558 
2559 	/* Publish the error state, and announce it to the world */
2560 	i915_error_state_store(cap->error);
2561 	i915_gpu_coredump_put(cap->error);
2562 
2563 	/* Return this request and all that depend upon it for signaling */
2564 	execlists_unhold(engine, cap->rq);
2565 	i915_request_put(cap->rq);
2566 
2567 	kfree(cap);
2568 }
2569 
2570 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2571 {
2572 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2573 	struct execlists_capture *cap;
2574 
2575 	cap = kmalloc(sizeof(*cap), gfp);
2576 	if (!cap)
2577 		return NULL;
2578 
2579 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2580 	if (!cap->error)
2581 		goto err_cap;
2582 
2583 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2584 	if (!cap->error->gt)
2585 		goto err_gpu;
2586 
2587 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2588 	if (!cap->error->gt->engine)
2589 		goto err_gt;
2590 
2591 	return cap;
2592 
2593 err_gt:
2594 	kfree(cap->error->gt);
2595 err_gpu:
2596 	kfree(cap->error);
2597 err_cap:
2598 	kfree(cap);
2599 	return NULL;
2600 }
2601 
2602 static bool execlists_capture(struct intel_engine_cs *engine)
2603 {
2604 	struct execlists_capture *cap;
2605 
2606 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2607 		return true;
2608 
2609 	/*
2610 	 * We need to _quickly_ capture the engine state before we reset.
2611 	 * We are inside an atomic section (softirq) here and we are delaying
2612 	 * the forced preemption event.
2613 	 */
2614 	cap = capture_regs(engine);
2615 	if (!cap)
2616 		return true;
2617 
2618 	cap->rq = execlists_active(&engine->execlists);
2619 	GEM_BUG_ON(!cap->rq);
2620 
2621 	rcu_read_lock();
2622 	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2623 	cap->rq = i915_request_get_rcu(cap->rq);
2624 	rcu_read_unlock();
2625 	if (!cap->rq)
2626 		goto err_free;
2627 
2628 	/*
2629 	 * Remove the request from the execlists queue, and take ownership
2630 	 * of the request. We pass it to our worker who will _slowly_ compress
2631 	 * all the pages the _user_ requested for debugging their batch, after
2632 	 * which we return it to the queue for signaling.
2633 	 *
2634 	 * By removing them from the execlists queue, we also remove the
2635 	 * requests from being processed by __unwind_incomplete_requests()
2636 	 * during the intel_engine_reset(), and so they will *not* be replayed
2637 	 * afterwards.
2638 	 *
2639 	 * Note that because we have not yet reset the engine at this point,
2640 	 * it is possible that the request we have identified as being
2641 	 * guilty did in fact complete, and we will then hit an arbitration
2642 	 * point allowing the outstanding preemption to succeed. The likelihood
2643 	 * of that is very low (as capturing of the engine registers should be
2644 	 * fast enough to run inside an irq-off atomic section!), so we will
2645 	 * simply hold that request accountable for being non-preemptible
2646 	 * long enough to force the reset.
2647 	 */
2648 	if (!execlists_hold(engine, cap->rq))
2649 		goto err_rq;
2650 
2651 	INIT_WORK(&cap->work, execlists_capture_work);
2652 	schedule_work(&cap->work);
2653 	return true;
2654 
2655 err_rq:
2656 	i915_request_put(cap->rq);
2657 err_free:
2658 	i915_gpu_coredump_put(cap->error);
2659 	kfree(cap);
2660 	return false;
2661 }
2662 
2663 static noinline void preempt_reset(struct intel_engine_cs *engine)
2664 {
2665 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2666 	unsigned long *lock = &engine->gt->reset.flags;
2667 
2668 	if (i915_modparams.reset < 3)
2669 		return;
2670 
2671 	if (test_and_set_bit(bit, lock))
2672 		return;
2673 
2674 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2675 	tasklet_disable_nosync(&engine->execlists.tasklet);
2676 
2677 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2678 		     READ_ONCE(engine->props.preempt_timeout_ms),
2679 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2680 
2681 	ring_set_paused(engine, 1); /* Freeze the current request in place */
2682 	if (execlists_capture(engine))
2683 		intel_engine_reset(engine, "preemption time out");
2684 	else
2685 		ring_set_paused(engine, 0);
2686 
2687 	tasklet_enable(&engine->execlists.tasklet);
2688 	clear_and_wake_up_bit(bit, lock);
2689 }
2690 
2691 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2692 {
2693 	const struct timer_list *t = &engine->execlists.preempt;
2694 
2695 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2696 		return false;
2697 
2698 	if (!timer_expired(t))
2699 		return false;
2700 
2701 	return READ_ONCE(engine->execlists.pending[0]);
2702 }
2703 
2704 /*
2705  * Check the unread Context Status Buffers and manage the submission of new
2706  * contexts to the ELSP accordingly.
2707  */
2708 static void execlists_submission_tasklet(unsigned long data)
2709 {
2710 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2711 	bool timeout = preempt_timeout(engine);
2712 
2713 	process_csb(engine);
2714 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2715 		unsigned long flags;
2716 
2717 		spin_lock_irqsave(&engine->active.lock, flags);
2718 		__execlists_submission_tasklet(engine);
2719 		spin_unlock_irqrestore(&engine->active.lock, flags);
2720 
2721 		/* Recheck after serialising with direct-submission */
2722 		if (timeout && preempt_timeout(engine))
2723 			preempt_reset(engine);
2724 	}
2725 }
2726 
2727 static void __execlists_kick(struct intel_engine_execlists *execlists)
2728 {
2729 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2730 	tasklet_hi_schedule(&execlists->tasklet);
2731 }
2732 
2733 #define execlists_kick(t, member) \
2734 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2735 
2736 static void execlists_timeslice(struct timer_list *timer)
2737 {
2738 	execlists_kick(timer, timer);
2739 }
2740 
2741 static void execlists_preempt(struct timer_list *timer)
2742 {
2743 	execlists_kick(timer, preempt);
2744 }
2745 
2746 static void queue_request(struct intel_engine_cs *engine,
2747 			  struct i915_request *rq)
2748 {
2749 	GEM_BUG_ON(!list_empty(&rq->sched.link));
2750 	list_add_tail(&rq->sched.link,
2751 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2752 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2753 }
2754 
2755 static void __submit_queue_imm(struct intel_engine_cs *engine)
2756 {
2757 	struct intel_engine_execlists * const execlists = &engine->execlists;
2758 
2759 	if (reset_in_progress(execlists))
2760 		return; /* defer until we restart the engine following reset */
2761 
2762 	if (execlists->tasklet.func == execlists_submission_tasklet)
2763 		__execlists_submission_tasklet(engine);
2764 	else
2765 		tasklet_hi_schedule(&execlists->tasklet);
2766 }
2767 
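/*
 * Only kick the submission tasklet if the new request raises the
 * queue_priority_hint; work of equal or lower priority will be picked up
 * naturally on the next dequeue.
 */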
2768 static void submit_queue(struct intel_engine_cs *engine,
2769 			 const struct i915_request *rq)
2770 {
2771 	struct intel_engine_execlists *execlists = &engine->execlists;
2772 
2773 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2774 		return;
2775 
2776 	execlists->queue_priority_hint = rq_prio(rq);
2777 	__submit_queue_imm(engine);
2778 }
2779 
2780 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2781 			     const struct i915_request *rq)
2782 {
2783 	GEM_BUG_ON(i915_request_on_hold(rq));
2784 	return !list_empty(&engine->active.hold) && hold_request(rq);
2785 }
2786 
2787 static void execlists_submit_request(struct i915_request *request)
2788 {
2789 	struct intel_engine_cs *engine = request->engine;
2790 	unsigned long flags;
2791 
2792 	/* Will be called from irq-context when using foreign fences. */
2793 	spin_lock_irqsave(&engine->active.lock, flags);
2794 
2795 	if (unlikely(ancestor_on_hold(engine, request))) {
2796 		list_add_tail(&request->sched.link, &engine->active.hold);
2797 		i915_request_set_hold(request);
2798 	} else {
2799 		queue_request(engine, request);
2800 
2801 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2802 		GEM_BUG_ON(list_empty(&request->sched.link));
2803 
2804 		submit_queue(engine, request);
2805 	}
2806 
2807 	spin_unlock_irqrestore(&engine->active.lock, flags);
2808 }
2809 
2810 static void __execlists_context_fini(struct intel_context *ce)
2811 {
2812 	intel_ring_put(ce->ring);
2813 	i915_vma_put(ce->state);
2814 }
2815 
2816 static void execlists_context_destroy(struct kref *kref)
2817 {
2818 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2819 
2820 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2821 	GEM_BUG_ON(intel_context_is_pinned(ce));
2822 
2823 	if (ce->state)
2824 		__execlists_context_fini(ce);
2825 
2826 	intel_context_fini(ce);
2827 	intel_context_free(ce);
2828 }
2829 
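/*
 * With CONFIG_DRM_I915_DEBUG_GEM enabled, poison the page following the
 * context image and verify it again on unpin (see check_redzone()) so that
 * any HW write beyond the end of the context state is caught.
 */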
2830 static void
2831 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2832 {
2833 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2834 		return;
2835 
2836 	vaddr += engine->context_size;
2837 
2838 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2839 }
2840 
2841 static void
2842 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2843 {
2844 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2845 		return;
2846 
2847 	vaddr += engine->context_size;
2848 
2849 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2850 		dev_err_once(engine->i915->drm.dev,
2851 			     "%s context redzone overwritten!\n",
2852 			     engine->name);
2853 }
2854 
2855 static void execlists_context_unpin(struct intel_context *ce)
2856 {
2857 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2858 		      ce->engine);
2859 
2860 	i915_gem_object_unpin_map(ce->state->obj);
2861 }
2862 
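/*
 * Refresh the ring registers (RING_START/HEAD/TAIL) stored in the context
 * image, and for the render class also the power clock state (RPCS), so the
 * HW resamples our software bookkeeping on the next context restore.
 */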
2863 static void
2864 __execlists_update_reg_state(const struct intel_context *ce,
2865 			     const struct intel_engine_cs *engine,
2866 			     u32 head)
2867 {
2868 	struct intel_ring *ring = ce->ring;
2869 	u32 *regs = ce->lrc_reg_state;
2870 
2871 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
2872 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2873 
2874 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2875 	regs[CTX_RING_HEAD] = head;
2876 	regs[CTX_RING_TAIL] = ring->tail;
2877 
2878 	/* RPCS */
2879 	if (engine->class == RENDER_CLASS) {
2880 		regs[CTX_R_PWR_CLK_STATE] =
2881 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2882 
2883 		i915_oa_init_reg_state(ce, engine);
2884 	}
2885 }
2886 
2887 static int
2888 __execlists_context_pin(struct intel_context *ce,
2889 			struct intel_engine_cs *engine)
2890 {
2891 	void *vaddr;
2892 
2893 	GEM_BUG_ON(!ce->state);
2894 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2895 
2896 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2897 					i915_coherent_map_type(engine->i915) |
2898 					I915_MAP_OVERRIDE);
2899 	if (IS_ERR(vaddr))
2900 		return PTR_ERR(vaddr);
2901 
2902 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2903 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2904 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
2905 
2906 	return 0;
2907 }
2908 
2909 static int execlists_context_pin(struct intel_context *ce)
2910 {
2911 	return __execlists_context_pin(ce, ce->engine);
2912 }
2913 
2914 static int execlists_context_alloc(struct intel_context *ce)
2915 {
2916 	return __execlists_context_alloc(ce, ce->engine);
2917 }
2918 
2919 static void execlists_context_reset(struct intel_context *ce)
2920 {
2921 	CE_TRACE(ce, "reset\n");
2922 	GEM_BUG_ON(!intel_context_is_pinned(ce));
2923 
2924 	/*
2925 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2926 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2927 	 * that stored in context. As we only write new commands from
2928 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2929 	 * starts reading from its RING_HEAD from the context, it may try to
2930 	 * execute that junk and die.
2931 	 *
2932 	 * The contexts that are still pinned on resume belong to the
2933 	 * kernel, and are local to each engine. All other contexts will
2934 	 * have their head/tail sanitized upon pinning before use, so they
2935 	 * will never see garbage.
2936 	 *
2937 	 * So to avoid that we reset the context images upon resume. For
2938 	 * simplicity, we just zero everything out.
2939 	 */
2940 	intel_ring_reset(ce->ring, ce->ring->emit);
2941 
2942 	/* Scrub away the garbage */
2943 	execlists_init_reg_state(ce->lrc_reg_state,
2944 				 ce, ce->engine, ce->ring, true);
2945 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
2946 
2947 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2948 }
2949 
2950 static const struct intel_context_ops execlists_context_ops = {
2951 	.alloc = execlists_context_alloc,
2952 
2953 	.pin = execlists_context_pin,
2954 	.unpin = execlists_context_unpin,
2955 
2956 	.enter = intel_context_enter_engine,
2957 	.exit = intel_context_exit_engine,
2958 
2959 	.reset = execlists_context_reset,
2960 	.destroy = execlists_context_destroy,
2961 };
2962 
2963 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2964 {
2965 	u32 *cs;
2966 
2967 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2968 
2969 	cs = intel_ring_begin(rq, 6);
2970 	if (IS_ERR(cs))
2971 		return PTR_ERR(cs);
2972 
2973 	/*
2974 	 * Check if we have been preempted before we even get started.
2975 	 *
2976 	 * After this point i915_request_started() reports true, even if
2977 	 * we get preempted and so are no longer running.
2978 	 */
2979 	*cs++ = MI_ARB_CHECK;
2980 	*cs++ = MI_NOOP;
2981 
2982 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2983 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2984 	*cs++ = 0;
2985 	*cs++ = rq->fence.seqno - 1;
2986 
2987 	intel_ring_advance(rq, cs);
2988 
2989 	/* Record the updated position of the request's payload */
2990 	rq->infix = intel_ring_offset(rq, cs);
2991 
2992 	return 0;
2993 }
2994 
2995 static int execlists_request_alloc(struct i915_request *request)
2996 {
2997 	int ret;
2998 
2999 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3000 
3001 	/*
3002 	 * Flush enough space to reduce the likelihood of waiting after
3003 	 * we start building the request - in which case we will just
3004 	 * have to repeat work.
3005 	 */
3006 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3007 
3008 	/*
3009 	 * Note that after this point, we have committed to using
3010 	 * this request as it is being used to both track the
3011 	 * state of engine initialisation and liveness of the
3012 	 * golden renderstate above. Think twice before you try
3013 	 * to cancel/unwind this request now.
3014 	 */
3015 
3016 	/* Unconditionally invalidate GPU caches and TLBs. */
3017 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3018 	if (ret)
3019 		return ret;
3020 
3021 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3022 	return 0;
3023 }
3024 
3025 /*
3026  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3027  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3028  * but there is a slight complication as this is applied in WA batch where the
3029  * values are only initialized once so we cannot take register value at the
3030  * beginning and reuse it further; hence we save its value to memory, upload a
3031  * constant value with bit21 set and then we restore it back with the saved value.
3032  * To simplify the WA, a constant value is formed by using the default value
3033  * of this register. This shouldn't be a problem because we are only modifying
3034  * it for a short period and this batch is non-preemptible. We can of course
3035  * use additional instructions that read the actual value of the register
3036  * at that time and set our bit of interest but it makes the WA complicated.
3037  *
3038  * This WA is also required for Gen9 so extracting as a function avoids
3039  * code duplication.
3040  */
3041 static u32 *
3042 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3043 {
3044 	/* NB no one else is allowed to scribble over scratch + 256! */
3045 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3046 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3047 	*batch++ = intel_gt_scratch_offset(engine->gt,
3048 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3049 	*batch++ = 0;
3050 
3051 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3052 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3053 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3054 
3055 	batch = gen8_emit_pipe_control(batch,
3056 				       PIPE_CONTROL_CS_STALL |
3057 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3058 				       0);
3059 
3060 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3061 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3062 	*batch++ = intel_gt_scratch_offset(engine->gt,
3063 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3064 	*batch++ = 0;
3065 
3066 	return batch;
3067 }
3068 
3069 /*
3070  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3071  * initialized at the beginning and shared across all contexts but this field
3072  * helps us to have multiple batches at different offsets and select them based
3073  * on some criteria. At the moment this batch always starts at the beginning of the page
3074  * and at this point we don't have multiple wa_ctx batch buffers.
3075  *
3076  * The number of WAs applied is not known at the beginning; we use this field
3077  * to return the number of DWORDS written.
3078  *
3079  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3080  * so it adds NOOPs as padding to make it cacheline aligned.
3081  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3082  * makes a complete batch buffer.
3083  */
3084 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3085 {
3086 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3087 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3088 
3089 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3090 	if (IS_BROADWELL(engine->i915))
3091 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3092 
3093 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3094 	/* Actual scratch location is at 128 bytes offset */
3095 	batch = gen8_emit_pipe_control(batch,
3096 				       PIPE_CONTROL_FLUSH_L3 |
3097 				       PIPE_CONTROL_STORE_DATA_INDEX |
3098 				       PIPE_CONTROL_CS_STALL |
3099 				       PIPE_CONTROL_QW_WRITE,
3100 				       LRC_PPHWSP_SCRATCH_ADDR);
3101 
3102 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3103 
3104 	/* Pad to end of cacheline */
3105 	while ((unsigned long)batch % CACHELINE_BYTES)
3106 		*batch++ = MI_NOOP;
3107 
3108 	/*
3109 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3110 	 * execution depends on the length specified in terms of cache lines
3111 	 * in the register CTX_RCS_INDIRECT_CTX
3112 	 */
3113 
3114 	return batch;
3115 }
3116 
3117 struct lri {
3118 	i915_reg_t reg;
3119 	u32 value;
3120 };
3121 
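/*
 * Emit a single MI_LOAD_REGISTER_IMM covering up to 63 register/value pairs;
 * used below to pack the gen9 indirect-context workarounds into one command.
 */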
3122 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3123 {
3124 	GEM_BUG_ON(!count || count > 63);
3125 
3126 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3127 	do {
3128 		*batch++ = i915_mmio_reg_offset(lri->reg);
3129 		*batch++ = lri->value;
3130 	} while (lri++, --count);
3131 	*batch++ = MI_NOOP;
3132 
3133 	return batch;
3134 }
3135 
3136 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3137 {
3138 	static const struct lri lri[] = {
3139 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3140 		{
3141 			COMMON_SLICE_CHICKEN2,
3142 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3143 				       0),
3144 		},
3145 
3146 		/* BSpec: 11391 */
3147 		{
3148 			FF_SLICE_CHICKEN,
3149 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3150 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3151 		},
3152 
3153 		/* BSpec: 11299 */
3154 		{
3155 			_3D_CHICKEN3,
3156 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3157 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3158 		}
3159 	};
3160 
3161 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3162 
3163 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3164 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3165 
3166 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3167 	batch = gen8_emit_pipe_control(batch,
3168 				       PIPE_CONTROL_FLUSH_L3 |
3169 				       PIPE_CONTROL_STORE_DATA_INDEX |
3170 				       PIPE_CONTROL_CS_STALL |
3171 				       PIPE_CONTROL_QW_WRITE,
3172 				       LRC_PPHWSP_SCRATCH_ADDR);
3173 
3174 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3175 
3176 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3177 	if (HAS_POOLED_EU(engine->i915)) {
3178 		/*
3179 		 * EU pool configuration is setup along with golden context
3180 		 * during context initialization. This value depends on
3181 		 * device type (2x6 or 3x6) and needs to be updated based
3182 		 * on which subslice is disabled, especially for 2x6
3183 		 * devices. However, it is safe to load the default
3184 		 * configuration of a 3x6 device instead of masking off
3185 		 * the corresponding bits, because the HW ignores the bits of
3186 		 * a disabled subslice and drops down to the appropriate config. Please
3187 		 * see render_state_setup() in i915_gem_render_state.c for
3188 		 * possible configurations, to avoid duplication they are
3189 		 * not shown here again.
3190 		 */
3191 		*batch++ = GEN9_MEDIA_POOL_STATE;
3192 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3193 		*batch++ = 0x00777000;
3194 		*batch++ = 0;
3195 		*batch++ = 0;
3196 		*batch++ = 0;
3197 	}
3198 
3199 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3200 
3201 	/* Pad to end of cacheline */
3202 	while ((unsigned long)batch % CACHELINE_BYTES)
3203 		*batch++ = MI_NOOP;
3204 
3205 	return batch;
3206 }
3207 
3208 static u32 *
3209 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3210 {
3211 	int i;
3212 
3213 	/*
3214 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3215 	 *
3216 	 * Ensure the engine is idle prior to programming a
3217 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3218 	 */
3219 	batch = gen8_emit_pipe_control(batch,
3220 				       PIPE_CONTROL_CS_STALL,
3221 				       0);
3222 	/*
3223 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3224 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3225 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3226 	 * confusing. Since gen8_emit_pipe_control() already advances the
3227 	 * batch by 6 dwords, we advance the other 10 here, completing a
3228 	 * cacheline. It's not clear if the workaround requires this padding
3229 	 * before other commands, or if it's just the regular padding we would
3230 	 * already have for the workaround bb, so leave it here for now.
3231 	 */
3232 	for (i = 0; i < 10; i++)
3233 		*batch++ = MI_NOOP;
3234 
3235 	/* Pad to end of cacheline */
3236 	while ((unsigned long)batch % CACHELINE_BYTES)
3237 		*batch++ = MI_NOOP;
3238 
3239 	return batch;
3240 }
3241 
3242 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3243 
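/*
 * The per-engine workaround batches (indirect_ctx and per_ctx) share a single
 * shmem page, pinned high in the global GTT for the lifetime of the engine.
 */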
3244 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3245 {
3246 	struct drm_i915_gem_object *obj;
3247 	struct i915_vma *vma;
3248 	int err;
3249 
3250 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3251 	if (IS_ERR(obj))
3252 		return PTR_ERR(obj);
3253 
3254 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3255 	if (IS_ERR(vma)) {
3256 		err = PTR_ERR(vma);
3257 		goto err;
3258 	}
3259 
3260 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
3261 	if (err)
3262 		goto err;
3263 
3264 	engine->wa_ctx.vma = vma;
3265 	return 0;
3266 
3267 err:
3268 	i915_gem_object_put(obj);
3269 	return err;
3270 }
3271 
3272 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3273 {
3274 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3275 }
3276 
3277 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3278 
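/*
 * Build the render engine's context workaround batch buffers: select the
 * per-gen emitters, write them into the wa_ctx page and record each batch's
 * offset and size for later programming into the context image.
 */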
3279 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3280 {
3281 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3282 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3283 					    &wa_ctx->per_ctx };
3284 	wa_bb_func_t wa_bb_fn[2];
3285 	struct page *page;
3286 	void *batch, *batch_ptr;
3287 	unsigned int i;
3288 	int ret;
3289 
3290 	if (engine->class != RENDER_CLASS)
3291 		return 0;
3292 
3293 	switch (INTEL_GEN(engine->i915)) {
3294 	case 12:
3295 	case 11:
3296 		return 0;
3297 	case 10:
3298 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3299 		wa_bb_fn[1] = NULL;
3300 		break;
3301 	case 9:
3302 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3303 		wa_bb_fn[1] = NULL;
3304 		break;
3305 	case 8:
3306 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3307 		wa_bb_fn[1] = NULL;
3308 		break;
3309 	default:
3310 		MISSING_CASE(INTEL_GEN(engine->i915));
3311 		return 0;
3312 	}
3313 
3314 	ret = lrc_setup_wa_ctx(engine);
3315 	if (ret) {
3316 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3317 		return ret;
3318 	}
3319 
3320 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3321 	batch = batch_ptr = kmap_atomic(page);
3322 
3323 	/*
3324 	 * Emit the two workaround batch buffers, recording the offset from the
3325 	 * start of the workaround batch buffer object for each and their
3326 	 * respective sizes.
3327 	 */
3328 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3329 		wa_bb[i]->offset = batch_ptr - batch;
3330 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3331 						  CACHELINE_BYTES))) {
3332 			ret = -EINVAL;
3333 			break;
3334 		}
3335 		if (wa_bb_fn[i])
3336 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3337 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3338 	}
3339 
3340 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3341 
3342 	kunmap_atomic(batch);
3343 	if (ret)
3344 		lrc_destroy_wa_ctx(engine);
3345 
3346 	return ret;
3347 }
3348 
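/*
 * Switch the engine into execlists submission mode: program HWSTAM, select
 * execlists (rather than legacy ring buffer) submission in RING_MODE, clear
 * STOP_RING and point RING_HWS_PGA at the status page.
 */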
3349 static void enable_execlists(struct intel_engine_cs *engine)
3350 {
3351 	u32 mode;
3352 
3353 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3354 
3355 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3356 
3357 	if (INTEL_GEN(engine->i915) >= 11)
3358 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3359 	else
3360 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3361 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3362 
3363 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3364 
3365 	ENGINE_WRITE_FW(engine,
3366 			RING_HWS_PGA,
3367 			i915_ggtt_offset(engine->status_page.vma));
3368 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3369 
3370 	engine->context_tag = 0;
3371 }
3372 
3373 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3374 {
3375 	bool unexpected = false;
3376 
3377 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3378 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3379 		unexpected = true;
3380 	}
3381 
3382 	return unexpected;
3383 }
3384 
3385 static int execlists_resume(struct intel_engine_cs *engine)
3386 {
3387 	intel_engine_apply_workarounds(engine);
3388 	intel_engine_apply_whitelist(engine);
3389 
3390 	intel_mocs_init_engine(engine);
3391 
3392 	intel_engine_reset_breadcrumbs(engine);
3393 
3394 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3395 		struct drm_printer p = drm_debug_printer(__func__);
3396 
3397 		intel_engine_dump(engine, &p, NULL);
3398 	}
3399 
3400 	enable_execlists(engine);
3401 
3402 	return 0;
3403 }
3404 
3405 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3406 {
3407 	struct intel_engine_execlists * const execlists = &engine->execlists;
3408 	unsigned long flags;
3409 
3410 	ENGINE_TRACE(engine, "depth<-%d\n",
3411 		     atomic_read(&execlists->tasklet.count));
3412 
3413 	/*
3414 	 * Prevent request submission to the hardware until we have
3415 	 * completed the reset in i915_gem_reset_finish(). If a request
3416 	 * is completed by one engine, it may then queue a request
3417 	 * to a second via its execlists->tasklet *just* as we are
3418 	 * calling engine->resume() and also writing the ELSP.
3419 	 * Turning off the execlists->tasklet until the reset is over
3420 	 * prevents the race.
3421 	 */
3422 	__tasklet_disable_sync_once(&execlists->tasklet);
3423 	GEM_BUG_ON(!reset_in_progress(execlists));
3424 
3425 	/* And flush any current direct submission. */
3426 	spin_lock_irqsave(&engine->active.lock, flags);
3427 	spin_unlock_irqrestore(&engine->active.lock, flags);
3428 
3429 	/*
3430 	 * We stop the engines, otherwise we might get a failed reset and a
3431 	 * dead gpu (on elk). Also a gpu as modern as kbl can suffer
3432 	 * from a system hang if a batchbuffer is progressing when
3433 	 * the reset is issued, regardless of the READY_TO_RESET ack.
3434 	 * Thus assume it is best to stop engines on all gens
3435 	 * where we have a gpu reset.
3436 	 *
3437 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3438 	 *
3439 	 * FIXME: Wa for more modern gens needs to be validated
3440 	 */
3441 	intel_engine_stop_cs(engine);
3442 }
3443 
3444 static void reset_csb_pointers(struct intel_engine_cs *engine)
3445 {
3446 	struct intel_engine_execlists * const execlists = &engine->execlists;
3447 	const unsigned int reset_value = execlists->csb_size - 1;
3448 
3449 	ring_set_paused(engine, 0);
3450 
3451 	/*
3452 	 * After a reset, the HW starts writing into CSB entry [0]. We
3453 	 * therefore have to set our HEAD pointer back one entry so that
3454 	 * the *first* entry we check is entry 0. To complicate this further,
3455 	 * as we don't wait for the first interrupt after reset, we have to
3456 	 * fake the HW write to point back to the last entry so that our
3457 	 * inline comparison of our cached head position against the last HW
3458 	 * write works even before the first interrupt.
3459 	 */
3460 	execlists->csb_head = reset_value;
3461 	WRITE_ONCE(*execlists->csb_write, reset_value);
3462 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3463 
3464 	/*
3465 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3466 	 * Bludgeon them with a mmio update to be sure.
3467 	 */
3468 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3469 		     reset_value << 8 | reset_value);
3470 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3471 
3472 	invalidate_csb_entries(&execlists->csb_status[0],
3473 			       &execlists->csb_status[reset_value]);
3474 }
3475 
3476 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3477 {
3478 	int x;
3479 
3480 	x = lrc_ring_mi_mode(engine);
3481 	if (x != -1) {
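		/*
		 * RING_MI_MODE in the context image follows the masked-bit
		 * convention: the upper halfword selects which bits the HW
		 * will update, so clear STOP_RING and raise its mask bit.
		 */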
3482 		regs[x + 1] &= ~STOP_RING;
3483 		regs[x + 1] |= STOP_RING << 16;
3484 	}
3485 }
3486 
3487 static void __execlists_reset_reg_state(const struct intel_context *ce,
3488 					const struct intel_engine_cs *engine)
3489 {
3490 	u32 *regs = ce->lrc_reg_state;
3491 
3492 	__reset_stop_ring(regs, engine);
3493 }
3494 
3495 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3496 {
3497 	struct intel_engine_execlists * const execlists = &engine->execlists;
3498 	struct intel_context *ce;
3499 	struct i915_request *rq;
3500 	u32 head;
3501 
3502 	mb(); /* paranoia: read the CSB pointers from after the reset */
3503 	clflush(execlists->csb_write);
3504 	mb();
3505 
3506 	process_csb(engine); /* drain preemption events */
3507 
3508 	/* Following the reset, we need to reload the CSB read/write pointers */
3509 	reset_csb_pointers(engine);
3510 
3511 	/*
3512 	 * Save the currently executing context, even if we completed
3513 	 * its request, it was still running at the time of the
3514 	 * reset and will have been clobbered.
3515 	 */
3516 	rq = execlists_active(execlists);
3517 	if (!rq)
3518 		goto unwind;
3519 
3520 	/* We still have requests in-flight; the engine should be active */
3521 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3522 
3523 	ce = rq->context;
3524 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3525 
3526 	if (i915_request_completed(rq)) {
3527 		/* Idle context; tidy up the ring so we can restart afresh */
3528 		head = intel_ring_wrap(ce->ring, rq->tail);
3529 		goto out_replay;
3530 	}
3531 
3532 	/* Context has requests still in-flight; it should not be idle! */
3533 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3534 	rq = active_request(ce->timeline, rq);
3535 	head = intel_ring_wrap(ce->ring, rq->head);
3536 	GEM_BUG_ON(head == ce->ring->tail);
3537 
3538 	/*
3539 	 * If this request hasn't started yet, e.g. it is waiting on a
3540 	 * semaphore, we need to avoid skipping the request or else we
3541 	 * break the signaling chain. However, if the context is corrupt
3542 	 * the request will not restart and we will be stuck with a wedged
3543 	 * device. It is quite often the case that if we issue a reset
3544 	 * while the GPU is loading the context image, the context
3545 	 * image becomes corrupt.
3546 	 *
3547 	 * Otherwise, if we have not started yet, the request should replay
3548 	 * perfectly and we do not need to flag the result as being erroneous.
3549 	 */
3550 	if (!i915_request_started(rq))
3551 		goto out_replay;
3552 
3553 	/*
3554 	 * If the request was innocent, we leave the request in the ELSP
3555 	 * and will try to replay it on restarting. The context image may
3556 	 * have been corrupted by the reset, in which case we may have
3557 	 * to service a new GPU hang, but more likely we can continue on
3558 	 * without impact.
3559 	 *
3560 	 * If the request was guilty, we presume the context is corrupt
3561 	 * and have to at least restore the RING register in the context
3562 	 * image back to the expected values to skip over the guilty request.
3563 	 */
3564 	__i915_request_reset(rq, stalled);
3565 	if (!stalled)
3566 		goto out_replay;
3567 
3568 	/*
3569 	 * We want a simple context + ring to execute the breadcrumb update.
3570 	 * We cannot rely on the context being intact across the GPU hang,
3571 	 * so clear it and rebuild just what we need for the breadcrumb.
3572 	 * All pending requests for this context will be zapped, and any
3573 	 * future request will be after userspace has had the opportunity
3574 	 * to recreate its own state.
3575 	 */
3576 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3577 	restore_default_state(ce, engine);
3578 
3579 out_replay:
3580 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3581 		     head, ce->ring->tail);
3582 	__execlists_reset_reg_state(ce, engine);
3583 	__execlists_update_reg_state(ce, engine, head);
3584 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3585 
3586 unwind:
3587 	/* Push back any incomplete requests for replay after the reset. */
3588 	cancel_port_requests(execlists);
3589 	__unwind_incomplete_requests(engine);
3590 }
3591 
3592 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3593 {
3594 	unsigned long flags;
3595 
3596 	ENGINE_TRACE(engine, "\n");
3597 
3598 	spin_lock_irqsave(&engine->active.lock, flags);
3599 
3600 	__execlists_reset(engine, stalled);
3601 
3602 	spin_unlock_irqrestore(&engine->active.lock, flags);
3603 }
3604 
3605 static void nop_submission_tasklet(unsigned long data)
3606 {
3607 	/* The driver is wedged; don't process any more events. */
3608 }
3609 
3610 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3611 {
3612 	struct intel_engine_execlists * const execlists = &engine->execlists;
3613 	struct i915_request *rq, *rn;
3614 	struct rb_node *rb;
3615 	unsigned long flags;
3616 
3617 	ENGINE_TRACE(engine, "\n");
3618 
3619 	/*
3620 	 * Before we call engine->cancel_requests(), we should have exclusive
3621 	 * access to the submission state. This is arranged for us by the
3622 	 * caller disabling the interrupt generation, the tasklet and other
3623 	 * threads that may then access the same state, giving us a free hand
3624 	 * to reset state. However, we still need to let lockdep be aware that
3625 	 * we know this state may be accessed in hardirq context, so we
3626 	 * disable the irq around this manipulation and we want to keep
3627 	 * the spinlock focused on its duties and not accidentally conflate
3628 	 * coverage to the submission's irq state. (Similarly, although we
3629 	 * shouldn't need to disable irq around the manipulation of the
3630 	 * submission's irq state, we also wish to remind ourselves that
3631 	 * it is irq state.)
3632 	 */
3633 	spin_lock_irqsave(&engine->active.lock, flags);
3634 
3635 	__execlists_reset(engine, true);
3636 
3637 	/* Mark all executing requests as skipped. */
3638 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3639 		mark_eio(rq);
3640 
3641 	/* Flush the queued requests to the timeline list (for retiring). */
3642 	while ((rb = rb_first_cached(&execlists->queue))) {
3643 		struct i915_priolist *p = to_priolist(rb);
3644 		int i;
3645 
3646 		priolist_for_each_request_consume(rq, rn, p, i) {
3647 			mark_eio(rq);
3648 			__i915_request_submit(rq);
3649 		}
3650 
3651 		rb_erase_cached(&p->node, &execlists->queue);
3652 		i915_priolist_free(p);
3653 	}
3654 
3655 	/* On-hold requests will be flushed to timeline upon their release */
3656 	list_for_each_entry(rq, &engine->active.hold, sched.link)
3657 		mark_eio(rq);
3658 
3659 	/* Cancel all attached virtual engines */
3660 	while ((rb = rb_first_cached(&execlists->virtual))) {
3661 		struct virtual_engine *ve =
3662 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3663 
3664 		rb_erase_cached(rb, &execlists->virtual);
3665 		RB_CLEAR_NODE(rb);
3666 
3667 		spin_lock(&ve->base.active.lock);
3668 		rq = fetch_and_zero(&ve->request);
3669 		if (rq) {
3670 			mark_eio(rq);
3671 
3672 			rq->engine = engine;
3673 			__i915_request_submit(rq);
3674 			i915_request_put(rq);
3675 
3676 			ve->base.execlists.queue_priority_hint = INT_MIN;
3677 		}
3678 		spin_unlock(&ve->base.active.lock);
3679 	}
3680 
3681 	/* Remaining _unready_ requests will be nop'ed when submitted */
3682 
3683 	execlists->queue_priority_hint = INT_MIN;
3684 	execlists->queue = RB_ROOT_CACHED;
3685 
3686 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3687 	execlists->tasklet.func = nop_submission_tasklet;
3688 
3689 	spin_unlock_irqrestore(&engine->active.lock, flags);
3690 }
3691 
3692 static void execlists_reset_finish(struct intel_engine_cs *engine)
3693 {
3694 	struct intel_engine_execlists * const execlists = &engine->execlists;
3695 
3696 	/*
3697 	 * After a GPU reset, we may have requests to replay. Do so now while
3698 	 * we still have the forcewake to be sure that the GPU is not allowed
3699 	 * to sleep before we restart and reload a context.
3700 	 */
3701 	GEM_BUG_ON(!reset_in_progress(execlists));
3702 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3703 		execlists->tasklet.func(execlists->tasklet.data);
3704 
3705 	if (__tasklet_enable(&execlists->tasklet))
3706 		/* And kick in case we missed a new request submission. */
3707 		tasklet_hi_schedule(&execlists->tasklet);
3708 	ENGINE_TRACE(engine, "depth->%d\n",
3709 		     atomic_read(&execlists->tasklet.count));
3710 }
3711 
3712 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3713 				    u64 offset, u32 len,
3714 				    const unsigned int flags)
3715 {
3716 	u32 *cs;
3717 
3718 	cs = intel_ring_begin(rq, 4);
3719 	if (IS_ERR(cs))
3720 		return PTR_ERR(cs);
3721 
3722 	/*
3723 	 * WaDisableCtxRestoreArbitration:bdw,chv
3724 	 *
3725 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3726 	 * particular all the gens that do not need the w/a at all!); if we
3727 	 * took care to make sure that on every switch into this context
3728 	 * (both ordinary and for preemption) arbitration was enabled,
3729 	 * we would be fine. However, for gen8 there is another w/a that
3730 	 * requires us to not preempt inside GPGPU execution, so we keep
3731 	 * arbitration disabled for gen8 batches. Arbitration will be
3732 	 * re-enabled before we close the request
3733 	 * (engine->emit_fini_breadcrumb).
3734 	 */
3735 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3736 
3737 	/* FIXME(BDW+): Address space and security selectors. */
3738 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3739 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3740 	*cs++ = lower_32_bits(offset);
3741 	*cs++ = upper_32_bits(offset);
3742 
3743 	intel_ring_advance(rq, cs);
3744 
3745 	return 0;
3746 }
3747 
3748 static int gen8_emit_bb_start(struct i915_request *rq,
3749 			      u64 offset, u32 len,
3750 			      const unsigned int flags)
3751 {
3752 	u32 *cs;
3753 
3754 	cs = intel_ring_begin(rq, 6);
3755 	if (IS_ERR(cs))
3756 		return PTR_ERR(cs);
3757 
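	/*
	 * Unlike gen8_emit_bb_start_noarb() above, arbitration is enabled
	 * just before the batch and disabled again straight after it, so
	 * the batch itself remains a valid preemption point.
	 */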
3758 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3759 
3760 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3761 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3762 	*cs++ = lower_32_bits(offset);
3763 	*cs++ = upper_32_bits(offset);
3764 
3765 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3766 	*cs++ = MI_NOOP;
3767 
3768 	intel_ring_advance(rq, cs);
3769 
3770 	return 0;
3771 }
3772 
3773 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3774 {
3775 	ENGINE_WRITE(engine, RING_IMR,
3776 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3777 	ENGINE_POSTING_READ(engine, RING_IMR);
3778 }
3779 
3780 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3781 {
3782 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3783 }
3784 
3785 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3786 {
3787 	u32 cmd, *cs;
3788 
3789 	cs = intel_ring_begin(request, 4);
3790 	if (IS_ERR(cs))
3791 		return PTR_ERR(cs);
3792 
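	/*
	 * Note: the "+ 1" below appears to extend the MI_FLUSH_DW length
	 * field to cover the extra dword of the 64b post-sync address
	 * emitted on gen8+ (an inference from the four dwords written
	 * below, not a statement from the bspec).
	 */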
3793 	cmd = MI_FLUSH_DW + 1;
3794 
3795 	/* We always require a command barrier so that subsequent
3796 	 * commands, such as breadcrumb interrupts, are strictly ordered
3797 	 * wrt the contents of the write cache being flushed to memory
3798 	 * (and thus being coherent from the CPU).
3799 	 */
3800 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3801 
3802 	if (mode & EMIT_INVALIDATE) {
3803 		cmd |= MI_INVALIDATE_TLB;
3804 		if (request->engine->class == VIDEO_DECODE_CLASS)
3805 			cmd |= MI_INVALIDATE_BSD;
3806 	}
3807 
3808 	*cs++ = cmd;
3809 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3810 	*cs++ = 0; /* upper addr */
3811 	*cs++ = 0; /* value */
3812 	intel_ring_advance(request, cs);
3813 
3814 	return 0;
3815 }
3816 
3817 static int gen8_emit_flush_render(struct i915_request *request,
3818 				  u32 mode)
3819 {
3820 	bool vf_flush_wa = false, dc_flush_wa = false;
3821 	u32 *cs, flags = 0;
3822 	int len;
3823 
3824 	flags |= PIPE_CONTROL_CS_STALL;
3825 
3826 	if (mode & EMIT_FLUSH) {
3827 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3828 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3829 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3830 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3831 	}
3832 
3833 	if (mode & EMIT_INVALIDATE) {
3834 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3835 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3836 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3837 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3838 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3839 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3840 		flags |= PIPE_CONTROL_QW_WRITE;
3841 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3842 
3843 		/*
3844 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3845 		 * pipe control.
3846 		 */
3847 		if (IS_GEN(request->i915, 9))
3848 			vf_flush_wa = true;
3849 
3850 		/* WaForGAMHang:kbl */
3851 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3852 			dc_flush_wa = true;
3853 	}
3854 
3855 	len = 6;
3856 
3857 	if (vf_flush_wa)
3858 		len += 6;
3859 
3860 	if (dc_flush_wa)
3861 		len += 12;
3862 
3863 	cs = intel_ring_begin(request, len);
3864 	if (IS_ERR(cs))
3865 		return PTR_ERR(cs);
3866 
3867 	if (vf_flush_wa)
3868 		cs = gen8_emit_pipe_control(cs, 0, 0);
3869 
3870 	if (dc_flush_wa)
3871 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3872 					    0);
3873 
3874 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3875 
3876 	if (dc_flush_wa)
3877 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3878 
3879 	intel_ring_advance(request, cs);
3880 
3881 	return 0;
3882 }
3883 
3884 static int gen11_emit_flush_render(struct i915_request *request,
3885 				   u32 mode)
3886 {
3887 	if (mode & EMIT_FLUSH) {
3888 		u32 *cs;
3889 		u32 flags = 0;
3890 
3891 		flags |= PIPE_CONTROL_CS_STALL;
3892 
3893 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3894 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3895 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3896 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3897 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3898 		flags |= PIPE_CONTROL_QW_WRITE;
3899 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3900 
3901 		cs = intel_ring_begin(request, 6);
3902 		if (IS_ERR(cs))
3903 			return PTR_ERR(cs);
3904 
3905 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3906 		intel_ring_advance(request, cs);
3907 	}
3908 
3909 	if (mode & EMIT_INVALIDATE) {
3910 		u32 *cs;
3911 		u32 flags = 0;
3912 
3913 		flags |= PIPE_CONTROL_CS_STALL;
3914 
3915 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3916 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3917 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3918 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3919 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3920 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3921 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3922 		flags |= PIPE_CONTROL_QW_WRITE;
3923 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3924 
3925 		cs = intel_ring_begin(request, 6);
3926 		if (IS_ERR(cs))
3927 			return PTR_ERR(cs);
3928 
3929 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3930 		intel_ring_advance(request, cs);
3931 	}
3932 
3933 	return 0;
3934 }
3935 
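/*
 * MI_ARB_CHECK doubles as the pre-parser (pre-fetch) control on gen12:
 * bit 8 appears to act as the write mask and bit 0 as the new state
 * (inferred from the usage in gen12_emit_flush_render() below; consult
 * the bspec for the authoritative encoding).
 */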
3936 static u32 preparser_disable(bool state)
3937 {
3938 	return MI_ARB_CHECK | 1 << 8 | state;
3939 }
3940 
3941 static int gen12_emit_flush_render(struct i915_request *request,
3942 				   u32 mode)
3943 {
3944 	if (mode & EMIT_FLUSH) {
3945 		u32 flags = 0;
3946 		u32 *cs;
3947 
3948 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3949 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3950 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3951 		/* Wa_1409600907:tgl */
3952 		flags |= PIPE_CONTROL_DEPTH_STALL;
3953 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3954 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3955 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3956 
3957 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3958 		flags |= PIPE_CONTROL_QW_WRITE;
3959 
3960 		flags |= PIPE_CONTROL_CS_STALL;
3961 
3962 		cs = intel_ring_begin(request, 6);
3963 		if (IS_ERR(cs))
3964 			return PTR_ERR(cs);
3965 
3966 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3967 		intel_ring_advance(request, cs);
3968 	}
3969 
3970 	if (mode & EMIT_INVALIDATE) {
3971 		u32 flags = 0;
3972 		u32 *cs;
3973 
3974 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3975 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3976 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3977 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3978 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3979 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3980 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3981 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3982 
3983 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3984 		flags |= PIPE_CONTROL_QW_WRITE;
3985 
3986 		flags |= PIPE_CONTROL_CS_STALL;
3987 
3988 		cs = intel_ring_begin(request, 8);
3989 		if (IS_ERR(cs))
3990 			return PTR_ERR(cs);
3991 
3992 		/*
3993 		 * Prevent the pre-parser from skipping past the TLB
3994 		 * invalidate and loading a stale page for the batch
3995 		 * buffer / request payload.
3996 		 */
3997 		*cs++ = preparser_disable(true);
3998 
3999 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4000 
4001 		*cs++ = preparser_disable(false);
4002 		intel_ring_advance(request, cs);
4003 	}
4004 
4005 	return 0;
4006 }
4007 
4008 /*
4009  * Reserve space for 2 NOOPs at the end of each request to be
4010  * used as a workaround for not being allowed to do lite
4011  * restore with HEAD==TAIL (WaIdleLiteRestore).
4012  */
4013 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4014 {
4015 	/* Ensure there's always at least one preemption point per-request. */
4016 	*cs++ = MI_ARB_CHECK;
4017 	*cs++ = MI_NOOP;
4018 	request->wa_tail = intel_ring_offset(request, cs);
4019 
4020 	return cs;
4021 }
4022 
4023 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4024 {
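	/*
	 * Broadly: poll the preempt slot in the HWSP until it reads back as
	 * zero, i.e. until ring_set_paused() releases us. This is what lets
	 * preempt-to-busy hold a completed request at its breadcrumb while
	 * the ELSP is being rewritten.
	 */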
4025 	*cs++ = MI_SEMAPHORE_WAIT |
4026 		MI_SEMAPHORE_GLOBAL_GTT |
4027 		MI_SEMAPHORE_POLL |
4028 		MI_SEMAPHORE_SAD_EQ_SDD;
4029 	*cs++ = 0;
4030 	*cs++ = intel_hws_preempt_address(request->engine);
4031 	*cs++ = 0;
4032 
4033 	return cs;
4034 }
4035 
4036 static __always_inline u32*
4037 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4038 				 u32 *cs)
4039 {
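	/*
	 * Close out the request: raise the user interrupt for the
	 * breadcrumb, re-enable arbitration (pairing with the disable in
	 * gen8_emit_bb_start*) and, if semaphores are available, emit the
	 * preempt busywait used by preempt-to-busy.
	 */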
4040 	*cs++ = MI_USER_INTERRUPT;
4041 
4042 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4043 	if (intel_engine_has_semaphores(request->engine))
4044 		cs = emit_preempt_busywait(request, cs);
4045 
4046 	request->tail = intel_ring_offset(request, cs);
4047 	assert_ring_tail_valid(request->ring, request->tail);
4048 
4049 	return gen8_emit_wa_tail(request, cs);
4050 }
4051 
4052 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4053 {
4054 	cs = gen8_emit_ggtt_write(cs,
4055 				  request->fence.seqno,
4056 				  i915_request_active_timeline(request)->hwsp_offset,
4057 				  0);
4058 
4059 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4060 }
4061 
4062 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4063 {
4064 	cs = gen8_emit_pipe_control(cs,
4065 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4066 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4067 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4068 				    0);
4069 
4070 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4071 	cs = gen8_emit_ggtt_write_rcs(cs,
4072 				      request->fence.seqno,
4073 				      i915_request_active_timeline(request)->hwsp_offset,
4074 				      PIPE_CONTROL_FLUSH_ENABLE |
4075 				      PIPE_CONTROL_CS_STALL);
4076 
4077 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4078 }
4079 
4080 static u32 *
4081 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4082 {
4083 	cs = gen8_emit_ggtt_write_rcs(cs,
4084 				      request->fence.seqno,
4085 				      i915_request_active_timeline(request)->hwsp_offset,
4086 				      PIPE_CONTROL_CS_STALL |
4087 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4088 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4089 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4090 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4091 				      PIPE_CONTROL_FLUSH_ENABLE);
4092 
4093 	return gen8_emit_fini_breadcrumb_footer(request, cs);
4094 }
4095 
4096 /*
4097  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4098  * flush and will continue pre-fetching the instructions after it before the
4099  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4100  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4101  * of the next request before the memory has been flushed, we're guaranteed that
4102  * we won't access the batch itself too early.
4103  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4104  * so, if the current request is modifying an instruction in the next request on
4105  * the same intel_context, we might pre-fetch and then execute the pre-update
4106  * instruction. To avoid this, the users of self-modifying code should either
4107  * disable the parser around the code emitting the memory writes, via a new flag
4108  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4109  * the in-kernel use-cases we've opted to use a separate context, see
4110  * reloc_gpu() as an example.
4111  * All the above applies only to the instructions themselves. Non-inline data
4112  * used by the instructions is not pre-fetched.
4113  */
4114 
4115 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4116 {
4117 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4118 		MI_SEMAPHORE_GLOBAL_GTT |
4119 		MI_SEMAPHORE_POLL |
4120 		MI_SEMAPHORE_SAD_EQ_SDD;
4121 	*cs++ = 0;
4122 	*cs++ = intel_hws_preempt_address(request->engine);
4123 	*cs++ = 0;
4124 	*cs++ = 0;
4125 	*cs++ = MI_NOOP;
4126 
4127 	return cs;
4128 }
4129 
4130 static __always_inline u32*
4131 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4132 {
4133 	*cs++ = MI_USER_INTERRUPT;
4134 
4135 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4136 	if (intel_engine_has_semaphores(request->engine))
4137 		cs = gen12_emit_preempt_busywait(request, cs);
4138 
4139 	request->tail = intel_ring_offset(request, cs);
4140 	assert_ring_tail_valid(request->ring, request->tail);
4141 
4142 	return gen8_emit_wa_tail(request, cs);
4143 }
4144 
4145 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4146 {
4147 	cs = gen8_emit_ggtt_write(cs,
4148 				  request->fence.seqno,
4149 				  i915_request_active_timeline(request)->hwsp_offset,
4150 				  0);
4151 
4152 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4153 }
4154 
4155 static u32 *
4156 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4157 {
4158 	cs = gen8_emit_ggtt_write_rcs(cs,
4159 				      request->fence.seqno,
4160 				      i915_request_active_timeline(request)->hwsp_offset,
4161 				      PIPE_CONTROL_CS_STALL |
4162 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4163 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4164 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4165 				      /* Wa_1409600907:tgl */
4166 				      PIPE_CONTROL_DEPTH_STALL |
4167 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4168 				      PIPE_CONTROL_FLUSH_ENABLE |
4169 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4170 
4171 	return gen12_emit_fini_breadcrumb_footer(request, cs);
4172 }
4173 
4174 static void execlists_park(struct intel_engine_cs *engine)
4175 {
4176 	cancel_timer(&engine->execlists.timer);
4177 	cancel_timer(&engine->execlists.preempt);
4178 }
4179 
4180 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4181 {
4182 	engine->submit_request = execlists_submit_request;
4183 	engine->schedule = i915_schedule;
4184 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4185 
4186 	engine->reset.prepare = execlists_reset_prepare;
4187 	engine->reset.rewind = execlists_reset_rewind;
4188 	engine->reset.cancel = execlists_reset_cancel;
4189 	engine->reset.finish = execlists_reset_finish;
4190 
4191 	engine->park = execlists_park;
4192 	engine->unpark = NULL;
4193 
4194 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4195 	if (!intel_vgpu_active(engine->i915)) {
4196 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4197 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4198 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4199 	}
4200 
4201 	if (INTEL_GEN(engine->i915) >= 12)
4202 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4203 
4204 	if (intel_engine_has_preemption(engine))
4205 		engine->emit_bb_start = gen8_emit_bb_start;
4206 	else
4207 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4208 }
4209 
4210 static void execlists_shutdown(struct intel_engine_cs *engine)
4211 {
4212 	/* Synchronise with residual timers and any softirq they raise */
4213 	del_timer_sync(&engine->execlists.timer);
4214 	del_timer_sync(&engine->execlists.preempt);
4215 	tasklet_kill(&engine->execlists.tasklet);
4216 }
4217 
4218 static void execlists_release(struct intel_engine_cs *engine)
4219 {
4220 	execlists_shutdown(engine);
4221 
4222 	intel_engine_cleanup_common(engine);
4223 	lrc_destroy_wa_ctx(engine);
4224 }
4225 
4226 static void
4227 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4228 {
4229 	/* Default vfuncs which can be overridden by each engine. */
4230 
4231 	engine->resume = execlists_resume;
4232 
4233 	engine->cops = &execlists_context_ops;
4234 	engine->request_alloc = execlists_request_alloc;
4235 
4236 	engine->emit_flush = gen8_emit_flush;
4237 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4238 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4239 	if (INTEL_GEN(engine->i915) >= 12)
4240 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4241 
4242 	engine->set_default_submission = intel_execlists_set_default_submission;
4243 
4244 	if (INTEL_GEN(engine->i915) < 11) {
4245 		engine->irq_enable = gen8_logical_ring_enable_irq;
4246 		engine->irq_disable = gen8_logical_ring_disable_irq;
4247 	} else {
4248 		/*
4249 		 * TODO: On Gen11 interrupt masks need to be clear
4250 		 * to allow C6 entry. Keep interrupts enabled at all times
4251 		 * and take the hit of generating extra interrupts
4252 		 * until a more refined solution exists.
4253 		 */
4254 	}
4255 }
4256 
4257 static inline void
4258 logical_ring_default_irqs(struct intel_engine_cs *engine)
4259 {
4260 	unsigned int shift = 0;
4261 
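	/*
	 * Gen11+ programs interrupts through per-class/instance registers,
	 * so no per-engine shift is required; earlier gens pack several
	 * engines into shared GT interrupt registers and need the shift
	 * from the table below.
	 */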
4262 	if (INTEL_GEN(engine->i915) < 11) {
4263 		const u8 irq_shifts[] = {
4264 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4265 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4266 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4267 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4268 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4269 		};
4270 
4271 		shift = irq_shifts[engine->id];
4272 	}
4273 
4274 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4275 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4276 }
4277 
4278 static void rcs_submission_override(struct intel_engine_cs *engine)
4279 {
4280 	switch (INTEL_GEN(engine->i915)) {
4281 	case 12:
4282 		engine->emit_flush = gen12_emit_flush_render;
4283 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4284 		break;
4285 	case 11:
4286 		engine->emit_flush = gen11_emit_flush_render;
4287 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4288 		break;
4289 	default:
4290 		engine->emit_flush = gen8_emit_flush_render;
4291 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4292 		break;
4293 	}
4294 }
4295 
4296 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4297 {
4298 	struct intel_engine_execlists * const execlists = &engine->execlists;
4299 	struct drm_i915_private *i915 = engine->i915;
4300 	struct intel_uncore *uncore = engine->uncore;
4301 	u32 base = engine->mmio_base;
4302 
4303 	tasklet_init(&engine->execlists.tasklet,
4304 		     execlists_submission_tasklet, (unsigned long)engine);
4305 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4306 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4307 
4308 	logical_ring_default_vfuncs(engine);
4309 	logical_ring_default_irqs(engine);
4310 
4311 	if (engine->class == RENDER_CLASS)
4312 		rcs_submission_override(engine);
4313 
4314 	if (intel_init_workaround_bb(engine))
4315 		/*
4316 		 * We continue even if we fail to initialize WA batch
4317 		 * because we only expect rare glitches but nothing
4318 		 * critical to prevent us from using GPU
4319 		 */
4320 		DRM_ERROR("WA batch buffer initialization failed\n");
4321 
4322 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4323 		execlists->submit_reg = uncore->regs +
4324 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4325 		execlists->ctrl_reg = uncore->regs +
4326 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4327 	} else {
4328 		execlists->submit_reg = uncore->regs +
4329 			i915_mmio_reg_offset(RING_ELSP(base));
4330 	}
4331 
4332 	execlists->csb_status =
4333 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4334 
4335 	execlists->csb_write =
4336 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4337 
4338 	if (INTEL_GEN(i915) < 11)
4339 		execlists->csb_size = GEN8_CSB_ENTRIES;
4340 	else
4341 		execlists->csb_size = GEN11_CSB_ENTRIES;
4342 
4343 	reset_csb_pointers(engine);
4344 
4345 	/* Finally, take ownership and responsibility for cleanup! */
4346 	engine->release = execlists_release;
4347 
4348 	return 0;
4349 }
4350 
4351 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4352 {
4353 	u32 indirect_ctx_offset;
4354 
4355 	switch (INTEL_GEN(engine->i915)) {
4356 	default:
4357 		MISSING_CASE(INTEL_GEN(engine->i915));
4358 		/* fall through */
4359 	case 12:
4360 		indirect_ctx_offset =
4361 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4362 		break;
4363 	case 11:
4364 		indirect_ctx_offset =
4365 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4366 		break;
4367 	case 10:
4368 		indirect_ctx_offset =
4369 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4370 		break;
4371 	case 9:
4372 		indirect_ctx_offset =
4373 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4374 		break;
4375 	case 8:
4376 		indirect_ctx_offset =
4377 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4378 		break;
4379 	}
4380 
4381 	return indirect_ctx_offset;
4382 }
4383 
4384 
4385 static void init_common_reg_state(u32 * const regs,
4386 				  const struct intel_engine_cs *engine,
4387 				  const struct intel_ring *ring,
4388 				  bool inhibit)
4389 {
4390 	u32 ctl;
4391 
4392 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4393 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
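	/*
	 * If the image has not been populated from the default state
	 * (inhibit == true), ask the HW not to restore register state
	 * from it on the first submission.
	 */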
4394 	if (inhibit)
4395 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4396 	if (INTEL_GEN(engine->i915) < 11)
4397 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4398 					   CTX_CTRL_RS_CTX_ENABLE);
4399 	regs[CTX_CONTEXT_CONTROL] = ctl;
4400 
4401 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4402 }
4403 
4404 static void init_wa_bb_reg_state(u32 * const regs,
4405 				 const struct intel_engine_cs *engine,
4406 				 u32 pos_bb_per_ctx)
4407 {
4408 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4409 
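	/*
	 * Note: bit 0 of BB_PER_CTX_PTR is understood here to be the
	 * "valid" flag, and the low bits of INDIRECT_CTX carry the batch
	 * size in cachelines; the INDIRECT_CTX_OFFSET field starts at
	 * bit 6, hence the shift below.
	 */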
4410 	if (wa_ctx->per_ctx.size) {
4411 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4412 
4413 		regs[pos_bb_per_ctx] =
4414 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4415 	}
4416 
4417 	if (wa_ctx->indirect_ctx.size) {
4418 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4419 
4420 		regs[pos_bb_per_ctx + 2] =
4421 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4422 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4423 
4424 		regs[pos_bb_per_ctx + 4] =
4425 			intel_lr_indirect_ctx_offset(engine) << 6;
4426 	}
4427 }
4428 
4429 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4430 {
4431 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4432 		/* 64b PPGTT (48bit canonical)
4433 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4434 		 * other PDP Descriptors are ignored.
4435 		 */
4436 		ASSIGN_CTX_PML4(ppgtt, regs);
4437 	} else {
4438 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4439 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4440 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4441 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4442 	}
4443 }
4444 
4445 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4446 {
4447 	if (i915_is_ggtt(vm))
4448 		return i915_vm_to_ggtt(vm)->alias;
4449 	else
4450 		return i915_vm_to_ppgtt(vm);
4451 }
4452 
4453 static void execlists_init_reg_state(u32 *regs,
4454 				     const struct intel_context *ce,
4455 				     const struct intel_engine_cs *engine,
4456 				     const struct intel_ring *ring,
4457 				     bool inhibit)
4458 {
4459 	/*
4460 	 * A context is actually a big batch buffer with several
4461 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4462 	 * values we are setting here are only for the first context restore:
4463 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4464 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4465 	 * we are not initializing here).
4466 	 *
4467 	 * Must keep consistent with virtual_update_register_offsets().
4468 	 */
4469 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4470 
4471 	init_common_reg_state(regs, engine, ring, inhibit);
4472 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4473 
4474 	init_wa_bb_reg_state(regs, engine,
4475 			     INTEL_GEN(engine->i915) >= 12 ?
4476 			     GEN12_CTX_BB_PER_CTX_PTR :
4477 			     CTX_BB_PER_CTX_PTR);
4478 
4479 	__reset_stop_ring(regs, engine);
4480 }
4481 
4482 static int
4483 populate_lr_context(struct intel_context *ce,
4484 		    struct drm_i915_gem_object *ctx_obj,
4485 		    struct intel_engine_cs *engine,
4486 		    struct intel_ring *ring)
4487 {
4488 	bool inhibit = true;
4489 	void *vaddr;
4490 	int ret;
4491 
4492 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4493 	if (IS_ERR(vaddr)) {
4494 		ret = PTR_ERR(vaddr);
4495 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4496 		return ret;
4497 	}
4498 
4499 	set_redzone(vaddr, engine);
4500 
4501 	if (engine->default_state) {
4502 		void *defaults;
4503 
4504 		defaults = i915_gem_object_pin_map(engine->default_state,
4505 						   I915_MAP_WB);
4506 		if (IS_ERR(defaults)) {
4507 			ret = PTR_ERR(defaults);
4508 			goto err_unpin_ctx;
4509 		}
4510 
4511 		memcpy(vaddr, defaults, engine->context_size);
4512 		i915_gem_object_unpin_map(engine->default_state);
4513 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4514 		inhibit = false;
4515 	}
4516 
4517 	/* The second page of the context object contains some fields which must
4518 	 * be set up prior to the first execution. */
4519 	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4520 				 ce, engine, ring, inhibit);
4521 
4522 	ret = 0;
4523 err_unpin_ctx:
4524 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4525 	i915_gem_object_unpin_map(ctx_obj);
4526 	return ret;
4527 }
4528 
4529 static int __execlists_context_alloc(struct intel_context *ce,
4530 				     struct intel_engine_cs *engine)
4531 {
4532 	struct drm_i915_gem_object *ctx_obj;
4533 	struct intel_ring *ring;
4534 	struct i915_vma *vma;
4535 	u32 context_size;
4536 	int ret;
4537 
4538 	GEM_BUG_ON(ce->state);
4539 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4540 
4541 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4542 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4543 
4544 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4545 	if (IS_ERR(ctx_obj))
4546 		return PTR_ERR(ctx_obj);
4547 
4548 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4549 	if (IS_ERR(vma)) {
4550 		ret = PTR_ERR(vma);
4551 		goto error_deref_obj;
4552 	}
4553 
4554 	if (!ce->timeline) {
4555 		struct intel_timeline *tl;
4556 
4557 		tl = intel_timeline_create(engine->gt, NULL);
4558 		if (IS_ERR(tl)) {
4559 			ret = PTR_ERR(tl);
4560 			goto error_deref_obj;
4561 		}
4562 
4563 		ce->timeline = tl;
4564 	}
4565 
4566 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4567 	if (IS_ERR(ring)) {
4568 		ret = PTR_ERR(ring);
4569 		goto error_deref_obj;
4570 	}
4571 
4572 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4573 	if (ret) {
4574 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4575 		goto error_ring_free;
4576 	}
4577 
4578 	ce->ring = ring;
4579 	ce->state = vma;
4580 
4581 	return 0;
4582 
4583 error_ring_free:
4584 	intel_ring_put(ring);
4585 error_deref_obj:
4586 	i915_gem_object_put(ctx_obj);
4587 	return ret;
4588 }
4589 
4590 static struct list_head *virtual_queue(struct virtual_engine *ve)
4591 {
4592 	return &ve->base.execlists.default_priolist.requests[0];
4593 }
4594 
4595 static void virtual_context_destroy(struct kref *kref)
4596 {
4597 	struct virtual_engine *ve =
4598 		container_of(kref, typeof(*ve), context.ref);
4599 	unsigned int n;
4600 
4601 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4602 	GEM_BUG_ON(ve->request);
4603 	GEM_BUG_ON(ve->context.inflight);
4604 
4605 	for (n = 0; n < ve->num_siblings; n++) {
4606 		struct intel_engine_cs *sibling = ve->siblings[n];
4607 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4608 		unsigned long flags;
4609 
4610 		if (RB_EMPTY_NODE(node))
4611 			continue;
4612 
4613 		spin_lock_irqsave(&sibling->active.lock, flags);
4614 
4615 		/* Detachment is lazily performed in the execlists tasklet */
4616 		if (!RB_EMPTY_NODE(node))
4617 			rb_erase_cached(node, &sibling->execlists.virtual);
4618 
4619 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4620 	}
4621 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4622 
4623 	if (ve->context.state)
4624 		__execlists_context_fini(&ve->context);
4625 	intel_context_fini(&ve->context);
4626 
4627 	kfree(ve->bonds);
4628 	kfree(ve);
4629 }
4630 
4631 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4632 {
4633 	int swp;
4634 
4635 	/*
4636 	 * Pick a random sibling on starting to help spread the load around.
4637 	 *
4638 	 * New contexts are typically created with exactly the same order
4639 	 * of siblings, and often started in batches. Due to the way we iterate
4640 	 * the array of siblings when submitting requests, sibling[0] is
4641 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4642 	 * randomised across the system, we also help spread the load by the
4643 	 * first engine we inspect being different each time.
4644 	 *
4645 	 * NB This does not force us to execute on this engine, it will just
4646 	 * typically be the first we inspect for submission.
4647 	 */
4648 	swp = prandom_u32_max(ve->num_siblings);
4649 	if (!swp)
4650 		return;
4651 
4652 	swap(ve->siblings[swp], ve->siblings[0]);
4653 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4654 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4655 						ve->siblings[0]);
4656 }
4657 
4658 static int virtual_context_alloc(struct intel_context *ce)
4659 {
4660 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4661 
4662 	return __execlists_context_alloc(ce, ve->siblings[0]);
4663 }
4664 
4665 static int virtual_context_pin(struct intel_context *ce)
4666 {
4667 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4668 	int err;
4669 
4670 	/* Note: we must use a real engine class for setting up reg state */
4671 	err = __execlists_context_pin(ce, ve->siblings[0]);
4672 	if (err)
4673 		return err;
4674 
4675 	virtual_engine_initial_hint(ve);
4676 	return 0;
4677 }
4678 
4679 static void virtual_context_enter(struct intel_context *ce)
4680 {
4681 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4682 	unsigned int n;
4683 
4684 	for (n = 0; n < ve->num_siblings; n++)
4685 		intel_engine_pm_get(ve->siblings[n]);
4686 
4687 	intel_timeline_enter(ce->timeline);
4688 }
4689 
4690 static void virtual_context_exit(struct intel_context *ce)
4691 {
4692 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4693 	unsigned int n;
4694 
4695 	intel_timeline_exit(ce->timeline);
4696 
4697 	for (n = 0; n < ve->num_siblings; n++)
4698 		intel_engine_pm_put(ve->siblings[n]);
4699 }
4700 
4701 static const struct intel_context_ops virtual_context_ops = {
4702 	.alloc = virtual_context_alloc,
4703 
4704 	.pin = virtual_context_pin,
4705 	.unpin = execlists_context_unpin,
4706 
4707 	.enter = virtual_context_enter,
4708 	.exit = virtual_context_exit,
4709 
4710 	.destroy = virtual_context_destroy,
4711 };
4712 
4713 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4714 {
4715 	struct i915_request *rq;
4716 	intel_engine_mask_t mask;
4717 
4718 	rq = READ_ONCE(ve->request);
4719 	if (!rq)
4720 		return 0;
4721 
4722 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4723 	mask = rq->execution_mask;
4724 	if (unlikely(!mask)) {
4725 		/* Invalid selection, submit to a random engine in error */
4726 		i915_request_skip(rq, -ENODEV);
4727 		mask = ve->siblings[0]->mask;
4728 	}
4729 
4730 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4731 		     rq->fence.context, rq->fence.seqno,
4732 		     mask, ve->base.execlists.queue_priority_hint);
4733 
4734 	return mask;
4735 }
4736 
4737 static void virtual_submission_tasklet(unsigned long data)
4738 {
4739 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4740 	const int prio = ve->base.execlists.queue_priority_hint;
4741 	intel_engine_mask_t mask;
4742 	unsigned int n;
4743 
4744 	rcu_read_lock();
4745 	mask = virtual_submission_mask(ve);
4746 	rcu_read_unlock();
4747 	if (unlikely(!mask))
4748 		return;
4749 
4750 	local_irq_disable();
4751 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4752 		struct intel_engine_cs *sibling = ve->siblings[n];
4753 		struct ve_node * const node = &ve->nodes[sibling->id];
4754 		struct rb_node **parent, *rb;
4755 		bool first;
4756 
4757 		if (unlikely(!(mask & sibling->mask))) {
4758 			if (!RB_EMPTY_NODE(&node->rb)) {
4759 				spin_lock(&sibling->active.lock);
4760 				rb_erase_cached(&node->rb,
4761 						&sibling->execlists.virtual);
4762 				RB_CLEAR_NODE(&node->rb);
4763 				spin_unlock(&sibling->active.lock);
4764 			}
4765 			continue;
4766 		}
4767 
4768 		spin_lock(&sibling->active.lock);
4769 
4770 		if (!RB_EMPTY_NODE(&node->rb)) {
4771 			/*
4772 			 * Cheat and avoid rebalancing the tree if we can
4773 			 * reuse this node in situ.
4774 			 */
4775 			first = rb_first_cached(&sibling->execlists.virtual) ==
4776 				&node->rb;
4777 			if (prio == node->prio || (prio > node->prio && first))
4778 				goto submit_engine;
4779 
4780 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4781 		}
4782 
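		/*
		 * (Re)insert this node into the sibling's tree of virtual
		 * requests, ordered by descending priority so that the
		 * sibling's submission tasklet dequeues the highest first.
		 */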
4783 		rb = NULL;
4784 		first = true;
4785 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4786 		while (*parent) {
4787 			struct ve_node *other;
4788 
4789 			rb = *parent;
4790 			other = rb_entry(rb, typeof(*other), rb);
4791 			if (prio > other->prio) {
4792 				parent = &rb->rb_left;
4793 			} else {
4794 				parent = &rb->rb_right;
4795 				first = false;
4796 			}
4797 		}
4798 
4799 		rb_link_node(&node->rb, rb, parent);
4800 		rb_insert_color_cached(&node->rb,
4801 				       &sibling->execlists.virtual,
4802 				       first);
4803 
4804 submit_engine:
4805 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4806 		node->prio = prio;
4807 		if (first && prio > sibling->execlists.queue_priority_hint) {
4808 			sibling->execlists.queue_priority_hint = prio;
4809 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4810 		}
4811 
4812 		spin_unlock(&sibling->active.lock);
4813 	}
4814 	local_irq_enable();
4815 }
4816 
4817 static void virtual_submit_request(struct i915_request *rq)
4818 {
4819 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4820 	struct i915_request *old;
4821 	unsigned long flags;
4822 
4823 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4824 		     rq->fence.context,
4825 		     rq->fence.seqno);
4826 
4827 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4828 
4829 	spin_lock_irqsave(&ve->base.active.lock, flags);
4830 
4831 	old = ve->request;
4832 	if (old) { /* background completion event from preempt-to-busy */
4833 		GEM_BUG_ON(!i915_request_completed(old));
4834 		__i915_request_submit(old);
4835 		i915_request_put(old);
4836 	}
4837 
4838 	if (i915_request_completed(rq)) {
4839 		__i915_request_submit(rq);
4840 
4841 		ve->base.execlists.queue_priority_hint = INT_MIN;
4842 		ve->request = NULL;
4843 	} else {
4844 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4845 		ve->request = i915_request_get(rq);
4846 
4847 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4848 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4849 
4850 		tasklet_schedule(&ve->base.execlists.tasklet);
4851 	}
4852 
4853 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4854 }
4855 
4856 static struct ve_bond *
4857 virtual_find_bond(struct virtual_engine *ve,
4858 		  const struct intel_engine_cs *master)
4859 {
4860 	int i;
4861 
4862 	for (i = 0; i < ve->num_bonds; i++) {
4863 		if (ve->bonds[i].master == master)
4864 			return &ve->bonds[i];
4865 	}
4866 
4867 	return NULL;
4868 }
4869 
4870 static void
4871 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4872 {
4873 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4874 	intel_engine_mask_t allowed, exec;
4875 	struct ve_bond *bond;
4876 
4877 	allowed = ~to_request(signal)->engine->mask;
4878 
4879 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4880 	if (bond)
4881 		allowed &= bond->sibling_mask;
4882 
4883 	/* Restrict the bonded request to run on only the available engines */
4884 	exec = READ_ONCE(rq->execution_mask);
4885 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4886 		;
4887 
4888 	/* Prevent the master from being re-run on the bonded engines */
4889 	to_request(signal)->execution_mask &= ~allowed;
4890 }
4891 
4892 struct intel_context *
4893 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4894 			       unsigned int count)
4895 {
4896 	struct virtual_engine *ve;
4897 	unsigned int n;
4898 	int err;
4899 
4900 	if (count == 0)
4901 		return ERR_PTR(-EINVAL);
4902 
4903 	if (count == 1)
4904 		return intel_context_create(siblings[0]);
4905 
4906 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4907 	if (!ve)
4908 		return ERR_PTR(-ENOMEM);
4909 
4910 	ve->base.i915 = siblings[0]->i915;
4911 	ve->base.gt = siblings[0]->gt;
4912 	ve->base.uncore = siblings[0]->uncore;
4913 	ve->base.id = -1;
4914 
4915 	ve->base.class = OTHER_CLASS;
4916 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4917 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4918 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4919 
4920 	/*
4921 	 * The decision on whether to submit a request using semaphores
4922 	 * depends on the saturated state of the engine. We only compute
4923 	 * this during HW submission of the request, and we need this
4924 	 * state to be globally applied to all requests being submitted
4925 	 * to this engine. Virtual engines encompass more than one physical
4926 	 * engine and so we cannot accurately tell in advance if one of those
4927 	 * engines is already saturated and so cannot afford to use a semaphore
4928 	 * and be pessimized in priority for doing so -- if we are the only
4929 	 * context using semaphores after all other clients have stopped, we
4930 	 * will be starved on the saturated system. Such a global switch for
4931 	 * semaphores is less than ideal, but alas is the current compromise.
4932 	 */
4933 	ve->base.saturated = ALL_ENGINES;
4934 
4935 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4936 
4937 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4938 	intel_engine_init_breadcrumbs(&ve->base);
4939 	intel_engine_init_execlists(&ve->base);
4940 
4941 	ve->base.cops = &virtual_context_ops;
4942 	ve->base.request_alloc = execlists_request_alloc;
4943 
4944 	ve->base.schedule = i915_schedule;
4945 	ve->base.submit_request = virtual_submit_request;
4946 	ve->base.bond_execute = virtual_bond_execute;
4947 
4948 	INIT_LIST_HEAD(virtual_queue(ve));
4949 	ve->base.execlists.queue_priority_hint = INT_MIN;
4950 	tasklet_init(&ve->base.execlists.tasklet,
4951 		     virtual_submission_tasklet,
4952 		     (unsigned long)ve);
4953 
4954 	intel_context_init(&ve->context, &ve->base);
4955 
4956 	for (n = 0; n < count; n++) {
4957 		struct intel_engine_cs *sibling = siblings[n];
4958 
4959 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4960 		if (sibling->mask & ve->base.mask) {
4961 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4962 				  sibling->name);
4963 			err = -EINVAL;
4964 			goto err_put;
4965 		}
4966 
4967 		/*
4968 		 * The virtual engine implementation is tightly coupled to
4969 	 * the execlists backend -- we push out requests directly
4970 		 * into a tree inside each physical engine. We could support
4971 		 * layering if we handle cloning of the requests and
4972 		 * submitting a copy into each backend.
4973 		 */
4974 		if (sibling->execlists.tasklet.func !=
4975 		    execlists_submission_tasklet) {
4976 			err = -ENODEV;
4977 			goto err_put;
4978 		}
4979 
4980 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4981 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4982 
4983 		ve->siblings[ve->num_siblings++] = sibling;
4984 		ve->base.mask |= sibling->mask;
4985 
4986 		/*
4987 		 * All physical engines must be compatible for their emission
4988 		 * functions (as we build the instructions during request
4989 		 * construction and do not alter them before submission
4990 		 * on the physical engine). We use the engine class as a guide
4991 		 * here, although that could be refined.
4992 		 */
4993 		if (ve->base.class != OTHER_CLASS) {
4994 			if (ve->base.class != sibling->class) {
4995 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4996 					  sibling->class, ve->base.class);
4997 				err = -EINVAL;
4998 				goto err_put;
4999 			}
5000 			continue;
5001 		}
5002 
5003 		ve->base.class = sibling->class;
5004 		ve->base.uabi_class = sibling->uabi_class;
5005 		snprintf(ve->base.name, sizeof(ve->base.name),
5006 			 "v%dx%d", ve->base.class, count);
5007 		ve->base.context_size = sibling->context_size;
5008 
5009 		ve->base.emit_bb_start = sibling->emit_bb_start;
5010 		ve->base.emit_flush = sibling->emit_flush;
5011 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5012 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5013 		ve->base.emit_fini_breadcrumb_dw =
5014 			sibling->emit_fini_breadcrumb_dw;
5015 
5016 		ve->base.flags = sibling->flags;
5017 	}
5018 
5019 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5020 
5021 	return &ve->context;
5022 
5023 err_put:
5024 	intel_context_put(&ve->context);
5025 	return ERR_PTR(err);
5026 }
5027 
5028 struct intel_context *
5029 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5030 {
5031 	struct virtual_engine *se = to_virtual_engine(src);
5032 	struct intel_context *dst;
5033 
5034 	dst = intel_execlists_create_virtual(se->siblings,
5035 					     se->num_siblings);
5036 	if (IS_ERR(dst))
5037 		return dst;
5038 
5039 	if (se->num_bonds) {
5040 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5041 
5042 		de->bonds = kmemdup(se->bonds,
5043 				    sizeof(*se->bonds) * se->num_bonds,
5044 				    GFP_KERNEL);
5045 		if (!de->bonds) {
5046 			intel_context_put(dst);
5047 			return ERR_PTR(-ENOMEM);
5048 		}
5049 
5050 		de->num_bonds = se->num_bonds;
5051 	}
5052 
5053 	return dst;
5054 }
5055 
5056 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5057 				     const struct intel_engine_cs *master,
5058 				     const struct intel_engine_cs *sibling)
5059 {
5060 	struct virtual_engine *ve = to_virtual_engine(engine);
5061 	struct ve_bond *bond;
5062 	int n;
5063 
5064 	/* Sanity check the sibling is part of the virtual engine */
5065 	for (n = 0; n < ve->num_siblings; n++)
5066 		if (sibling == ve->siblings[n])
5067 			break;
5068 	if (n == ve->num_siblings)
5069 		return -EINVAL;
5070 
5071 	bond = virtual_find_bond(ve, master);
5072 	if (bond) {
5073 		bond->sibling_mask |= sibling->mask;
5074 		return 0;
5075 	}
5076 
5077 	bond = krealloc(ve->bonds,
5078 			sizeof(*bond) * (ve->num_bonds + 1),
5079 			GFP_KERNEL);
5080 	if (!bond)
5081 		return -ENOMEM;
5082 
5083 	bond[ve->num_bonds].master = master;
5084 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5085 
5086 	ve->bonds = bond;
5087 	ve->num_bonds++;
5088 
5089 	return 0;
5090 }
5091 
5092 struct intel_engine_cs *
5093 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5094 				 unsigned int sibling)
5095 {
5096 	struct virtual_engine *ve = to_virtual_engine(engine);
5097 
5098 	if (sibling >= ve->num_siblings)
5099 		return NULL;
5100 
5101 	return ve->siblings[sibling];
5102 }
5103 
5104 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5105 				   struct drm_printer *m,
5106 				   void (*show_request)(struct drm_printer *m,
5107 							struct i915_request *rq,
5108 							const char *prefix),
5109 				   unsigned int max)
5110 {
5111 	const struct intel_engine_execlists *execlists = &engine->execlists;
5112 	struct i915_request *rq, *last;
5113 	unsigned long flags;
5114 	unsigned int count;
5115 	struct rb_node *rb;
5116 
5117 	spin_lock_irqsave(&engine->active.lock, flags);
5118 
5119 	last = NULL;
5120 	count = 0;
5121 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5122 		if (count++ < max - 1)
5123 			show_request(m, rq, "\t\tE ");
5124 		else
5125 			last = rq;
5126 	}
5127 	if (last) {
5128 		if (count > max) {
5129 			drm_printf(m,
5130 				   "\t\t...skipping %d executing requests...\n",
5131 				   count - max);
5132 		}
5133 		show_request(m, last, "\t\tE ");
5134 	}
5135 
5136 	last = NULL;
5137 	count = 0;
5138 	if (execlists->queue_priority_hint != INT_MIN)
5139 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5140 			   execlists->queue_priority_hint);
5141 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5142 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5143 		int i;
5144 
5145 		priolist_for_each_request(rq, p, i) {
5146 			if (count++ < max - 1)
5147 				show_request(m, rq, "\t\tQ ");
5148 			else
5149 				last = rq;
5150 		}
5151 	}
5152 	if (last) {
5153 		if (count > max) {
5154 			drm_printf(m,
5155 				   "\t\t...skipping %d queued requests...\n",
5156 				   count - max);
5157 		}
5158 		show_request(m, last, "\t\tQ ");
5159 	}
5160 
5161 	last = NULL;
5162 	count = 0;
5163 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5164 		struct virtual_engine *ve =
5165 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5166 		struct i915_request *rq = READ_ONCE(ve->request);
5167 
5168 		if (rq) {
5169 			if (count++ < max - 1)
5170 				show_request(m, rq, "\t\tV ");
5171 			else
5172 				last = rq;
5173 		}
5174 	}
5175 	if (last) {
5176 		if (count > max) {
5177 			drm_printf(m,
5178 				   "\t\t...skipping %d virtual requests...\n",
5179 				   count - max);
5180 		}
5181 		show_request(m, last, "\t\tV ");
5182 	}
5183 
5184 	spin_unlock_irqrestore(&engine->active.lock, flags);
5185 }
5186 
5187 void intel_lr_context_reset(struct intel_engine_cs *engine,
5188 			    struct intel_context *ce,
5189 			    u32 head,
5190 			    bool scrub)
5191 {
5192 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5193 
5194 	/*
5195 	 * We want a simple context + ring to execute the breadcrumb update.
5196 	 * We cannot rely on the context being intact across the GPU hang,
5197 	 * so clear it and rebuild just what we need for the breadcrumb.
5198 	 * All pending requests for this context will be zapped, and any
5199 	 * future request will be after userspace has had the opportunity
5200 	 * to recreate its own state.
5201 	 */
5202 	if (scrub)
5203 		restore_default_state(ce, engine);
5204 
5205 	/* Rerun the request; its payload has been neutered (if guilty). */
5206 	__execlists_update_reg_state(ce, engine, head);
5207 }
5208 
5209 bool
5210 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5211 {
5212 	return engine->set_default_submission ==
5213 	       intel_execlists_set_default_submission;
5214 }
5215 
5216 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5217 #include "selftest_lrc.c"
5218 #endif
5219