xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 323dd2c3)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
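 * As a worked example of the rule above (the request names are purely
 * illustrative): if the queue holds A1, A2 and B1, where A and B are two
 * different contexts, A1 is discarded in favour of A2, whose recorded tail
 * already covers A1's commands, and the list submitted to the ELSP is
 * (A2, B1). Had the queue held only A1 and A2, the submitted list would
 * have been (A2, NULL).
 *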
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "gem/i915_gem_context.h"
137 
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 
151 #define RING_EXECLIST_QFULL		(1 << 0x2)
152 #define RING_EXECLIST1_VALID		(1 << 0x3)
153 #define RING_EXECLIST0_VALID		(1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157 
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164 
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID		0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 #define WA_TAIL_DWORDS 2
180 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of the sibling_mask physical engines; see the example below.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[0];
222 };
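
/*
 * An illustrative bond (the engine names here are hypothetical): a ve_bond
 * of { .master = vcs0, .sibling_mask = BIT(VCS1) } means that any request
 * on this virtual engine which waits on a submit-fence from a request
 * running on vcs0 may only be placed onto vcs1, rather than onto whichever
 * sibling happens to be idle first.
 */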
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine);
241 
242 static void mark_eio(struct i915_request *rq)
243 {
244 	if (i915_request_completed(rq))
245 		return;
246 
247 	GEM_BUG_ON(i915_request_signaled(rq));
248 
249 	dma_fence_set_error(&rq->fence, -EIO);
250 	i915_request_mark_complete(rq);
251 }
252 
253 static struct i915_request *
254 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
255 {
256 	struct i915_request *active = rq;
257 
258 	rcu_read_lock();
259 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
260 		if (i915_request_completed(rq))
261 			break;
262 
263 		active = rq;
264 	}
265 	rcu_read_unlock();
266 
267 	return active;
268 }
269 
270 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
271 {
272 	return (i915_ggtt_offset(engine->status_page.vma) +
273 		I915_GEM_HWS_PREEMPT_ADDR);
274 }
275 
276 static inline void
277 ring_set_paused(const struct intel_engine_cs *engine, int state)
278 {
279 	/*
280 	 * We inspect HWS_PREEMPT with a semaphore inside
281 	 * engine->emit_fini_breadcrumb. If the dword is true,
282 	 * the ring is paused as the semaphore will busywait
283 	 * until the dword is false.
284 	 */
285 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
286 	if (state)
287 		wmb();
288 }
289 
290 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
291 {
292 	return rb_entry(rb, struct i915_priolist, node);
293 }
294 
295 static inline int rq_prio(const struct i915_request *rq)
296 {
297 	return rq->sched.attr.priority;
298 }
299 
300 static int effective_prio(const struct i915_request *rq)
301 {
302 	int prio = rq_prio(rq);
303 
304 	/*
305 	 * If this request is special and must not be interrupted at any
306 	 * cost, so be it. Note we are only checking the most recent request
307 	 * in the context and so may be masking an earlier vip request. It
308 	 * is hoped that under the conditions where nopreempt is used, this
309 	 * will not matter (i.e. all requests to that context will be
310 	 * nopreempt for as long as desired).
311 	 */
312 	if (i915_request_has_nopreempt(rq))
313 		prio = I915_PRIORITY_UNPREEMPTABLE;
314 
315 	/*
316 	 * On unwinding the active request, we give it a priority bump
317 	 * if it has completed waiting on any semaphore. If we know that
318 	 * the request has already started, we can prevent an unwanted
319 	 * preempt-to-idle cycle by taking that into account now.
320 	 */
321 	if (__i915_request_has_started(rq))
322 		prio |= I915_PRIORITY_NOSEMAPHORE;
323 
324 	/* Restrict mere WAIT boosts from triggering preemption */
325 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
326 	return prio | __NO_PREEMPTION;
327 }
328 
329 static int queue_prio(const struct intel_engine_execlists *execlists)
330 {
331 	struct i915_priolist *p;
332 	struct rb_node *rb;
333 
334 	rb = rb_first_cached(&execlists->queue);
335 	if (!rb)
336 		return INT_MIN;
337 
338 	/*
339 	 * As the priolist[] entries are inverted, with the highest priority in [0],
340 	 * we have to flip the index value to recover the priority.
341 	 */
342 	p = to_priolist(rb);
343 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
344 }
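
/*
 * Worked example of the flip above: if the sublist holding the highest
 * internal level is occupied (bit 0 of p->used set), ffs(p->used) == 1 and
 * queue_prio() returns ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - 1,
 * i.e. the user priority level combined with the largest internal
 * adjustment.
 */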
345 
346 static inline bool need_preempt(const struct intel_engine_cs *engine,
347 				const struct i915_request *rq,
348 				struct rb_node *rb)
349 {
350 	int last_prio;
351 
352 	if (!intel_engine_has_semaphores(engine))
353 		return false;
354 
355 	/*
356 	 * Check if the current priority hint merits a preemption attempt.
357 	 *
358 	 * We record the highest priority value we saw during rescheduling
359 	 * prior to this dequeue; therefore we know that if it is strictly
360 	 * less than the current tail of ELSP[0], we do not need to force
361 	 * a preempt-to-idle cycle.
362 	 *
363 	 * However, the priority hint is a mere hint that we may need to
364 	 * preempt. If that hint is stale or we may be trying to preempt
365 	 * ourselves, ignore the request.
366 	 *
367 	 * More naturally we would write
368 	 *      prio >= max(0, last);
369 	 * except that we wish to prevent triggering preemption at the same
370 	 * priority level: the task that is running should remain running
371 	 * to preserve FIFO ordering of dependencies.
372 	 */
373 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
374 	if (engine->execlists.queue_priority_hint <= last_prio)
375 		return false;
376 
377 	/*
378 	 * Check against the first request in ELSP[1], it will, thanks to the
379 	 * power of PI, be the highest priority of that context.
380 	 */
381 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
382 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
383 		return true;
384 
385 	if (rb) {
386 		struct virtual_engine *ve =
387 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
388 		bool preempt = false;
389 
390 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
391 			struct i915_request *next;
392 
393 			rcu_read_lock();
394 			next = READ_ONCE(ve->request);
395 			if (next)
396 				preempt = rq_prio(next) > last_prio;
397 			rcu_read_unlock();
398 		}
399 
400 		if (preempt)
401 			return preempt;
402 	}
403 
404 	/*
405 	 * If the inflight context did not trigger the preemption, then maybe
406 	 * it was the set of queued requests? Pick the highest priority in
407 	 * the queue (the first active priolist) and see if it deserves to be
408 	 * running instead of ELSP[0].
409 	 *
410 	 * The highest priority request in the queue cannot be either
411 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
412 	 * context, its priority would not exceed ELSP[0] aka last_prio.
413 	 */
414 	return queue_prio(&engine->execlists) > last_prio;
415 }
416 
417 __maybe_unused static inline bool
418 assert_priority_queue(const struct i915_request *prev,
419 		      const struct i915_request *next)
420 {
421 	/*
422 	 * Without preemption, the prev may refer to the still active element
423 	 * which we refuse to let go.
424 	 *
425 	 * Even with preemption, there are times when we think it is better not
426 	 * to preempt and leave an ostensibly lower priority request in flight.
427 	 */
428 	if (i915_request_is_active(prev))
429 		return true;
430 
431 	return rq_prio(prev) >= rq_prio(next);
432 }
433 
434 /*
435  * The context descriptor encodes various attributes of a context,
436  * including its GTT address and some flags. Because it's fairly
437  * expensive to calculate, we'll just do it once and cache the result,
438  * which remains valid until the context is unpinned.
439  *
440  * This is what a descriptor looks like, from LSB to MSB::
441  *
442  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
443  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
444  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
445  *      bits 53-54:    mbz, reserved for use by hardware
446  *      bits 55-63:    group ID, currently unused and set to 0
447  *
448  * Starting from Gen11, the upper dword of the descriptor has a new format:
449  *
450  *      bits 32-36:    reserved
451  *      bits 37-47:    SW context ID
452  *      bits 48-53:    engine instance
453  *      bit 54:        mbz, reserved for use by hardware
454  *      bits 55-60:    SW counter
455  *      bits 61-63:    engine class
456  *
457  * engine info, SW context ID and SW counter need to form a unique number
458  * (Context ID) per lrc.
459  */
460 static u64
461 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
462 {
463 	u64 desc;
464 
465 	desc = INTEL_LEGACY_32B_CONTEXT;
466 	if (i915_vm_is_4lvl(ce->vm))
467 		desc = INTEL_LEGACY_64B_CONTEXT;
468 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
469 
470 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
471 	if (IS_GEN(engine->i915, 8))
472 		desc |= GEN8_CTX_L3LLC_COHERENT;
473 
474 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
475 	/*
476 	 * The following 32 bits are copied into the OA reports (dword 2).
477 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
478 	 * anything below.
479 	 */
480 	if (INTEL_GEN(engine->i915) >= 11) {
481 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
482 								/* bits 48-53 */
483 
484 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
485 								/* bits 61-63 */
486 	}
487 
488 	return desc;
489 }
490 
491 static u32 *set_offsets(u32 *regs,
492 			const u8 *data,
493 			const struct intel_engine_cs *engine)
494 #define NOP(x) (BIT(7) | (x))
495 #define LRI(count, flags) ((flags) << 6 | (count))
496 #define POSTED BIT(0)
497 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
498 #define REG16(x) \
499 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
500 	(((x) >> 2) & 0x7f)
501 #define END() 0
502 {
503 	const u32 base = engine->mmio_base;
504 
505 	while (*data) {
506 		u8 count, flags;
507 
508 		if (*data & BIT(7)) { /* skip */
509 			regs += *data++ & ~BIT(7);
510 			continue;
511 		}
512 
513 		count = *data & 0x3f;
514 		flags = *data >> 6;
515 		data++;
516 
517 		*regs = MI_LOAD_REGISTER_IMM(count);
518 		if (flags & POSTED)
519 			*regs |= MI_LRI_FORCE_POSTED;
520 		if (INTEL_GEN(engine->i915) >= 11)
521 			*regs |= MI_LRI_CS_MMIO;
522 		regs++;
523 
524 		GEM_BUG_ON(!count);
525 		do {
526 			u32 offset = 0;
527 			u8 v;
528 
529 			do {
530 				v = *data++;
531 				offset <<= 7;
532 				offset |= v & ~BIT(7);
533 			} while (v & BIT(7));
534 
535 			*regs = base + (offset << 2);
536 			regs += 2;
537 		} while (--count);
538 	}
539 
540 	return regs;
541 }
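
/*
 * Illustrative decode of the table format consumed by set_offsets(), using
 * a made-up table (the real tables follow below). With base equal to
 * engine->mmio_base, the sequence
 *
 *	NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034), END()
 *
 * expands to
 *
 *	regs[0]: skipped
 *	regs[1]: MI_LOAD_REGISTER_IMM(2) | MI_LRI_FORCE_POSTED
 *	         (| MI_LRI_CS_MMIO on Gen11+)
 *	regs[2]: base + 0x244
 *	regs[3]: untouched (value slot for the LRI)
 *	regs[4]: base + 0x034
 *	regs[5]: untouched (value slot for the LRI)
 *
 * i.e. only the LRI headers and register offsets are written here; the
 * register values themselves are filled in elsewhere.
 */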
542 
543 static const u8 gen8_xcs_offsets[] = {
544 	NOP(1),
545 	LRI(11, 0),
546 	REG16(0x244),
547 	REG(0x034),
548 	REG(0x030),
549 	REG(0x038),
550 	REG(0x03c),
551 	REG(0x168),
552 	REG(0x140),
553 	REG(0x110),
554 	REG(0x11c),
555 	REG(0x114),
556 	REG(0x118),
557 
558 	NOP(9),
559 	LRI(9, 0),
560 	REG16(0x3a8),
561 	REG16(0x28c),
562 	REG16(0x288),
563 	REG16(0x284),
564 	REG16(0x280),
565 	REG16(0x27c),
566 	REG16(0x278),
567 	REG16(0x274),
568 	REG16(0x270),
569 
570 	NOP(13),
571 	LRI(2, 0),
572 	REG16(0x200),
573 	REG(0x028),
574 
575 	END(),
576 };
577 
578 static const u8 gen9_xcs_offsets[] = {
579 	NOP(1),
580 	LRI(14, POSTED),
581 	REG16(0x244),
582 	REG(0x034),
583 	REG(0x030),
584 	REG(0x038),
585 	REG(0x03c),
586 	REG(0x168),
587 	REG(0x140),
588 	REG(0x110),
589 	REG(0x11c),
590 	REG(0x114),
591 	REG(0x118),
592 	REG(0x1c0),
593 	REG(0x1c4),
594 	REG(0x1c8),
595 
596 	NOP(3),
597 	LRI(9, POSTED),
598 	REG16(0x3a8),
599 	REG16(0x28c),
600 	REG16(0x288),
601 	REG16(0x284),
602 	REG16(0x280),
603 	REG16(0x27c),
604 	REG16(0x278),
605 	REG16(0x274),
606 	REG16(0x270),
607 
608 	NOP(13),
609 	LRI(1, POSTED),
610 	REG16(0x200),
611 
612 	NOP(13),
613 	LRI(44, POSTED),
614 	REG(0x028),
615 	REG(0x09c),
616 	REG(0x0c0),
617 	REG(0x178),
618 	REG(0x17c),
619 	REG16(0x358),
620 	REG(0x170),
621 	REG(0x150),
622 	REG(0x154),
623 	REG(0x158),
624 	REG16(0x41c),
625 	REG16(0x600),
626 	REG16(0x604),
627 	REG16(0x608),
628 	REG16(0x60c),
629 	REG16(0x610),
630 	REG16(0x614),
631 	REG16(0x618),
632 	REG16(0x61c),
633 	REG16(0x620),
634 	REG16(0x624),
635 	REG16(0x628),
636 	REG16(0x62c),
637 	REG16(0x630),
638 	REG16(0x634),
639 	REG16(0x638),
640 	REG16(0x63c),
641 	REG16(0x640),
642 	REG16(0x644),
643 	REG16(0x648),
644 	REG16(0x64c),
645 	REG16(0x650),
646 	REG16(0x654),
647 	REG16(0x658),
648 	REG16(0x65c),
649 	REG16(0x660),
650 	REG16(0x664),
651 	REG16(0x668),
652 	REG16(0x66c),
653 	REG16(0x670),
654 	REG16(0x674),
655 	REG16(0x678),
656 	REG16(0x67c),
657 	REG(0x068),
658 
659 	END(),
660 };
661 
662 static const u8 gen12_xcs_offsets[] = {
663 	NOP(1),
664 	LRI(13, POSTED),
665 	REG16(0x244),
666 	REG(0x034),
667 	REG(0x030),
668 	REG(0x038),
669 	REG(0x03c),
670 	REG(0x168),
671 	REG(0x140),
672 	REG(0x110),
673 	REG(0x1c0),
674 	REG(0x1c4),
675 	REG(0x1c8),
676 	REG(0x180),
677 	REG16(0x2b4),
678 
679 	NOP(5),
680 	LRI(9, POSTED),
681 	REG16(0x3a8),
682 	REG16(0x28c),
683 	REG16(0x288),
684 	REG16(0x284),
685 	REG16(0x280),
686 	REG16(0x27c),
687 	REG16(0x278),
688 	REG16(0x274),
689 	REG16(0x270),
690 
691 	END(),
692 };
693 
694 static const u8 gen8_rcs_offsets[] = {
695 	NOP(1),
696 	LRI(14, POSTED),
697 	REG16(0x244),
698 	REG(0x034),
699 	REG(0x030),
700 	REG(0x038),
701 	REG(0x03c),
702 	REG(0x168),
703 	REG(0x140),
704 	REG(0x110),
705 	REG(0x11c),
706 	REG(0x114),
707 	REG(0x118),
708 	REG(0x1c0),
709 	REG(0x1c4),
710 	REG(0x1c8),
711 
712 	NOP(3),
713 	LRI(9, POSTED),
714 	REG16(0x3a8),
715 	REG16(0x28c),
716 	REG16(0x288),
717 	REG16(0x284),
718 	REG16(0x280),
719 	REG16(0x27c),
720 	REG16(0x278),
721 	REG16(0x274),
722 	REG16(0x270),
723 
724 	NOP(13),
725 	LRI(1, 0),
726 	REG(0x0c8),
727 
728 	END(),
729 };
730 
731 static const u8 gen11_rcs_offsets[] = {
732 	NOP(1),
733 	LRI(15, POSTED),
734 	REG16(0x244),
735 	REG(0x034),
736 	REG(0x030),
737 	REG(0x038),
738 	REG(0x03c),
739 	REG(0x168),
740 	REG(0x140),
741 	REG(0x110),
742 	REG(0x11c),
743 	REG(0x114),
744 	REG(0x118),
745 	REG(0x1c0),
746 	REG(0x1c4),
747 	REG(0x1c8),
748 	REG(0x180),
749 
750 	NOP(1),
751 	LRI(9, POSTED),
752 	REG16(0x3a8),
753 	REG16(0x28c),
754 	REG16(0x288),
755 	REG16(0x284),
756 	REG16(0x280),
757 	REG16(0x27c),
758 	REG16(0x278),
759 	REG16(0x274),
760 	REG16(0x270),
761 
762 	LRI(1, POSTED),
763 	REG(0x1b0),
764 
765 	NOP(10),
766 	LRI(1, 0),
767 	REG(0x0c8),
768 
769 	END(),
770 };
771 
772 static const u8 gen12_rcs_offsets[] = {
773 	NOP(1),
774 	LRI(13, POSTED),
775 	REG16(0x244),
776 	REG(0x034),
777 	REG(0x030),
778 	REG(0x038),
779 	REG(0x03c),
780 	REG(0x168),
781 	REG(0x140),
782 	REG(0x110),
783 	REG(0x1c0),
784 	REG(0x1c4),
785 	REG(0x1c8),
786 	REG(0x180),
787 	REG16(0x2b4),
788 
789 	NOP(5),
790 	LRI(9, POSTED),
791 	REG16(0x3a8),
792 	REG16(0x28c),
793 	REG16(0x288),
794 	REG16(0x284),
795 	REG16(0x280),
796 	REG16(0x27c),
797 	REG16(0x278),
798 	REG16(0x274),
799 	REG16(0x270),
800 
801 	LRI(3, POSTED),
802 	REG(0x1b0),
803 	REG16(0x5a8),
804 	REG16(0x5ac),
805 
806 	NOP(6),
807 	LRI(1, 0),
808 	REG(0x0c8),
809 
810 	END(),
811 };
812 
813 #undef END
814 #undef REG16
815 #undef REG
816 #undef LRI
817 #undef NOP
818 
819 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
820 {
821 	/*
822 	 * The gen12+ lists only have the registers we program in the basic
823 	 * default state. We rely on the context image using relative
824 	 * addressing to automatically fix up the register state between the
825 	 * physical engines for the virtual engine.
826 	 */
827 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
828 		   !intel_engine_has_relative_mmio(engine));
829 
830 	if (engine->class == RENDER_CLASS) {
831 		if (INTEL_GEN(engine->i915) >= 12)
832 			return gen12_rcs_offsets;
833 		else if (INTEL_GEN(engine->i915) >= 11)
834 			return gen11_rcs_offsets;
835 		else
836 			return gen8_rcs_offsets;
837 	} else {
838 		if (INTEL_GEN(engine->i915) >= 12)
839 			return gen12_xcs_offsets;
840 		else if (INTEL_GEN(engine->i915) >= 9)
841 			return gen9_xcs_offsets;
842 		else
843 			return gen8_xcs_offsets;
844 	}
845 }
846 
847 static void unwind_wa_tail(struct i915_request *rq)
848 {
849 	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
850 	assert_ring_tail_valid(rq->ring, rq->tail);
851 }
852 
853 static struct i915_request *
854 __unwind_incomplete_requests(struct intel_engine_cs *engine)
855 {
856 	struct i915_request *rq, *rn, *active = NULL;
857 	struct list_head *uninitialized_var(pl);
858 	int prio = I915_PRIORITY_INVALID;
859 
860 	lockdep_assert_held(&engine->active.lock);
861 
862 	list_for_each_entry_safe_reverse(rq, rn,
863 					 &engine->active.requests,
864 					 sched.link) {
865 
866 		if (i915_request_completed(rq))
867 			continue; /* XXX */
868 
869 		__i915_request_unsubmit(rq);
870 		unwind_wa_tail(rq);
871 
872 		/*
873 		 * Push the request back into the queue for later resubmission.
874 		 * If this request is not native to this physical engine (i.e.
875 		 * it came from a virtual source), push it back onto the virtual
876 		 * engine so that it can be moved across onto another physical
877 		 * engine as load dictates.
878 		 */
879 		if (likely(rq->execution_mask == engine->mask)) {
880 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
881 			if (rq_prio(rq) != prio) {
882 				prio = rq_prio(rq);
883 				pl = i915_sched_lookup_priolist(engine, prio);
884 			}
885 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
886 
887 			list_move(&rq->sched.link, pl);
888 			active = rq;
889 		} else {
890 			struct intel_engine_cs *owner = rq->hw_context->engine;
891 
892 			/*
893 			 * Decouple the virtual breadcrumb before moving it
894 			 * back to the virtual engine -- we don't want the
895 			 * request to complete in the background and try
896 			 * and cancel the breadcrumb on the virtual engine
897 			 * (instead of the old engine where it is linked)!
898 			 */
899 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
900 				     &rq->fence.flags)) {
901 				spin_lock_nested(&rq->lock,
902 						 SINGLE_DEPTH_NESTING);
903 				i915_request_cancel_breadcrumb(rq);
904 				spin_unlock(&rq->lock);
905 			}
906 			rq->engine = owner;
907 			owner->submit_request(rq);
908 			active = NULL;
909 		}
910 	}
911 
912 	return active;
913 }
914 
915 struct i915_request *
916 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
917 {
918 	struct intel_engine_cs *engine =
919 		container_of(execlists, typeof(*engine), execlists);
920 
921 	return __unwind_incomplete_requests(engine);
922 }
923 
924 static inline void
925 execlists_context_status_change(struct i915_request *rq, unsigned long status)
926 {
927 	/*
928 	 * This is currently only used when GVT-g is enabled. When GVT-g is
929 	 * disabled, the compiler should eliminate this function as dead code.
930 	 */
931 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
932 		return;
933 
934 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
935 				   status, rq);
936 }
937 
938 static void intel_engine_context_in(struct intel_engine_cs *engine)
939 {
940 	unsigned long flags;
941 
942 	if (READ_ONCE(engine->stats.enabled) == 0)
943 		return;
944 
945 	write_seqlock_irqsave(&engine->stats.lock, flags);
946 
947 	if (engine->stats.enabled > 0) {
948 		if (engine->stats.active++ == 0)
949 			engine->stats.start = ktime_get();
950 		GEM_BUG_ON(engine->stats.active == 0);
951 	}
952 
953 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
954 }
955 
956 static void intel_engine_context_out(struct intel_engine_cs *engine)
957 {
958 	unsigned long flags;
959 
960 	if (READ_ONCE(engine->stats.enabled) == 0)
961 		return;
962 
963 	write_seqlock_irqsave(&engine->stats.lock, flags);
964 
965 	if (engine->stats.enabled > 0) {
966 		ktime_t last;
967 
968 		if (engine->stats.active && --engine->stats.active == 0) {
969 			/*
970 			 * Decrement the active context count and, in case the GPU
971 			 * is now idle, add the elapsed time to the running total.
972 			 */
973 			last = ktime_sub(ktime_get(), engine->stats.start);
974 
975 			engine->stats.total = ktime_add(engine->stats.total,
976 							last);
977 		} else if (engine->stats.active == 0) {
978 			/*
979 			 * After turning on engine stats, context out might be
980 			 * After turning on engine stats, a context-out might be
981 			 * the first event, in which case we account from the
982 			 */
983 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
984 
985 			engine->stats.total = ktime_add(engine->stats.total,
986 							last);
987 		}
988 	}
989 
990 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
991 }
992 
993 static void restore_default_state(struct intel_context *ce,
994 				  struct intel_engine_cs *engine)
995 {
996 	u32 *regs = ce->lrc_reg_state;
997 
998 	if (engine->pinned_default_state)
999 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1000 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1001 		       engine->context_size - PAGE_SIZE);
1002 
1003 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1004 }
1005 
1006 static void reset_active(struct i915_request *rq,
1007 			 struct intel_engine_cs *engine)
1008 {
1009 	struct intel_context * const ce = rq->hw_context;
1010 	u32 head;
1011 
1012 	/*
1013 	 * The executing context has been cancelled. We want to prevent
1014 	 * further execution along this context and propagate the error on
1015 	 * to anything depending on its results.
1016 	 *
1017 	 * In __i915_request_submit(), we apply the -EIO and remove the
1018 	 * requests' payloads for any banned requests. But first, we must
1019 	 * rewind the context back to the start of the incomplete request so
1020 	 * that we do not jump back into the middle of the batch.
1021 	 *
1022 	 * We preserve the breadcrumbs and semaphores of the incomplete
1023 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1024 	 * remain correctly ordered. And we defer to __i915_request_submit()
1025 	 * so that all asynchronous waits are correctly handled.
1026 	 */
1027 	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
1028 		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
1029 
1030 	/* On resubmission of the active request, payload will be scrubbed */
1031 	if (i915_request_completed(rq))
1032 		head = rq->tail;
1033 	else
1034 		head = active_request(ce->timeline, rq)->head;
1035 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1036 	intel_ring_update_space(ce->ring);
1037 
1038 	/* Scrub the context image to prevent replaying the previous batch */
1039 	restore_default_state(ce, engine);
1040 	__execlists_update_reg_state(ce, engine);
1041 
1042 	/* We've switched away, so this should be a no-op, but intent matters */
1043 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1044 }
1045 
1046 static inline struct intel_engine_cs *
1047 __execlists_schedule_in(struct i915_request *rq)
1048 {
1049 	struct intel_engine_cs * const engine = rq->engine;
1050 	struct intel_context * const ce = rq->hw_context;
1051 
1052 	intel_context_get(ce);
1053 
1054 	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
1055 		reset_active(rq, engine);
1056 
1057 	if (ce->tag) {
1058 		/* Use a fixed tag for OA and friends */
1059 		ce->lrc_desc |= (u64)ce->tag << 32;
1060 	} else {
1061 		/* We don't need a strict matching tag, just different values */
1062 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1063 		ce->lrc_desc |=
1064 			(u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
1065 			GEN11_SW_CTX_ID_SHIFT;
1066 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1067 	}
1068 
1069 	intel_gt_pm_get(engine->gt);
1070 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1071 	intel_engine_context_in(engine);
1072 
1073 	return engine;
1074 }
1075 
1076 static inline struct i915_request *
1077 execlists_schedule_in(struct i915_request *rq, int idx)
1078 {
1079 	struct intel_context * const ce = rq->hw_context;
1080 	struct intel_engine_cs *old;
1081 
1082 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1083 	trace_i915_request_in(rq, idx);
1084 
1085 	old = READ_ONCE(ce->inflight);
1086 	do {
1087 		if (!old) {
1088 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1089 			break;
1090 		}
1091 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1092 
1093 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1094 	return i915_request_get(rq);
1095 }
1096 
1097 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1098 {
1099 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1100 	struct i915_request *next = READ_ONCE(ve->request);
1101 
1102 	if (next && next->execution_mask & ~rq->execution_mask)
1103 		tasklet_schedule(&ve->base.execlists.tasklet);
1104 }
1105 
1106 static inline void
1107 __execlists_schedule_out(struct i915_request *rq,
1108 			 struct intel_engine_cs * const engine)
1109 {
1110 	struct intel_context * const ce = rq->hw_context;
1111 
1112 	/*
1113 	 * NB process_csb() is not under the engine->active.lock and hence
1114 	 * schedule_out can race with schedule_in, meaning that we should
1115 	 * refrain from doing non-trivial work here.
1116 	 */
1117 
1118 	intel_engine_context_out(engine);
1119 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1120 	intel_gt_pm_put(engine->gt);
1121 
1122 	/*
1123 	 * If this is part of a virtual engine, its next request may
1124 	 * have been blocked waiting for access to the active context.
1125 	 * We have to kick all the siblings again in case we need to
1126 	 * switch (e.g. the next request is not runnable on this
1127 	 * engine). Hopefully, we will already have submitted the next
1128 	 * request before the tasklet runs and do not need to rebuild
1129 	 * each virtual tree and kick everyone again.
1130 	 */
1131 	if (ce->engine != engine)
1132 		kick_siblings(rq, ce);
1133 
1134 	intel_context_put(ce);
1135 }
1136 
1137 static inline void
1138 execlists_schedule_out(struct i915_request *rq)
1139 {
1140 	struct intel_context * const ce = rq->hw_context;
1141 	struct intel_engine_cs *cur, *old;
1142 
1143 	trace_i915_request_out(rq);
1144 
1145 	old = READ_ONCE(ce->inflight);
1146 	do
1147 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1148 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1149 	if (!cur)
1150 		__execlists_schedule_out(rq, old);
1151 
1152 	i915_request_put(rq);
1153 }
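
/*
 * Note on the ce->inflight tracking used by execlists_schedule_in/out
 * above: the engine pointer carries a small counter packed into its unused
 * low bits. The first port occupied by the context performs the real
 * __execlists_schedule_in() and installs the bare pointer; each additional
 * port merely bumps the counter via ptr_inc(). On the way out, each
 * completion decrements the counter, and only once the low bits are clear
 * do we NULL the pointer and perform the real __execlists_schedule_out().
 */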
1154 
1155 static u64 execlists_update_context(const struct i915_request *rq)
1156 {
1157 	struct intel_context *ce = rq->hw_context;
1158 	u64 desc;
1159 
1160 	ce->lrc_reg_state[CTX_RING_TAIL] =
1161 		intel_ring_set_tail(rq->ring, rq->tail);
1162 
1163 	/*
1164 	 * Make sure the context image is complete before we submit it to HW.
1165 	 *
1166 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1167 	 * an uncached write such as our mmio register access, but the empirical
1168 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1169 	 * may not be visible to the HW prior to the completion of the UC
1170 	 * register write and that we may begin execution from the context
1171 	 * before its image is complete leading to invalid PD chasing.
1172 	 *
1173 	 * Furthermore, Braswell, at least, wants a full mb to be sure that
1174 	 * the writes are coherent in memory (visible to the GPU) prior to
1175 	 * execution, and not just visible to other CPUs (as is the result of
1176 	 * wmb).
1177 	 */
1178 	mb();
1179 
1180 	desc = ce->lrc_desc;
1181 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1182 
1183 	/* Wa_1607138340:tgl */
1184 	if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
1185 		desc |= CTX_DESC_FORCE_RESTORE;
1186 
1187 	return desc;
1188 }
1189 
1190 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1191 {
1192 	if (execlists->ctrl_reg) {
1193 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1194 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1195 	} else {
1196 		writel(upper_32_bits(desc), execlists->submit_reg);
1197 		writel(lower_32_bits(desc), execlists->submit_reg);
1198 	}
1199 }
1200 
1201 static __maybe_unused void
1202 trace_ports(const struct intel_engine_execlists *execlists,
1203 	    const char *msg,
1204 	    struct i915_request * const *ports)
1205 {
1206 	const struct intel_engine_cs *engine =
1207 		container_of(execlists, typeof(*engine), execlists);
1208 
1209 	if (!ports[0])
1210 		return;
1211 
1212 	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
1213 		  engine->name, msg,
1214 		  ports[0]->fence.context,
1215 		  ports[0]->fence.seqno,
1216 		  i915_request_completed(ports[0]) ? "!" :
1217 		  i915_request_started(ports[0]) ? "*" :
1218 		  "",
1219 		  ports[1] ? ports[1]->fence.context : 0,
1220 		  ports[1] ? ports[1]->fence.seqno : 0);
1221 }
1222 
1223 static __maybe_unused bool
1224 assert_pending_valid(const struct intel_engine_execlists *execlists,
1225 		     const char *msg)
1226 {
1227 	struct i915_request * const *port, *rq;
1228 	struct intel_context *ce = NULL;
1229 
1230 	trace_ports(execlists, msg, execlists->pending);
1231 
1232 	if (!execlists->pending[0]) {
1233 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1234 		return false;
1235 	}
1236 
1237 	if (execlists->pending[execlists_num_ports(execlists)]) {
1238 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1239 			      execlists_num_ports(execlists));
1240 		return false;
1241 	}
1242 
1243 	for (port = execlists->pending; (rq = *port); port++) {
1244 		if (ce == rq->hw_context) {
1245 			GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
1246 				      port - execlists->pending);
1247 			return false;
1248 		}
1249 
1250 		ce = rq->hw_context;
1251 		if (i915_request_completed(rq))
1252 			continue;
1253 
1254 		if (i915_active_is_idle(&ce->active)) {
1255 			GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
1256 				      port - execlists->pending);
1257 			return false;
1258 		}
1259 
1260 		if (!i915_vma_is_pinned(ce->state)) {
1261 			GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
1262 				      port - execlists->pending);
1263 			return false;
1264 		}
1265 
1266 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1267 			GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
1268 				      port - execlists->pending);
1269 			return false;
1270 		}
1271 	}
1272 
1273 	return ce;
1274 }
1275 
1276 static void execlists_submit_ports(struct intel_engine_cs *engine)
1277 {
1278 	struct intel_engine_execlists *execlists = &engine->execlists;
1279 	unsigned int n;
1280 
1281 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1282 
1283 	/*
1284 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1285 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1286 	 * not be relinquished until the device is idle (see
1287 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1288 	 * that all ELSP are drained i.e. we have processed the CSB,
1289 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1290 	 */
1291 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1292 
1293 	/*
1294 	 * ELSQ note: the submit queue is not cleared after being submitted
1295 	 * to the HW so we need to make sure we always clean it up. This is
1296 	 * currently ensured by the fact that we always write the same number
1297 	 * of elsq entries, keep this in mind before changing the loop below.
1298 	 */
1299 	for (n = execlists_num_ports(execlists); n--; ) {
1300 		struct i915_request *rq = execlists->pending[n];
1301 
1302 		write_desc(execlists,
1303 			   rq ? execlists_update_context(rq) : 0,
1304 			   n);
1305 	}
1306 
1307 	/* we need to manually load the submit queue */
1308 	if (execlists->ctrl_reg)
1309 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1310 }
1311 
1312 static bool ctx_single_port_submission(const struct intel_context *ce)
1313 {
1314 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1315 		i915_gem_context_force_single_submission(ce->gem_context));
1316 }
1317 
1318 static bool can_merge_ctx(const struct intel_context *prev,
1319 			  const struct intel_context *next)
1320 {
1321 	if (prev != next)
1322 		return false;
1323 
1324 	if (ctx_single_port_submission(prev))
1325 		return false;
1326 
1327 	return true;
1328 }
1329 
1330 static bool can_merge_rq(const struct i915_request *prev,
1331 			 const struct i915_request *next)
1332 {
1333 	GEM_BUG_ON(prev == next);
1334 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1335 
1336 	/*
1337 	 * We do not submit known completed requests. Therefore if the next
1338 	 * request is already completed, we can pretend to merge it in
1339 	 * with the previous context (and we will skip updating the ELSP
1340 	 * and tracking). Thus hopefully keeping the ELSP full with active
1341 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1342 	 * us.
1343 	 */
1344 	if (i915_request_completed(next))
1345 		return true;
1346 
1347 	if (unlikely((prev->flags ^ next->flags) &
1348 		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
1349 		return false;
1350 
1351 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
1352 		return false;
1353 
1354 	return true;
1355 }
1356 
1357 static void virtual_update_register_offsets(u32 *regs,
1358 					    struct intel_engine_cs *engine)
1359 {
1360 	set_offsets(regs, reg_offsets(engine), engine);
1361 }
1362 
1363 static bool virtual_matches(const struct virtual_engine *ve,
1364 			    const struct i915_request *rq,
1365 			    const struct intel_engine_cs *engine)
1366 {
1367 	const struct intel_engine_cs *inflight;
1368 
1369 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1370 		return false;
1371 
1372 	/*
1373 	 * We track when the HW has completed saving the context image
1374 	 * (i.e. when we have seen the final CS event switching out of
1375 	 * the context) and must not overwrite the context image before
1376 	 * then. This restricts us to only using the active engine
1377 	 * while the previous virtualized request is inflight (so
1378 	 * we reuse the register offsets). This is a very small
1379 	 * hysteresis on the greedy selection algorithm.
1380 	 */
1381 	inflight = intel_context_inflight(&ve->context);
1382 	if (inflight && inflight != engine)
1383 		return false;
1384 
1385 	return true;
1386 }
1387 
1388 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1389 				     struct intel_engine_cs *engine)
1390 {
1391 	struct intel_engine_cs *old = ve->siblings[0];
1392 
1393 	/* All unattached (rq->engine == old) must already be completed */
1394 
1395 	spin_lock(&old->breadcrumbs.irq_lock);
1396 	if (!list_empty(&ve->context.signal_link)) {
1397 		list_move_tail(&ve->context.signal_link,
1398 			       &engine->breadcrumbs.signalers);
1399 		intel_engine_queue_breadcrumbs(engine);
1400 	}
1401 	spin_unlock(&old->breadcrumbs.irq_lock);
1402 }
1403 
1404 static struct i915_request *
1405 last_active(const struct intel_engine_execlists *execlists)
1406 {
1407 	struct i915_request * const *last = READ_ONCE(execlists->active);
1408 
1409 	while (*last && i915_request_completed(*last))
1410 		last++;
1411 
1412 	return *last;
1413 }
1414 
1415 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1416 {
1417 	LIST_HEAD(list);
1418 
1419 	/*
1420 	 * We want to move the interrupted request to the back of
1421 	 * the round-robin list (i.e. its priority level), but
1422 	 * in doing so, we must also move all requests that were in
1423 	 * flight waiting for the interrupted request, so that they are
1424 	 * run after it again.
1425 	 */
1426 	do {
1427 		struct i915_dependency *p;
1428 
1429 		GEM_BUG_ON(i915_request_is_active(rq));
1430 		list_move_tail(&rq->sched.link, pl);
1431 
1432 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1433 			struct i915_request *w =
1434 				container_of(p->waiter, typeof(*w), sched);
1435 
1436 			/* Leave semaphores spinning on the other engines */
1437 			if (w->engine != rq->engine)
1438 				continue;
1439 
1440 			/* No waiter should start before its signaler */
1441 			GEM_BUG_ON(i915_request_started(w) &&
1442 				   !i915_request_completed(rq));
1443 
1444 			GEM_BUG_ON(i915_request_is_active(w));
1445 			if (list_empty(&w->sched.link))
1446 				continue; /* Not yet submitted; unready */
1447 
1448 			if (rq_prio(w) < rq_prio(rq))
1449 				continue;
1450 
1451 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1452 			list_move_tail(&w->sched.link, &list);
1453 		}
1454 
1455 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1456 	} while (rq);
1457 }
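
/*
 * For instance, if request A is deferred while requests B and C (same
 * engine, same priority level) are waiting upon A, then A, B and C are all
 * moved to the back of that priority list, signalers ahead of their
 * waiters, so that no request is selected before one it depends on.
 */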
1458 
1459 static void defer_active(struct intel_engine_cs *engine)
1460 {
1461 	struct i915_request *rq;
1462 
1463 	rq = __unwind_incomplete_requests(engine);
1464 	if (!rq)
1465 		return;
1466 
1467 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1468 }
1469 
1470 static bool
1471 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1472 {
1473 	int hint;
1474 
1475 	if (!intel_engine_has_timeslices(engine))
1476 		return false;
1477 
1478 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1479 		return false;
1480 
1481 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1482 		   engine->execlists.queue_priority_hint);
1483 
1484 	return hint >= effective_prio(rq);
1485 }
1486 
1487 static int
1488 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1489 {
1490 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1491 		return INT_MIN;
1492 
1493 	return rq_prio(list_next_entry(rq, sched.link));
1494 }
1495 
1496 static inline unsigned long
1497 timeslice(const struct intel_engine_cs *engine)
1498 {
1499 	return READ_ONCE(engine->props.timeslice_duration_ms);
1500 }
1501 
1502 static unsigned long
1503 active_timeslice(const struct intel_engine_cs *engine)
1504 {
1505 	const struct i915_request *rq = *engine->execlists.active;
1506 
1507 	if (i915_request_completed(rq))
1508 		return 0;
1509 
1510 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1511 		return 0;
1512 
1513 	return timeslice(engine);
1514 }
1515 
1516 static void set_timeslice(struct intel_engine_cs *engine)
1517 {
1518 	if (!intel_engine_has_timeslices(engine))
1519 		return;
1520 
1521 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1522 }
1523 
1524 static void record_preemption(struct intel_engine_execlists *execlists)
1525 {
1526 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1527 }
1528 
1529 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1530 {
1531 	struct i915_request *rq;
1532 
1533 	rq = last_active(&engine->execlists);
1534 	if (!rq)
1535 		return 0;
1536 
1537 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1538 	if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
1539 		return 1;
1540 
1541 	return READ_ONCE(engine->props.preempt_timeout_ms);
1542 }
1543 
1544 static void set_preempt_timeout(struct intel_engine_cs *engine)
1545 {
1546 	if (!intel_engine_has_preempt_reset(engine))
1547 		return;
1548 
1549 	set_timer_ms(&engine->execlists.preempt,
1550 		     active_preempt_timeout(engine));
1551 }
1552 
1553 static void execlists_dequeue(struct intel_engine_cs *engine)
1554 {
1555 	struct intel_engine_execlists * const execlists = &engine->execlists;
1556 	struct i915_request **port = execlists->pending;
1557 	struct i915_request ** const last_port = port + execlists->port_mask;
1558 	struct i915_request *last;
1559 	struct rb_node *rb;
1560 	bool submit = false;
1561 
1562 	/*
1563 	 * Hardware submission is through 2 ports. Conceptually each port
1564 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1565 	 * static for a context, and unique to each, so we only execute
1566 	 * requests belonging to a single context from each ring. RING_HEAD
1567 	 * is maintained by the CS in the context image, it marks the place
1568 	 * where it got up to last time, and through RING_TAIL we tell the CS
1569 	 * where we want to execute up to this time.
1570 	 *
1571 	 * In this list the requests are in order of execution. Consecutive
1572 	 * requests from the same context are adjacent in the ringbuffer. We
1573 	 * can combine these requests into a single RING_TAIL update:
1574 	 *
1575 	 *              RING_HEAD...req1...req2
1576 	 *                                    ^- RING_TAIL
1577 	 * since to execute req2 the CS must first execute req1.
1578 	 *
1579 	 * Our goal then is to point each port to the end of a consecutive
1580 	 * sequence of requests as being the most optimal (fewest wake ups
1581 	 * and context switches) submission.
1582 	 */
1583 
1584 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1585 		struct virtual_engine *ve =
1586 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1587 		struct i915_request *rq = READ_ONCE(ve->request);
1588 
1589 		if (!rq) { /* lazily cleanup after another engine handled rq */
1590 			rb_erase_cached(rb, &execlists->virtual);
1591 			RB_CLEAR_NODE(rb);
1592 			rb = rb_first_cached(&execlists->virtual);
1593 			continue;
1594 		}
1595 
1596 		if (!virtual_matches(ve, rq, engine)) {
1597 			rb = rb_next(rb);
1598 			continue;
1599 		}
1600 
1601 		break;
1602 	}
1603 
1604 	/*
1605 	 * If the queue is higher priority than the last
1606 	 * request in the currently active context, submit afresh.
1607 	 * We will resubmit again afterwards in case we need to split
1608 	 * the active context to interject the preemption request,
1609 	 * i.e. we will retrigger preemption following the ack in case
1610 	 * of trouble.
1611 	 */
1612 	last = last_active(execlists);
1613 	if (last) {
1614 		if (need_preempt(engine, last, rb)) {
1615 			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1616 				  engine->name,
1617 				  last->fence.context,
1618 				  last->fence.seqno,
1619 				  last->sched.attr.priority,
1620 				  execlists->queue_priority_hint);
1621 			record_preemption(execlists);
1622 
1623 			/*
1624 			 * Don't let the RING_HEAD advance past the breadcrumb
1625 			 * as we unwind (and until we resubmit) so that we do
1626 			 * not accidentally tell it to go backwards.
1627 			 */
1628 			ring_set_paused(engine, 1);
1629 
1630 			/*
1631 			 * Note that we have not stopped the GPU at this point,
1632 			 * so we are unwinding the incomplete requests as they
1633 			 * remain inflight and so by the time we do complete
1634 			 * the preemption, some of the unwound requests may
1635 			 * complete!
1636 			 */
1637 			__unwind_incomplete_requests(engine);
1638 
1639 			/*
1640 			 * If we need to return to the preempted context, we
1641 			 * need to skip the lite-restore and force it to
1642 			 * reload the RING_TAIL. Otherwise, the HW has a
1643 			 * tendency to ignore us rewinding the TAIL to the
1644 			 * end of an earlier request.
1645 			 */
1646 			last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1647 			last = NULL;
1648 		} else if (need_timeslice(engine, last) &&
1649 			   timer_expired(&engine->execlists.timer)) {
1650 			GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1651 				  engine->name,
1652 				  last->fence.context,
1653 				  last->fence.seqno,
1654 				  last->sched.attr.priority,
1655 				  execlists->queue_priority_hint);
1656 
1657 			ring_set_paused(engine, 1);
1658 			defer_active(engine);
1659 
1660 			/*
1661 			 * Unlike for preemption, if we rewind and continue
1662 			 * executing the same context as previously active,
1663 			 * the order of execution will remain the same and
1664 			 * the tail will only advance. We do not need to
1665 			 * force a full context restore, as a lite-restore
1666 			 * is sufficient to resample the monotonic TAIL.
1667 			 *
1668 			 * If we switch to any other context, similarly we
1669 			 * will not rewind TAIL of current context, and
1670 			 * normal save/restore will preserve state and allow
1671 			 * us to later continue executing the same request.
1672 			 */
1673 			last = NULL;
1674 		} else {
1675 			/*
1676 			 * Otherwise if we already have a request pending
1677 			 * for execution after the current one, we can
1678 			 * just wait until the next CS event before
1679 			 * queuing more. In either case we will force a
1680 			 * lite-restore preemption event, but if we wait
1681 			 * we hopefully coalesce several updates into a single
1682 			 * submission.
1683 			 */
1684 			if (!list_is_last(&last->sched.link,
1685 					  &engine->active.requests)) {
1686 				/*
1687 				 * Even if ELSP[1] is occupied and not worthy
1688 				 * of timeslices, our queue might be.
1689 				 */
1690 				if (!execlists->timer.expires &&
1691 				    need_timeslice(engine, last))
1692 					set_timer_ms(&execlists->timer,
1693 						     timeslice(engine));
1694 
1695 				return;
1696 			}
1697 
1698 			/*
1699 			 * WaIdleLiteRestore:bdw,skl
1700 			 * Apply the wa NOOPs to prevent
1701 			 * ring:HEAD == rq:TAIL as we resubmit the
1702 			 * request. See gen8_emit_fini_breadcrumb() for
1703 			 * where we prepare the padding after the
1704 			 * end of the request.
1705 			 */
1706 			last->tail = last->wa_tail;
1707 		}
1708 	}
1709 
1710 	while (rb) { /* XXX virtual is always taking precedence */
1711 		struct virtual_engine *ve =
1712 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1713 		struct i915_request *rq;
1714 
1715 		spin_lock(&ve->base.active.lock);
1716 
1717 		rq = ve->request;
1718 		if (unlikely(!rq)) { /* lost the race to a sibling */
1719 			spin_unlock(&ve->base.active.lock);
1720 			rb_erase_cached(rb, &execlists->virtual);
1721 			RB_CLEAR_NODE(rb);
1722 			rb = rb_first_cached(&execlists->virtual);
1723 			continue;
1724 		}
1725 
1726 		GEM_BUG_ON(rq != ve->request);
1727 		GEM_BUG_ON(rq->engine != &ve->base);
1728 		GEM_BUG_ON(rq->hw_context != &ve->context);
1729 
1730 		if (rq_prio(rq) >= queue_prio(execlists)) {
1731 			if (!virtual_matches(ve, rq, engine)) {
1732 				spin_unlock(&ve->base.active.lock);
1733 				rb = rb_next(rb);
1734 				continue;
1735 			}
1736 
1737 			if (last && !can_merge_rq(last, rq)) {
1738 				spin_unlock(&ve->base.active.lock);
1739 				return; /* leave this for another */
1740 			}
1741 
1742 			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1743 				  engine->name,
1744 				  rq->fence.context,
1745 				  rq->fence.seqno,
1746 				  i915_request_completed(rq) ? "!" :
1747 				  i915_request_started(rq) ? "*" :
1748 				  "",
1749 				  yesno(engine != ve->siblings[0]));
1750 
1751 			ve->request = NULL;
1752 			ve->base.execlists.queue_priority_hint = INT_MIN;
1753 			rb_erase_cached(rb, &execlists->virtual);
1754 			RB_CLEAR_NODE(rb);
1755 
1756 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1757 			rq->engine = engine;
1758 
1759 			if (engine != ve->siblings[0]) {
1760 				u32 *regs = ve->context.lrc_reg_state;
1761 				unsigned int n;
1762 
1763 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1764 
1765 				if (!intel_engine_has_relative_mmio(engine))
1766 					virtual_update_register_offsets(regs,
1767 									engine);
1768 
1769 				if (!list_empty(&ve->context.signals))
1770 					virtual_xfer_breadcrumbs(ve, engine);
1771 
1772 				/*
1773 				 * Move the bound engine to the top of the list
1774 				 * for future execution. We then kick this
1775 				 * tasklet first before checking others, so that
1776 				 * we preferentially reuse this set of bound
1777 				 * registers.
1778 				 */
1779 				for (n = 1; n < ve->num_siblings; n++) {
1780 					if (ve->siblings[n] == engine) {
1781 						swap(ve->siblings[n],
1782 						     ve->siblings[0]);
1783 						break;
1784 					}
1785 				}
1786 
1787 				GEM_BUG_ON(ve->siblings[0] != engine);
1788 			}
1789 
1790 			if (__i915_request_submit(rq)) {
1791 				submit = true;
1792 				last = rq;
1793 			}
1794 			i915_request_put(rq);
1795 
1796 			/*
1797 			 * Hmm, we have a bunch of virtual engine requests,
1798 			 * but the first one was already completed (thanks
1799 			 * preempt-to-busy!). Keep looking at the veng queue
1800 			 * until we have no more relevant requests (i.e.
1801 			 * the normal submit queue has higher priority).
1802 			 */
1803 			if (!submit) {
1804 				spin_unlock(&ve->base.active.lock);
1805 				rb = rb_first_cached(&execlists->virtual);
1806 				continue;
1807 			}
1808 		}
1809 
1810 		spin_unlock(&ve->base.active.lock);
1811 		break;
1812 	}
1813 
1814 	while ((rb = rb_first_cached(&execlists->queue))) {
1815 		struct i915_priolist *p = to_priolist(rb);
1816 		struct i915_request *rq, *rn;
1817 		int i;
1818 
1819 		priolist_for_each_request_consume(rq, rn, p, i) {
1820 			bool merge = true;
1821 
1822 			/*
1823 			 * Can we combine this request with the current port?
1824 			 * It has to be the same context/ringbuffer and not
1825 			 * have any exceptions (e.g. GVT saying never to
1826 			 * combine contexts).
1827 			 *
1828 			 * If we can combine the requests, we can execute both
1829 			 * by updating the RING_TAIL to point to the end of the
1830 			 * second request, and so we never need to tell the
1831 			 * hardware about the first.
1832 			 */
1833 			if (last && !can_merge_rq(last, rq)) {
1834 				/*
1835 				 * If we are on the second port and cannot
1836 				 * combine this request with the last, then we
1837 				 * are done.
1838 				 */
1839 				if (port == last_port)
1840 					goto done;
1841 
1842 				/*
1843 				 * We must not populate both ELSP[] with the
1844 				 * same LRCA, i.e. we must submit 2 different
1845 				 * contexts if we submit 2 ELSP.
1846 				 */
1847 				if (last->hw_context == rq->hw_context)
1848 					goto done;
1849 
1850 				if (i915_request_has_sentinel(last))
1851 					goto done;
1852 
1853 				/*
1854 				 * If GVT overrides us we only ever submit
1855 				 * port[0], leaving port[1] empty. Note that we
1856 				 * also have to be careful that we don't queue
1857 				 * the same context (even though a different
1858 				 * request) to the second port.
1859 				 */
1860 				if (ctx_single_port_submission(last->hw_context) ||
1861 				    ctx_single_port_submission(rq->hw_context))
1862 					goto done;
1863 
1864 				merge = false;
1865 			}
1866 
1867 			if (__i915_request_submit(rq)) {
1868 				if (!merge) {
1869 					*port = execlists_schedule_in(last, port - execlists->pending);
1870 					port++;
1871 					last = NULL;
1872 				}
1873 
1874 				GEM_BUG_ON(last &&
1875 					   !can_merge_ctx(last->hw_context,
1876 							  rq->hw_context));
1877 
1878 				submit = true;
1879 				last = rq;
1880 			}
1881 		}
1882 
1883 		rb_erase_cached(&p->node, &execlists->queue);
1884 		i915_priolist_free(p);
1885 	}
1886 
1887 done:
1888 	/*
1889 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1890 	 *
1891 	 * We choose the priority hint such that if we add a request of greater
1892 	 * priority than this, we kick the submission tasklet to decide on
1893 	 * the right order of submitting the requests to hardware. We must
1894 	 * also be prepared to reorder requests as they are in-flight on the
1895 	 * HW. We derive the priority hint then as the first "hole" in
1896 	 * the HW submission ports and if there are no available slots,
1897 	 * the priority of the lowest executing request, i.e. last.
1898 	 *
1899 	 * When we do receive a higher priority request ready to run from the
1900 	 * user, see queue_request(), the priority hint is bumped to that
1901 	 * request triggering preemption on the next dequeue (or subsequent
1902 	 * interrupt for secondary ports).
1903 	 */
1904 	execlists->queue_priority_hint = queue_prio(execlists);
1905 	GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1906 		  engine->name, execlists->queue_priority_hint,
1907 		  yesno(submit));
1908 
1909 	if (submit) {
1910 		*port = execlists_schedule_in(last, port - execlists->pending);
1911 		execlists->switch_priority_hint =
1912 			switch_prio(engine, *execlists->pending);
1913 
1914 		/*
1915 		 * Skip if we ended up with exactly the same set of requests,
1916 		 * e.g. trying to timeslice a pair of ordered contexts
1917 		 */
1918 		if (!memcmp(execlists->active, execlists->pending,
1919 			    (port - execlists->pending + 1) * sizeof(*port))) {
1920 			do
1921 				execlists_schedule_out(fetch_and_zero(port));
1922 			while (port-- != execlists->pending);
1923 
1924 			goto skip_submit;
1925 		}
1926 
1927 		memset(port + 1, 0, (last_port - port) * sizeof(*port));
1928 		execlists_submit_ports(engine);
1929 
1930 		set_preempt_timeout(engine);
1931 	} else {
1932 skip_submit:
1933 		ring_set_paused(engine, 0);
1934 	}
1935 }
1936 
1937 static void
1938 cancel_port_requests(struct intel_engine_execlists * const execlists)
1939 {
1940 	struct i915_request * const *port, *rq;
1941 
1942 	for (port = execlists->pending; (rq = *port); port++)
1943 		execlists_schedule_out(rq);
1944 	memset(execlists->pending, 0, sizeof(execlists->pending));
1945 
1946 	for (port = execlists->active; (rq = *port); port++)
1947 		execlists_schedule_out(rq);
1948 	execlists->active =
1949 		memset(execlists->inflight, 0, sizeof(execlists->inflight));
1950 }
1951 
1952 static inline void
1953 invalidate_csb_entries(const u32 *first, const u32 *last)
1954 {
1955 	clflush((void *)first);
1956 	clflush((void *)last);
1957 }
1958 
1959 static inline bool
1960 reset_in_progress(const struct intel_engine_execlists *execlists)
1961 {
1962 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1963 }
1964 
1965 /*
1966  * Starting with Gen12, the status has a new format:
1967  *
1968  *     bit  0:     switched to new queue
1969  *     bit  1:     reserved
1970  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1971  *                 switch detail is set to "wait on semaphore"
1972  *     bits 3-5:   engine class
1973  *     bits 6-11:  engine instance
1974  *     bits 12-14: reserved
1975  *     bits 15-25: sw context id of the lrc the GT switched to
1976  *     bits 26-31: sw counter of the lrc the GT switched to
1977  *     bits 32-35: context switch detail
1978  *                  - 0: ctx complete
1979  *                  - 1: wait on sync flip
1980  *                  - 2: wait on vblank
1981  *                  - 3: wait on scanline
1982  *                  - 4: wait on semaphore
1983  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1984  *                       WAIT_FOR_EVENT)
1985  *     bit  36:    reserved
1986  *     bits 37-43: wait detail (for switch detail 1 to 4)
1987  *     bits 44-46: reserved
1988  *     bits 47-57: sw context id of the lrc the GT switched away from
1989  *     bits 58-63: sw counter of the lrc the GT switched away from
1990  */
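/*
 * Purely as an illustration of the layout above (not how the driver
 * decodes it, and with made-up local names), pulling the interesting
 * fields out of one entry by hand would look roughly like:
 *
 *	bool new_queue      = csb[0] & BIT(0);
 *	u32  to_sw_ctx_id   = (csb[0] >> 15) & 0x7ff;	(bits 15-25)
 *	u32  switch_detail  = csb[1] & 0xf;		(bits 32-35)
 *	u32  away_sw_ctx_id = (csb[1] >> 15) & 0x7ff;	(bits 47-57)
 *
 * The function below relies on the GEN12_CSB_*()/GEN12_CTX_*() helpers
 * instead of open-coding the shifts.
 */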
1991 static inline bool
1992 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1993 {
1994 	u32 lower_dw = csb[0];
1995 	u32 upper_dw = csb[1];
1996 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
1997 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
1998 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
1999 
2000 	/*
2001 	 * The context switch detail is not guaranteed to be 5 when a preemption
2002 	 * occurs, so we can't just check for that. The check below works for
2003 	 * all the cases we care about, including preemptions of WAIT
2004 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2005 	 * would require some extra handling, but we don't support that.
2006 	 */
2007 	if (!ctx_away_valid || new_queue) {
2008 		GEM_BUG_ON(!ctx_to_valid);
2009 		return true;
2010 	}
2011 
2012 	/*
2013 	 * switch detail = 5 is covered by the case above and we do not expect a
2014 	 * context switch on an unsuccessful wait instruction since we always
2015 	 * use polling mode.
2016 	 */
2017 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2018 	return false;
2019 }
2020 
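/*
 * On Gen8-11 the decision is much simpler: promote the pending set
 * whenever the status dword reports an idle-to-active transition or a
 * preemption, i.e. whenever the ELSP submission we issued has taken
 * over from whatever was running before.
 */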
2021 static inline bool
2022 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2023 {
2024 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2025 }
2026 
2027 static void process_csb(struct intel_engine_cs *engine)
2028 {
2029 	struct intel_engine_execlists * const execlists = &engine->execlists;
2030 	const u32 * const buf = execlists->csb_status;
2031 	const u8 num_entries = execlists->csb_size;
2032 	u8 head, tail;
2033 
2034 	/*
2035 	 * As we modify our execlists state tracking we require exclusive
2036 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2037 	 * and we assume that is only inside the reset paths and so serialised.
2038 	 */
2039 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2040 		   !reset_in_progress(execlists));
2041 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2042 
2043 	/*
2044 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2045 	 * When reading from the csb_write mmio register, we have to be
2046 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2047 	 * the low 4bits. As it happens we know the next 4bits are always
2048 	 * zero and so we can simply mask off the low u8 of the register
2049 	 * and treat it identically to reading from the HWSP (without having
2050 	 * to use explicit shifting and masking, and probably bifurcating
2051 	 * the code to handle the legacy mmio read).
2052 	 */
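	/*
	 * As a small illustration of the above: a raw value of 0x00000005
	 * carries write pointer 5 in bits 0-3 with bits 4-7 known to be
	 * zero, so truncating the read to a u8 yields the same 5 we would
	 * have fetched from the HWSP copy, with no extra masking required.
	 */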
2053 	head = execlists->csb_head;
2054 	tail = READ_ONCE(*execlists->csb_write);
2055 	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
2056 	if (unlikely(head == tail))
2057 		return;
2058 
2059 	/*
2060 	 * Hopefully paired with a wmb() in HW!
2061 	 *
2062 	 * We must complete the read of the write pointer before any reads
2063 	 * from the CSB, so that we do not see stale values. Without an rmb
2064 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2065 	 * we perform the READ_ONCE(*csb_write).
2066 	 */
2067 	rmb();
2068 
2069 	do {
2070 		bool promote;
2071 
2072 		if (++head == num_entries)
2073 			head = 0;
2074 
2075 		/*
2076 		 * We are flying near dragons again.
2077 		 *
2078 		 * We hold a reference to the request in execlist_port[]
2079 		 * but no more than that. We are operating in softirq
2080 		 * context and so cannot hold any mutex or sleep. That
2081 		 * prevents us from stopping the requests we are processing
2082 		 * in port[] from being retired simultaneously (the
2083 		 * breadcrumb will be complete before we see the
2084 		 * context-switch). As we only hold the reference to the
2085 		 * request, any pointer chasing underneath the request
2086 		 * is subject to a potential use-after-free. Thus we
2087 		 * store all of the bookkeeping within port[] as
2088 		 * required, and avoid using unguarded pointers beneath
2089 		 * request itself. The same applies to the atomic
2090 		 * status notifier.
2091 		 */
2092 
2093 		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
2094 			  engine->name, head,
2095 			  buf[2 * head + 0], buf[2 * head + 1]);
2096 
2097 		if (INTEL_GEN(engine->i915) >= 12)
2098 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2099 		else
2100 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2101 		if (promote) {
2102 			if (!inject_preempt_hang(execlists))
2103 				ring_set_paused(engine, 0);
2104 
2105 			/* cancel old inflight, prepare for switch */
2106 			trace_ports(execlists, "preempted", execlists->active);
2107 			while (*execlists->active)
2108 				execlists_schedule_out(*execlists->active++);
2109 
2110 			/* switch pending to inflight */
2111 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2112 			execlists->active =
2113 				memcpy(execlists->inflight,
2114 				       execlists->pending,
2115 				       execlists_num_ports(execlists) *
2116 				       sizeof(*execlists->pending));
2117 
2118 			set_timeslice(engine);
2119 
2120 			WRITE_ONCE(execlists->pending[0], NULL);
2121 		} else {
2122 			GEM_BUG_ON(!*execlists->active);
2123 
2124 			/* port0 completed, advanced to port1 */
2125 			trace_ports(execlists, "completed", execlists->active);
2126 
2127 			/*
2128 			 * We rely on the hardware being strongly
2129 			 * ordered, that the breadcrumb write is
2130 			 * coherent (visible from the CPU) before the
2131 			 * user interrupt and CSB is processed.
2132 			 */
2133 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2134 				   !reset_in_progress(execlists));
2135 			execlists_schedule_out(*execlists->active++);
2136 
2137 			GEM_BUG_ON(execlists->active - execlists->inflight >
2138 				   execlists_num_ports(execlists));
2139 		}
2140 	} while (head != tail);
2141 
2142 	execlists->csb_head = head;
2143 
2144 	/*
2145 	 * Gen11 has proven to fail wrt global observation point between
2146 	 * entry and tail update, failing on the ordering and thus
2147 	 * we see an old entry in the context status buffer.
2148 	 *
2149 	 * Forcibly evict the cached entries before the next gpu csb
2150 	 * update, to increase the odds that we read fresh entries even on
2151 	 * such non-working hardware. The cost of doing so mostly comes out
2152 	 * in the wash, as the hardware, working or not, will need to do
2153 	 * the invalidation beforehand anyway.
2154 	 */
2155 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2156 }
2157 
2158 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2159 {
2160 	lockdep_assert_held(&engine->active.lock);
2161 	if (!engine->execlists.pending[0]) {
2162 		rcu_read_lock(); /* protect peeking at execlists->active */
2163 		execlists_dequeue(engine);
2164 		rcu_read_unlock();
2165 	}
2166 }
2167 
2168 static noinline void preempt_reset(struct intel_engine_cs *engine)
2169 {
2170 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2171 	unsigned long *lock = &engine->gt->reset.flags;
2172 
2173 	if (i915_modparams.reset < 3)
2174 		return;
2175 
2176 	if (test_and_set_bit(bit, lock))
2177 		return;
2178 
2179 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2180 	tasklet_disable_nosync(&engine->execlists.tasklet);
2181 
2182 	GEM_TRACE("%s: preempt timeout %lu+%ums\n",
2183 		  engine->name,
2184 		  READ_ONCE(engine->props.preempt_timeout_ms),
2185 		  jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2186 	intel_engine_reset(engine, "preemption time out");
2187 
2188 	tasklet_enable(&engine->execlists.tasklet);
2189 	clear_and_wake_up_bit(bit, lock);
2190 }
2191 
2192 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2193 {
2194 	const struct timer_list *t = &engine->execlists.preempt;
2195 
2196 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2197 		return false;
2198 
2199 	if (!timer_expired(t))
2200 		return false;
2201 
2202 	return READ_ONCE(engine->execlists.pending[0]);
2203 }
2204 
2205 /*
2206  * Check the unread Context Status Buffers and manage the submission of new
2207  * contexts to the ELSP accordingly.
2208  */
2209 static void execlists_submission_tasklet(unsigned long data)
2210 {
2211 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2212 	bool timeout = preempt_timeout(engine);
2213 
2214 	process_csb(engine);
2215 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2216 		unsigned long flags;
2217 
2218 		spin_lock_irqsave(&engine->active.lock, flags);
2219 		__execlists_submission_tasklet(engine);
2220 		spin_unlock_irqrestore(&engine->active.lock, flags);
2221 
2222 		/* Recheck after serialising with direct-submission */
2223 		if (timeout && preempt_timeout(engine))
2224 			preempt_reset(engine);
2225 	}
2226 }
2227 
2228 static void __execlists_kick(struct intel_engine_execlists *execlists)
2229 {
2230 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2231 	tasklet_hi_schedule(&execlists->tasklet);
2232 }
2233 
2234 #define execlists_kick(t, member) \
2235 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2236 
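/*
 * For example, execlists_kick(timer, timer) in execlists_timeslice()
 * below expands to
 *
 *	__execlists_kick(container_of(timer, struct intel_engine_execlists, timer));
 *
 * i.e. the timer_list pointer handed to the callback is mapped back to
 * its enclosing intel_engine_execlists before the tasklet is kicked.
 */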
2237 static void execlists_timeslice(struct timer_list *timer)
2238 {
2239 	execlists_kick(timer, timer);
2240 }
2241 
2242 static void execlists_preempt(struct timer_list *timer)
2243 {
2244 	execlists_kick(timer, preempt);
2245 }
2246 
2247 static void queue_request(struct intel_engine_cs *engine,
2248 			  struct i915_sched_node *node,
2249 			  int prio)
2250 {
2251 	GEM_BUG_ON(!list_empty(&node->link));
2252 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2253 }
2254 
2255 static void __submit_queue_imm(struct intel_engine_cs *engine)
2256 {
2257 	struct intel_engine_execlists * const execlists = &engine->execlists;
2258 
2259 	if (reset_in_progress(execlists))
2260 		return; /* defer until we restart the engine following reset */
2261 
2262 	if (execlists->tasklet.func == execlists_submission_tasklet)
2263 		__execlists_submission_tasklet(engine);
2264 	else
2265 		tasklet_hi_schedule(&execlists->tasklet);
2266 }
2267 
2268 static void submit_queue(struct intel_engine_cs *engine,
2269 			 const struct i915_request *rq)
2270 {
2271 	struct intel_engine_execlists *execlists = &engine->execlists;
2272 
2273 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2274 		return;
2275 
2276 	execlists->queue_priority_hint = rq_prio(rq);
2277 	__submit_queue_imm(engine);
2278 }
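
/*
 * For example, if queue_priority_hint is already 2 (e.g. because a
 * prio-2 request is already waiting in the queue) and a prio-0 request
 * arrives, submit_queue() returns early without kicking the tasklet;
 * the new request simply sits in the priolist until the next dequeue.
 * Only a request with a priority above the current hint forces an
 * immediate resubmission.
 */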
2279 
2280 static void execlists_submit_request(struct i915_request *request)
2281 {
2282 	struct intel_engine_cs *engine = request->engine;
2283 	unsigned long flags;
2284 
2285 	/* Will be called from irq-context when using foreign fences. */
2286 	spin_lock_irqsave(&engine->active.lock, flags);
2287 
2288 	queue_request(engine, &request->sched, rq_prio(request));
2289 
2290 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2291 	GEM_BUG_ON(list_empty(&request->sched.link));
2292 
2293 	submit_queue(engine, request);
2294 
2295 	spin_unlock_irqrestore(&engine->active.lock, flags);
2296 }
2297 
2298 static void __execlists_context_fini(struct intel_context *ce)
2299 {
2300 	intel_ring_put(ce->ring);
2301 	i915_vma_put(ce->state);
2302 }
2303 
2304 static void execlists_context_destroy(struct kref *kref)
2305 {
2306 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2307 
2308 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2309 	GEM_BUG_ON(intel_context_is_pinned(ce));
2310 
2311 	if (ce->state)
2312 		__execlists_context_fini(ce);
2313 
2314 	intel_context_fini(ce);
2315 	intel_context_free(ce);
2316 }
2317 
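/*
 * Redzone checking (CONFIG_DRM_I915_DEBUG_GEM only): the page following
 * the context image is filled with POISON_INUSE by set_redzone() and
 * re-checked by check_redzone() when the context is unpinned, so any
 * stray write past engine->context_size (by the HW or by us) is flagged
 * via the dev_err_once() below.
 */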
2318 static void
2319 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2320 {
2321 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2322 		return;
2323 
2324 	vaddr += engine->context_size;
2325 
2326 	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
2327 }
2328 
2329 static void
2330 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2331 {
2332 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2333 		return;
2334 
2335 	vaddr += engine->context_size;
2336 
2337 	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
2338 		dev_err_once(engine->i915->drm.dev,
2339 			     "%s context redzone overwritten!\n",
2340 			     engine->name);
2341 }
2342 
2343 static void execlists_context_unpin(struct intel_context *ce)
2344 {
2345 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2346 		      ce->engine);
2347 
2348 	i915_gem_object_unpin_map(ce->state->obj);
2349 	intel_ring_reset(ce->ring, ce->ring->tail);
2350 }
2351 
2352 static void
2353 __execlists_update_reg_state(const struct intel_context *ce,
2354 			     const struct intel_engine_cs *engine)
2355 {
2356 	struct intel_ring *ring = ce->ring;
2357 	u32 *regs = ce->lrc_reg_state;
2358 
2359 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2360 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2361 
2362 	regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma);
2363 	regs[CTX_RING_HEAD] = ring->head;
2364 	regs[CTX_RING_TAIL] = ring->tail;
2365 
2366 	/* RPCS */
2367 	if (engine->class == RENDER_CLASS) {
2368 		regs[CTX_R_PWR_CLK_STATE] =
2369 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2370 
2371 		i915_oa_init_reg_state(ce, engine);
2372 	}
2373 }
2374 
2375 static int
2376 __execlists_context_pin(struct intel_context *ce,
2377 			struct intel_engine_cs *engine)
2378 {
2379 	void *vaddr;
2380 	int ret;
2381 
2382 	GEM_BUG_ON(!ce->state);
2383 
2384 	ret = intel_context_active_acquire(ce);
2385 	if (ret)
2386 		goto err;
2387 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2388 
2389 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2390 					i915_coherent_map_type(engine->i915) |
2391 					I915_MAP_OVERRIDE);
2392 	if (IS_ERR(vaddr)) {
2393 		ret = PTR_ERR(vaddr);
2394 		goto unpin_active;
2395 	}
2396 
2397 	ce->lrc_desc = lrc_descriptor(ce, engine);
2398 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2399 	__execlists_update_reg_state(ce, engine);
2400 
2401 	return 0;
2402 
2403 unpin_active:
2404 	intel_context_active_release(ce);
2405 err:
2406 	return ret;
2407 }
2408 
2409 static int execlists_context_pin(struct intel_context *ce)
2410 {
2411 	return __execlists_context_pin(ce, ce->engine);
2412 }
2413 
2414 static int execlists_context_alloc(struct intel_context *ce)
2415 {
2416 	return __execlists_context_alloc(ce, ce->engine);
2417 }
2418 
2419 static void execlists_context_reset(struct intel_context *ce)
2420 {
2421 	/*
2422 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2423 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2424 	 * that stored in context. As we only write new commands from
2425 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2426 	 * starts reading from its RING_HEAD from the context, it may try to
2427 	 * execute that junk and die.
2428 	 *
2429 	 * The contexts that are still pinned on resume belong to the
2430 	 * kernel, and are local to each engine. All other contexts will
2431 	 * have their head/tail sanitized upon pinning before use, so they
2432 	 * will never see garbage.
2433 	 *
2434 	 * So to avoid that we reset the context images upon resume. For
2435 	 * simplicity, we just zero everything out.
2436 	 */
2437 	intel_ring_reset(ce->ring, 0);
2438 	__execlists_update_reg_state(ce, ce->engine);
2439 }
2440 
2441 static const struct intel_context_ops execlists_context_ops = {
2442 	.alloc = execlists_context_alloc,
2443 
2444 	.pin = execlists_context_pin,
2445 	.unpin = execlists_context_unpin,
2446 
2447 	.enter = intel_context_enter_engine,
2448 	.exit = intel_context_exit_engine,
2449 
2450 	.reset = execlists_context_reset,
2451 	.destroy = execlists_context_destroy,
2452 };
2453 
2454 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2455 {
2456 	u32 *cs;
2457 
2458 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2459 
2460 	cs = intel_ring_begin(rq, 6);
2461 	if (IS_ERR(cs))
2462 		return PTR_ERR(cs);
2463 
2464 	/*
2465 	 * Check if we have been preempted before we even get started.
2466 	 *
2467 	 * After this point i915_request_started() reports true, even if
2468 	 * we get preempted and so are no longer running.
2469 	 */
2470 	*cs++ = MI_ARB_CHECK;
2471 	*cs++ = MI_NOOP;
2472 
2473 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2474 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2475 	*cs++ = 0;
2476 	*cs++ = rq->fence.seqno - 1;
2477 
2478 	intel_ring_advance(rq, cs);
2479 
2480 	/* Record the updated position of the request's payload */
2481 	rq->infix = intel_ring_offset(rq, cs);
2482 
2483 	return 0;
2484 }
2485 
2486 static int execlists_request_alloc(struct i915_request *request)
2487 {
2488 	int ret;
2489 
2490 	GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
2491 
2492 	/*
2493 	 * Flush enough space to reduce the likelihood of waiting after
2494 	 * we start building the request - in which case we will just
2495 	 * have to repeat work.
2496 	 */
2497 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2498 
2499 	/*
2500 	 * Note that after this point, we have committed to using
2501 	 * this request as it is being used to both track the
2502 	 * state of engine initialisation and liveness of the
2503 	 * golden renderstate above. Think twice before you try
2504 	 * to cancel/unwind this request now.
2505 	 */
2506 
2507 	/* Unconditionally invalidate GPU caches and TLBs. */
2508 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2509 	if (ret)
2510 		return ret;
2511 
2512 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2513 	return 0;
2514 }
2515 
2516 /*
2517  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2518  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2519  * but there is a slight complication as this is applied in WA batch where the
2520  * values are only initialized once so we cannot take register value at the
2521  * beginning and reuse it further; hence we save its value to memory, upload a
2522  * constant value with bit21 set and then we restore it back with the saved value.
2523  * To simplify the WA, a constant value is formed by using the default value
2524  * of this register. This shouldn't be a problem because we are only modifying
2525  * it for a short period and this batch is non-preemptible. We can of course
2526  * use additional instructions that read the actual value of the register
2527  * at that time and set our bit of interest but it makes the WA complicated.
2528  *
2529  * This WA is also required for Gen9 so extracting as a function avoids
2530  * code duplication.
2531  */
2532 static u32 *
2533 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2534 {
2535 	/* NB no one else is allowed to scribble over scratch + 256! */
2536 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2537 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2538 	*batch++ = intel_gt_scratch_offset(engine->gt,
2539 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2540 	*batch++ = 0;
2541 
2542 	*batch++ = MI_LOAD_REGISTER_IMM(1);
2543 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2544 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2545 
2546 	batch = gen8_emit_pipe_control(batch,
2547 				       PIPE_CONTROL_CS_STALL |
2548 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
2549 				       0);
2550 
2551 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2552 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2553 	*batch++ = intel_gt_scratch_offset(engine->gt,
2554 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2555 	*batch++ = 0;
2556 
2557 	return batch;
2558 }
2559 
2560 /*
2561  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2562  * initialized at the beginning and shared across all contexts but this field
2563  * helps us to have multiple batches at different offsets and select them based
2564  * on a criterion. At the moment this batch always starts at the beginning of the
2565  * page and at this point we don't have multiple wa_ctx batch buffers.
2566  *
2567  * The number of WAs applied is not known at the beginning; we use this field
2568  * to return the number of DWORDS written.
2569  *
2570  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2571  * so it adds NOOPs as padding to make it cacheline aligned.
2572  * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together
2573  * make a complete batch buffer.
2574  */
2575 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2576 {
2577 	/* WaDisableCtxRestoreArbitration:bdw,chv */
2578 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2579 
2580 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2581 	if (IS_BROADWELL(engine->i915))
2582 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2583 
2584 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2585 	/* Actual scratch location is at 128 bytes offset */
2586 	batch = gen8_emit_pipe_control(batch,
2587 				       PIPE_CONTROL_FLUSH_L3 |
2588 				       PIPE_CONTROL_STORE_DATA_INDEX |
2589 				       PIPE_CONTROL_CS_STALL |
2590 				       PIPE_CONTROL_QW_WRITE,
2591 				       LRC_PPHWSP_SCRATCH_ADDR);
2592 
2593 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2594 
2595 	/* Pad to end of cacheline */
2596 	while ((unsigned long)batch % CACHELINE_BYTES)
2597 		*batch++ = MI_NOOP;
2598 
2599 	/*
2600 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2601 	 * execution depends on the length specified in terms of cache lines
2602 	 * in the register CTX_RCS_INDIRECT_CTX
2603 	 */
2604 
2605 	return batch;
2606 }
2607 
2608 struct lri {
2609 	i915_reg_t reg;
2610 	u32 value;
2611 };
2612 
2613 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2614 {
2615 	GEM_BUG_ON(!count || count > 63);
2616 
2617 	*batch++ = MI_LOAD_REGISTER_IMM(count);
2618 	do {
2619 		*batch++ = i915_mmio_reg_offset(lri->reg);
2620 		*batch++ = lri->value;
2621 	} while (lri++, --count);
2622 	*batch++ = MI_NOOP;
2623 
2624 	return batch;
2625 }
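
/*
 * emit_lri() packs up to 63 register/value pairs into a single
 * MI_LOAD_REGISTER_IMM (the bound asserted above). For the three-entry
 * lri[] table in gen9_init_indirectctx_bb() below this emits
 * 1 + 3 * 2 = 7 dwords, plus the trailing MI_NOOP for 8 in total.
 */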
2626 
2627 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2628 {
2629 	static const struct lri lri[] = {
2630 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2631 		{
2632 			COMMON_SLICE_CHICKEN2,
2633 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2634 				       0),
2635 		},
2636 
2637 		/* BSpec: 11391 */
2638 		{
2639 			FF_SLICE_CHICKEN,
2640 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2641 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2642 		},
2643 
2644 		/* BSpec: 11299 */
2645 		{
2646 			_3D_CHICKEN3,
2647 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2648 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2649 		}
2650 	};
2651 
2652 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2653 
2654 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2655 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2656 
2657 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2658 
2659 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
2660 	if (HAS_POOLED_EU(engine->i915)) {
2661 		/*
2662 		 * EU pool configuration is setup along with golden context
2663 		 * during context initialization. This value depends on
2664 		 * device type (2x6 or 3x6) and needs to be updated based
2665 		 * on which subslice is disabled especially for 2x6
2666 		 * devices, however it is safe to load default
2667 		 * configuration of 3x6 device instead of masking off
2668 		 * corresponding bits because HW ignores bits of a disabled
2669 		 * subslice and drops down to appropriate config. Please
2670 		 * see render_state_setup() in i915_gem_render_state.c for
2671 		 * possible configurations, to avoid duplication they are
2672 		 * not shown here again.
2673 		 */
2674 		*batch++ = GEN9_MEDIA_POOL_STATE;
2675 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
2676 		*batch++ = 0x00777000;
2677 		*batch++ = 0;
2678 		*batch++ = 0;
2679 		*batch++ = 0;
2680 	}
2681 
2682 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2683 
2684 	/* Pad to end of cacheline */
2685 	while ((unsigned long)batch % CACHELINE_BYTES)
2686 		*batch++ = MI_NOOP;
2687 
2688 	return batch;
2689 }
2690 
2691 static u32 *
2692 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2693 {
2694 	int i;
2695 
2696 	/*
2697 	 * WaPipeControlBefore3DStateSamplePattern: cnl
2698 	 *
2699 	 * Ensure the engine is idle prior to programming a
2700 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2701 	 */
2702 	batch = gen8_emit_pipe_control(batch,
2703 				       PIPE_CONTROL_CS_STALL,
2704 				       0);
2705 	/*
2706 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2707 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2708 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2709 	 * confusing. Since gen8_emit_pipe_control() already advances the
2710 	 * batch by 6 dwords, we advance the other 10 here, completing a
2711 	 * cacheline. It's not clear if the workaround requires this padding
2712 	 * before other commands, or if it's just the regular padding we would
2713 	 * already have for the workaround bb, so leave it here for now.
2714 	 */
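	/*
	 * For reference: 6 dwords of PIPE_CONTROL plus these 10 NOOPs is
	 * 16 dwords, i.e. 64 bytes, which lines up with the CACHELINE_BYTES
	 * padding below on 64-byte cacheline parts.
	 */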
2715 	for (i = 0; i < 10; i++)
2716 		*batch++ = MI_NOOP;
2717 
2718 	/* Pad to end of cacheline */
2719 	while ((unsigned long)batch % CACHELINE_BYTES)
2720 		*batch++ = MI_NOOP;
2721 
2722 	return batch;
2723 }
2724 
2725 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2726 
2727 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2728 {
2729 	struct drm_i915_gem_object *obj;
2730 	struct i915_vma *vma;
2731 	int err;
2732 
2733 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2734 	if (IS_ERR(obj))
2735 		return PTR_ERR(obj);
2736 
2737 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2738 	if (IS_ERR(vma)) {
2739 		err = PTR_ERR(vma);
2740 		goto err;
2741 	}
2742 
2743 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2744 	if (err)
2745 		goto err;
2746 
2747 	engine->wa_ctx.vma = vma;
2748 	return 0;
2749 
2750 err:
2751 	i915_gem_object_put(obj);
2752 	return err;
2753 }
2754 
2755 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2756 {
2757 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2758 }
2759 
2760 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2761 
2762 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2763 {
2764 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2765 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2766 					    &wa_ctx->per_ctx };
2767 	wa_bb_func_t wa_bb_fn[2];
2768 	struct page *page;
2769 	void *batch, *batch_ptr;
2770 	unsigned int i;
2771 	int ret;
2772 
2773 	if (engine->class != RENDER_CLASS)
2774 		return 0;
2775 
2776 	switch (INTEL_GEN(engine->i915)) {
2777 	case 12:
2778 	case 11:
2779 		return 0;
2780 	case 10:
2781 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
2782 		wa_bb_fn[1] = NULL;
2783 		break;
2784 	case 9:
2785 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
2786 		wa_bb_fn[1] = NULL;
2787 		break;
2788 	case 8:
2789 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
2790 		wa_bb_fn[1] = NULL;
2791 		break;
2792 	default:
2793 		MISSING_CASE(INTEL_GEN(engine->i915));
2794 		return 0;
2795 	}
2796 
2797 	ret = lrc_setup_wa_ctx(engine);
2798 	if (ret) {
2799 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2800 		return ret;
2801 	}
2802 
2803 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2804 	batch = batch_ptr = kmap_atomic(page);
2805 
2806 	/*
2807 	 * Emit the two workaround batch buffers, recording the offset from the
2808 	 * start of the workaround batch buffer object for each and their
2809 	 * respective sizes.
2810 	 */
2811 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2812 		wa_bb[i]->offset = batch_ptr - batch;
2813 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2814 						  CACHELINE_BYTES))) {
2815 			ret = -EINVAL;
2816 			break;
2817 		}
2818 		if (wa_bb_fn[i])
2819 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2820 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2821 	}
2822 
2823 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2824 
2825 	kunmap_atomic(batch);
2826 	if (ret)
2827 		lrc_destroy_wa_ctx(engine);
2828 
2829 	return ret;
2830 }
2831 
2832 static void enable_execlists(struct intel_engine_cs *engine)
2833 {
2834 	u32 mode;
2835 
2836 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2837 
2838 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2839 
2840 	if (INTEL_GEN(engine->i915) >= 11)
2841 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2842 	else
2843 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2844 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2845 
2846 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2847 
2848 	ENGINE_WRITE_FW(engine,
2849 			RING_HWS_PGA,
2850 			i915_ggtt_offset(engine->status_page.vma));
2851 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2852 }
2853 
2854 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2855 {
2856 	bool unexpected = false;
2857 
2858 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2859 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2860 		unexpected = true;
2861 	}
2862 
2863 	return unexpected;
2864 }
2865 
2866 static int execlists_resume(struct intel_engine_cs *engine)
2867 {
2868 	intel_engine_apply_workarounds(engine);
2869 	intel_engine_apply_whitelist(engine);
2870 
2871 	intel_mocs_init_engine(engine);
2872 
2873 	intel_engine_reset_breadcrumbs(engine);
2874 
2875 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2876 		struct drm_printer p = drm_debug_printer(__func__);
2877 
2878 		intel_engine_dump(engine, &p, NULL);
2879 	}
2880 
2881 	enable_execlists(engine);
2882 
2883 	return 0;
2884 }
2885 
2886 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2887 {
2888 	struct intel_engine_execlists * const execlists = &engine->execlists;
2889 	unsigned long flags;
2890 
2891 	GEM_TRACE("%s: depth<-%d\n", engine->name,
2892 		  atomic_read(&execlists->tasklet.count));
2893 
2894 	/*
2895 	 * Prevent request submission to the hardware until we have
2896 	 * completed the reset in i915_gem_reset_finish(). If a request
2897 	 * is completed by one engine, it may then queue a request
2898 	 * to a second via its execlists->tasklet *just* as we are
2899 	 * calling engine->resume() and also writing the ELSP.
2900 	 * Turning off the execlists->tasklet until the reset is over
2901 	 * prevents the race.
2902 	 */
2903 	__tasklet_disable_sync_once(&execlists->tasklet);
2904 	GEM_BUG_ON(!reset_in_progress(execlists));
2905 
2906 	/* And flush any current direct submission. */
2907 	spin_lock_irqsave(&engine->active.lock, flags);
2908 	spin_unlock_irqrestore(&engine->active.lock, flags);
2909 
2910 	/*
2911 	 * We stop the engines, otherwise we might get a failed reset and a
2912 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
2913 	 * from a system hang if a batchbuffer is progressing when
2914 	 * the reset is issued, regardless of READY_TO_RESET ack.
2915 	 * Thus assume it is best to stop engines on all gens
2916 	 * where we have a gpu reset.
2917 	 *
2918 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2919 	 *
2920 	 * FIXME: Wa for more modern gens needs to be validated
2921 	 */
2922 	intel_engine_stop_cs(engine);
2923 }
2924 
2925 static void reset_csb_pointers(struct intel_engine_cs *engine)
2926 {
2927 	struct intel_engine_execlists * const execlists = &engine->execlists;
2928 	const unsigned int reset_value = execlists->csb_size - 1;
2929 
2930 	ring_set_paused(engine, 0);
2931 
2932 	/*
2933 	 * After a reset, the HW starts writing into CSB entry [0]. We
2934 	 * therefore have to set our HEAD pointer back one entry so that
2935 	 * the *first* entry we check is entry 0. To complicate this further,
2936 	 * as we don't wait for the first interrupt after reset, we have to
2937 	 * fake the HW write to point back to the last entry so that our
2938 	 * inline comparison of our cached head position against the last HW
2939 	 * write works even before the first interrupt.
2940 	 */
2941 	execlists->csb_head = reset_value;
2942 	WRITE_ONCE(*execlists->csb_write, reset_value);
2943 	wmb(); /* Make sure this is visible to HW (paranoia?) */
2944 
2945 	invalidate_csb_entries(&execlists->csb_status[0],
2946 			       &execlists->csb_status[reset_value]);
2947 }
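
/*
 * Worked example: with, say, a 12-entry CSB, reset_value is 11, so both
 * csb_head and the faked csb_write start out at 11. The first
 * "++head == num_entries" step in process_csb() then wraps head back to
 * 0, making entry 0 the first one examined after the reset, exactly as
 * described above.
 */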
2948 
2949 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
2950 {
2951 	if (INTEL_GEN(engine->i915) >= 12)
2952 		return 0x60;
2953 	else if (INTEL_GEN(engine->i915) >= 9)
2954 		return 0x54;
2955 	else if (engine->class == RENDER_CLASS)
2956 		return 0x58;
2957 	else
2958 		return -1;
2959 }
2960 
2961 static void __execlists_reset_reg_state(const struct intel_context *ce,
2962 					const struct intel_engine_cs *engine)
2963 {
2964 	u32 *regs = ce->lrc_reg_state;
2965 	int x;
2966 
2967 	x = lrc_ring_mi_mode(engine);
2968 	if (x != -1) {
2969 		regs[x + 1] &= ~STOP_RING;
2970 		regs[x + 1] |= STOP_RING << 16;
2971 	}
2972 }
2973 
2974 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2975 {
2976 	struct intel_engine_execlists * const execlists = &engine->execlists;
2977 	struct intel_context *ce;
2978 	struct i915_request *rq;
2979 
2980 	mb(); /* paranoia: read the CSB pointers from after the reset */
2981 	clflush(execlists->csb_write);
2982 	mb();
2983 
2984 	process_csb(engine); /* drain preemption events */
2985 
2986 	/* Following the reset, we need to reload the CSB read/write pointers */
2987 	reset_csb_pointers(engine);
2988 
2989 	/*
2990 	 * Save the currently executing context, even if we completed
2991 	 * its request, it was still running at the time of the
2992 	 * reset and will have been clobbered.
2993 	 */
2994 	rq = execlists_active(execlists);
2995 	if (!rq)
2996 		goto unwind;
2997 
2998 	/* We still have requests in-flight; the engine should be active */
2999 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3000 
3001 	ce = rq->hw_context;
3002 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3003 
3004 	if (i915_request_completed(rq)) {
3005 		/* Idle context; tidy up the ring so we can restart afresh */
3006 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3007 		goto out_replay;
3008 	}
3009 
3010 	/* Context has requests still in-flight; it should not be idle! */
3011 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3012 	rq = active_request(ce->timeline, rq);
3013 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3014 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3015 
3016 	/*
3017 	 * If this request hasn't started yet, e.g. it is waiting on a
3018 	 * semaphore, we need to avoid skipping the request or else we
3019 	 * break the signaling chain. However, if the context is corrupt
3020 	 * the request will not restart and we will be stuck with a wedged
3021 	 * device. It is quite often the case that if we issue a reset
3022 	 * while the GPU is loading the context image, that the context
3023 	 * image becomes corrupt.
3024 	 *
3025 	 * Otherwise, if we have not started yet, the request should replay
3026 	 * perfectly and we do not need to flag the result as being erroneous.
3027 	 */
3028 	if (!i915_request_started(rq))
3029 		goto out_replay;
3030 
3031 	/*
3032 	 * If the request was innocent, we leave the request in the ELSP
3033 	 * and will try to replay it on restarting. The context image may
3034 	 * have been corrupted by the reset, in which case we may have
3035 	 * to service a new GPU hang, but more likely we can continue on
3036 	 * without impact.
3037 	 *
3038 	 * If the request was guilty, we presume the context is corrupt
3039 	 * and have to at least restore the RING register in the context
3040 	 * image back to the expected values to skip over the guilty request.
3041 	 */
3042 	__i915_request_reset(rq, stalled);
3043 	if (!stalled)
3044 		goto out_replay;
3045 
3046 	/*
3047 	 * We want a simple context + ring to execute the breadcrumb update.
3048 	 * We cannot rely on the context being intact across the GPU hang,
3049 	 * so clear it and rebuild just what we need for the breadcrumb.
3050 	 * All pending requests for this context will be zapped, and any
3051 	 * future request will be after userspace has had the opportunity
3052 	 * to recreate its own state.
3053 	 */
3054 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3055 	restore_default_state(ce, engine);
3056 
3057 out_replay:
3058 	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
3059 		  engine->name, ce->ring->head, ce->ring->tail);
3060 	intel_ring_update_space(ce->ring);
3061 	__execlists_reset_reg_state(ce, engine);
3062 	__execlists_update_reg_state(ce, engine);
3063 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3064 
3065 unwind:
3066 	/* Push back any incomplete requests for replay after the reset. */
3067 	cancel_port_requests(execlists);
3068 	__unwind_incomplete_requests(engine);
3069 }
3070 
3071 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
3072 {
3073 	unsigned long flags;
3074 
3075 	GEM_TRACE("%s\n", engine->name);
3076 
3077 	spin_lock_irqsave(&engine->active.lock, flags);
3078 
3079 	__execlists_reset(engine, stalled);
3080 
3081 	spin_unlock_irqrestore(&engine->active.lock, flags);
3082 }
3083 
3084 static void nop_submission_tasklet(unsigned long data)
3085 {
3086 	/* The driver is wedged; don't process any more events. */
3087 }
3088 
3089 static void execlists_cancel_requests(struct intel_engine_cs *engine)
3090 {
3091 	struct intel_engine_execlists * const execlists = &engine->execlists;
3092 	struct i915_request *rq, *rn;
3093 	struct rb_node *rb;
3094 	unsigned long flags;
3095 
3096 	GEM_TRACE("%s\n", engine->name);
3097 
3098 	/*
3099 	 * Before we call engine->cancel_requests(), we should have exclusive
3100 	 * access to the submission state. This is arranged for us by the
3101 	 * caller disabling the interrupt generation, the tasklet and other
3102 	 * threads that may then access the same state, giving us a free hand
3103 	 * to reset state. However, we still need to let lockdep be aware that
3104 	 * we know this state may be accessed in hardirq context, so we
3105 	 * disable the irq around this manipulation and we want to keep
3106 	 * the spinlock focused on its duties and not accidentally conflate
3107 	 * coverage to the submission's irq state. (Similarly, although we
3108 	 * shouldn't need to disable irq around the manipulation of the
3109 	 * submission's irq state, we also wish to remind ourselves that
3110 	 * it is irq state.)
3111 	 */
3112 	spin_lock_irqsave(&engine->active.lock, flags);
3113 
3114 	__execlists_reset(engine, true);
3115 
3116 	/* Mark all executing requests as skipped. */
3117 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3118 		mark_eio(rq);
3119 
3120 	/* Flush the queued requests to the timeline list (for retiring). */
3121 	while ((rb = rb_first_cached(&execlists->queue))) {
3122 		struct i915_priolist *p = to_priolist(rb);
3123 		int i;
3124 
3125 		priolist_for_each_request_consume(rq, rn, p, i) {
3126 			mark_eio(rq);
3127 			__i915_request_submit(rq);
3128 		}
3129 
3130 		rb_erase_cached(&p->node, &execlists->queue);
3131 		i915_priolist_free(p);
3132 	}
3133 
3134 	/* Cancel all attached virtual engines */
3135 	while ((rb = rb_first_cached(&execlists->virtual))) {
3136 		struct virtual_engine *ve =
3137 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3138 
3139 		rb_erase_cached(rb, &execlists->virtual);
3140 		RB_CLEAR_NODE(rb);
3141 
3142 		spin_lock(&ve->base.active.lock);
3143 		rq = fetch_and_zero(&ve->request);
3144 		if (rq) {
3145 			mark_eio(rq);
3146 
3147 			rq->engine = engine;
3148 			__i915_request_submit(rq);
3149 			i915_request_put(rq);
3150 
3151 			ve->base.execlists.queue_priority_hint = INT_MIN;
3152 		}
3153 		spin_unlock(&ve->base.active.lock);
3154 	}
3155 
3156 	/* Remaining _unready_ requests will be nop'ed when submitted */
3157 
3158 	execlists->queue_priority_hint = INT_MIN;
3159 	execlists->queue = RB_ROOT_CACHED;
3160 
3161 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3162 	execlists->tasklet.func = nop_submission_tasklet;
3163 
3164 	spin_unlock_irqrestore(&engine->active.lock, flags);
3165 }
3166 
3167 static void execlists_reset_finish(struct intel_engine_cs *engine)
3168 {
3169 	struct intel_engine_execlists * const execlists = &engine->execlists;
3170 
3171 	/*
3172 	 * After a GPU reset, we may have requests to replay. Do so now while
3173 	 * we still have the forcewake to be sure that the GPU is not allowed
3174 	 * to sleep before we restart and reload a context.
3175 	 */
3176 	GEM_BUG_ON(!reset_in_progress(execlists));
3177 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3178 		execlists->tasklet.func(execlists->tasklet.data);
3179 
3180 	if (__tasklet_enable(&execlists->tasklet))
3181 		/* And kick in case we missed a new request submission. */
3182 		tasklet_hi_schedule(&execlists->tasklet);
3183 	GEM_TRACE("%s: depth->%d\n", engine->name,
3184 		  atomic_read(&execlists->tasklet.count));
3185 }
3186 
3187 static int gen8_emit_bb_start(struct i915_request *rq,
3188 			      u64 offset, u32 len,
3189 			      const unsigned int flags)
3190 {
3191 	u32 *cs;
3192 
3193 	cs = intel_ring_begin(rq, 4);
3194 	if (IS_ERR(cs))
3195 		return PTR_ERR(cs);
3196 
3197 	/*
3198 	 * WaDisableCtxRestoreArbitration:bdw,chv
3199 	 *
3200 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3201 	 * particular all the gen that do not need the w/a at all!), if we
3202 	 * took care to make sure that on every switch into this context
3203 	 * (both ordinary and for preemption) that arbitration was enabled
3204 	 * we would be fine.  However, for gen8 there is another w/a that
3205 	 * requires us to not preempt inside GPGPU execution, so we keep
3206 	 * arbitration disabled for gen8 batches. Arbitration will be
3207 	 * re-enabled before we close the request
3208 	 * (engine->emit_fini_breadcrumb).
3209 	 */
3210 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3211 
3212 	/* FIXME(BDW+): Address space and security selectors. */
3213 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3214 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3215 	*cs++ = lower_32_bits(offset);
3216 	*cs++ = upper_32_bits(offset);
3217 
3218 	intel_ring_advance(rq, cs);
3219 
3220 	return 0;
3221 }
3222 
3223 static int gen9_emit_bb_start(struct i915_request *rq,
3224 			      u64 offset, u32 len,
3225 			      const unsigned int flags)
3226 {
3227 	u32 *cs;
3228 
3229 	cs = intel_ring_begin(rq, 6);
3230 	if (IS_ERR(cs))
3231 		return PTR_ERR(cs);
3232 
3233 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3234 
3235 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3236 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3237 	*cs++ = lower_32_bits(offset);
3238 	*cs++ = upper_32_bits(offset);
3239 
3240 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3241 	*cs++ = MI_NOOP;
3242 
3243 	intel_ring_advance(rq, cs);
3244 
3245 	return 0;
3246 }
3247 
3248 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3249 {
3250 	ENGINE_WRITE(engine, RING_IMR,
3251 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3252 	ENGINE_POSTING_READ(engine, RING_IMR);
3253 }
3254 
3255 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3256 {
3257 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3258 }
3259 
3260 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3261 {
3262 	u32 cmd, *cs;
3263 
3264 	cs = intel_ring_begin(request, 4);
3265 	if (IS_ERR(cs))
3266 		return PTR_ERR(cs);
3267 
3268 	cmd = MI_FLUSH_DW + 1;
3269 
3270 	/* We always require a command barrier so that subsequent
3271 	 * commands, such as breadcrumb interrupts, are strictly ordered
3272 	 * wrt the contents of the write cache being flushed to memory
3273 	 * (and thus being coherent from the CPU).
3274 	 */
3275 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3276 
3277 	if (mode & EMIT_INVALIDATE) {
3278 		cmd |= MI_INVALIDATE_TLB;
3279 		if (request->engine->class == VIDEO_DECODE_CLASS)
3280 			cmd |= MI_INVALIDATE_BSD;
3281 	}
3282 
3283 	*cs++ = cmd;
3284 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3285 	*cs++ = 0; /* upper addr */
3286 	*cs++ = 0; /* value */
3287 	intel_ring_advance(request, cs);
3288 
3289 	return 0;
3290 }
3291 
3292 static int gen8_emit_flush_render(struct i915_request *request,
3293 				  u32 mode)
3294 {
3295 	bool vf_flush_wa = false, dc_flush_wa = false;
3296 	u32 *cs, flags = 0;
3297 	int len;
3298 
3299 	flags |= PIPE_CONTROL_CS_STALL;
3300 
3301 	if (mode & EMIT_FLUSH) {
3302 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3303 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3304 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3305 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3306 	}
3307 
3308 	if (mode & EMIT_INVALIDATE) {
3309 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3310 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3311 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3312 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3313 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3314 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3315 		flags |= PIPE_CONTROL_QW_WRITE;
3316 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3317 
3318 		/*
3319 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3320 		 * pipe control.
3321 		 */
3322 		if (IS_GEN(request->i915, 9))
3323 			vf_flush_wa = true;
3324 
3325 		/* WaForGAMHang:kbl */
3326 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3327 			dc_flush_wa = true;
3328 	}
3329 
3330 	len = 6;
3331 
3332 	if (vf_flush_wa)
3333 		len += 6;
3334 
3335 	if (dc_flush_wa)
3336 		len += 12;
3337 
3338 	cs = intel_ring_begin(request, len);
3339 	if (IS_ERR(cs))
3340 		return PTR_ERR(cs);
3341 
3342 	if (vf_flush_wa)
3343 		cs = gen8_emit_pipe_control(cs, 0, 0);
3344 
3345 	if (dc_flush_wa)
3346 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3347 					    0);
3348 
3349 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3350 
3351 	if (dc_flush_wa)
3352 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3353 
3354 	intel_ring_advance(request, cs);
3355 
3356 	return 0;
3357 }
3358 
3359 static int gen11_emit_flush_render(struct i915_request *request,
3360 				   u32 mode)
3361 {
3362 	if (mode & EMIT_FLUSH) {
3363 		u32 *cs;
3364 		u32 flags = 0;
3365 
3366 		flags |= PIPE_CONTROL_CS_STALL;
3367 
3368 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3369 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3370 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3371 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3372 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3373 		flags |= PIPE_CONTROL_QW_WRITE;
3374 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3375 
3376 		cs = intel_ring_begin(request, 6);
3377 		if (IS_ERR(cs))
3378 			return PTR_ERR(cs);
3379 
3380 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3381 		intel_ring_advance(request, cs);
3382 	}
3383 
3384 	if (mode & EMIT_INVALIDATE) {
3385 		u32 *cs;
3386 		u32 flags = 0;
3387 
3388 		flags |= PIPE_CONTROL_CS_STALL;
3389 
3390 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3391 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3392 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3393 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3394 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3395 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3396 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3397 		flags |= PIPE_CONTROL_QW_WRITE;
3398 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3399 
3400 		cs = intel_ring_begin(request, 6);
3401 		if (IS_ERR(cs))
3402 			return PTR_ERR(cs);
3403 
3404 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3405 		intel_ring_advance(request, cs);
3406 	}
3407 
3408 	return 0;
3409 }
3410 
3411 static u32 preparser_disable(bool state)
3412 {
3413 	return MI_ARB_CHECK | 1 << 8 | state;
3414 }
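
/*
 * The exact bit semantics are not spelled out here, but the encoding
 * reads as a masked write: bit 8 appears to act as the write enable for
 * the pre-parser disable state carried in bit 0, which is how
 * preparser_disable(true)/preparser_disable(false) bracket the TLB
 * invalidation in gen12_emit_flush_render() below.
 */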
3415 
3416 static int gen12_emit_flush_render(struct i915_request *request,
3417 				   u32 mode)
3418 {
3419 	if (mode & EMIT_FLUSH) {
3420 		u32 flags = 0;
3421 		u32 *cs;
3422 
3423 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3424 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3425 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3426 		/* Wa_1409600907:tgl */
3427 		flags |= PIPE_CONTROL_DEPTH_STALL;
3428 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3429 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3430 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3431 
3432 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3433 		flags |= PIPE_CONTROL_QW_WRITE;
3434 
3435 		flags |= PIPE_CONTROL_CS_STALL;
3436 
3437 		cs = intel_ring_begin(request, 6);
3438 		if (IS_ERR(cs))
3439 			return PTR_ERR(cs);
3440 
3441 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3442 		intel_ring_advance(request, cs);
3443 	}
3444 
3445 	if (mode & EMIT_INVALIDATE) {
3446 		u32 flags = 0;
3447 		u32 *cs;
3448 
3449 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3450 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3451 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3452 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3453 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3454 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3455 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3456 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3457 
3458 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3459 		flags |= PIPE_CONTROL_QW_WRITE;
3460 
3461 		flags |= PIPE_CONTROL_CS_STALL;
3462 
3463 		cs = intel_ring_begin(request, 8);
3464 		if (IS_ERR(cs))
3465 			return PTR_ERR(cs);
3466 
3467 		/*
3468 		 * Prevent the pre-parser from skipping past the TLB
3469 		 * invalidate and loading a stale page for the batch
3470 		 * buffer / request payload.
3471 		 */
3472 		*cs++ = preparser_disable(true);
3473 
3474 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3475 
3476 		*cs++ = preparser_disable(false);
3477 		intel_ring_advance(request, cs);
3478 
3479 		/*
3480 		 * Wa_1604544889:tgl
3481 		 */
3482 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3483 			flags = 0;
3484 			flags |= PIPE_CONTROL_CS_STALL;
3485 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3486 
3487 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3488 			flags |= PIPE_CONTROL_QW_WRITE;
3489 
3490 			cs = intel_ring_begin(request, 6);
3491 			if (IS_ERR(cs))
3492 				return PTR_ERR(cs);
3493 
3494 			cs = gen8_emit_pipe_control(cs, flags,
3495 						    LRC_PPHWSP_SCRATCH_ADDR);
3496 			intel_ring_advance(request, cs);
3497 		}
3498 	}
3499 
3500 	return 0;
3501 }
3502 
3503 /*
3504  * Reserve space for 2 NOOPs at the end of each request to be
3505  * used as a workaround for not being allowed to do lite
3506  * restore with HEAD==TAIL (WaIdleLiteRestore).
3507  */
3508 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3509 {
3510 	/* Ensure there's always at least one preemption point per-request. */
3511 	*cs++ = MI_ARB_CHECK;
3512 	*cs++ = MI_NOOP;
3513 	request->wa_tail = intel_ring_offset(request, cs);
3514 
3515 	return cs;
3516 }
3517 
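/*
 * Poll the preemption semaphore in the HWSP (MI_SEMAPHORE_WAIT, SAD == SDD
 * with SDD = 0) so that the CS idles here for as long as the driver holds
 * the semaphore non-zero, giving preempt-to-busy a safe point between
 * requests.
 */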
3518 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3519 {
3520 	*cs++ = MI_SEMAPHORE_WAIT |
3521 		MI_SEMAPHORE_GLOBAL_GTT |
3522 		MI_SEMAPHORE_POLL |
3523 		MI_SEMAPHORE_SAD_EQ_SDD;
3524 	*cs++ = 0;
3525 	*cs++ = intel_hws_preempt_address(request->engine);
3526 	*cs++ = 0;
3527 
3528 	return cs;
3529 }
3530 
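/*
 * Common tail of every final breadcrumb: raise the user interrupt,
 * re-enable arbitration and, where semaphores are available, park on the
 * preemption semaphore, then record the tail and emit the WA padding.
 */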
3531 static __always_inline u32*
3532 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3533 				 u32 *cs)
3534 {
3535 	*cs++ = MI_USER_INTERRUPT;
3536 
3537 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3538 	if (intel_engine_has_semaphores(request->engine))
3539 		cs = emit_preempt_busywait(request, cs);
3540 
3541 	request->tail = intel_ring_offset(request, cs);
3542 	assert_ring_tail_valid(request->ring, request->tail);
3543 
3544 	return gen8_emit_wa_tail(request, cs);
3545 }
3546 
3547 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3548 {
3549 	cs = gen8_emit_ggtt_write(cs,
3550 				  request->fence.seqno,
3551 				  i915_request_active_timeline(request)->hwsp_offset,
3552 				  0);
3553 
3554 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3555 }
3556 
3557 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3558 {
3559 	cs = gen8_emit_pipe_control(cs,
3560 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3561 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3562 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
3563 				    0);
3564 
3565 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3566 	cs = gen8_emit_ggtt_write_rcs(cs,
3567 				      request->fence.seqno,
3568 				      i915_request_active_timeline(request)->hwsp_offset,
3569 				      PIPE_CONTROL_FLUSH_ENABLE |
3570 				      PIPE_CONTROL_CS_STALL);
3571 
3572 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3573 }
3574 
3575 static u32 *
3576 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3577 {
3578 	cs = gen8_emit_ggtt_write_rcs(cs,
3579 				      request->fence.seqno,
3580 				      i915_request_active_timeline(request)->hwsp_offset,
3581 				      PIPE_CONTROL_CS_STALL |
3582 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3583 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3584 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3585 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3586 				      PIPE_CONTROL_FLUSH_ENABLE);
3587 
3588 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3589 }
3590 
3591 /*
3592  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3593  * flush and will continue pre-fetching the instructions after it before the
3594  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3595  * BB_START/END instructions, so, even though we might pre-fetch the preamble
3596  * of the next request before the memory has been flushed, we're guaranteed that
3597  * we won't access the batch itself too early.
3598  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3599  * so, if the current request is modifying an instruction in the next request on
3600  * the same intel_context, we might pre-fetch and then execute the pre-update
3601  * instruction. To avoid this, the users of self-modifying code should either
3602  * disable the parser around the code emitting the memory writes, via a new flag
3603  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3604  * the in-kernel use-cases we've opted to use a separate context, see
3605  * reloc_gpu() as an example.
3606  * All the above applies only to the instructions themselves. Non-inline data
3607  * used by the instructions is not pre-fetched.
3608  */
3609 
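/*
 * Gen12 variant of the preempt busywait using the token form of
 * MI_SEMAPHORE_WAIT, which takes an extra dword; the trailing MI_NOOP
 * keeps the emission qword aligned.
 */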
3610 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3611 {
3612 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3613 		MI_SEMAPHORE_GLOBAL_GTT |
3614 		MI_SEMAPHORE_POLL |
3615 		MI_SEMAPHORE_SAD_EQ_SDD;
3616 	*cs++ = 0;
3617 	*cs++ = intel_hws_preempt_address(request->engine);
3618 	*cs++ = 0;
3619 	*cs++ = 0;
3620 	*cs++ = MI_NOOP;
3621 
3622 	return cs;
3623 }
3624 
3625 static __always_inline u32*
3626 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3627 {
3628 	*cs++ = MI_USER_INTERRUPT;
3629 
3630 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3631 	if (intel_engine_has_semaphores(request->engine))
3632 		cs = gen12_emit_preempt_busywait(request, cs);
3633 
3634 	request->tail = intel_ring_offset(request, cs);
3635 	assert_ring_tail_valid(request->ring, request->tail);
3636 
3637 	return gen8_emit_wa_tail(request, cs);
3638 }
3639 
3640 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3641 {
3642 	cs = gen8_emit_ggtt_write(cs,
3643 				  request->fence.seqno,
3644 				  i915_request_active_timeline(request)->hwsp_offset,
3645 				  0);
3646 
3647 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3648 }
3649 
3650 static u32 *
3651 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3652 {
3653 	cs = gen8_emit_ggtt_write_rcs(cs,
3654 				      request->fence.seqno,
3655 				      i915_request_active_timeline(request)->hwsp_offset,
3656 				      PIPE_CONTROL_CS_STALL |
3657 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3658 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3659 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3660 				      /* Wa_1409600907:tgl */
3661 				      PIPE_CONTROL_DEPTH_STALL |
3662 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3663 				      PIPE_CONTROL_FLUSH_ENABLE |
3664 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3665 
3666 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3667 }
3668 
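/*
 * Called when the engine idles: stop the timeslice and preemption timers
 * so that neither can fire while the engine is parked.
 */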
3669 static void execlists_park(struct intel_engine_cs *engine)
3670 {
3671 	cancel_timer(&engine->execlists.timer);
3672 	cancel_timer(&engine->execlists.preempt);
3673 }
3674 
3675 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3676 {
3677 	engine->submit_request = execlists_submit_request;
3678 	engine->cancel_requests = execlists_cancel_requests;
3679 	engine->schedule = i915_schedule;
3680 	engine->execlists.tasklet.func = execlists_submission_tasklet;
3681 
3682 	engine->reset.prepare = execlists_reset_prepare;
3683 	engine->reset.reset = execlists_reset;
3684 	engine->reset.finish = execlists_reset_finish;
3685 
3686 	engine->park = execlists_park;
3687 	engine->unpark = NULL;
3688 
3689 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3690 	if (!intel_vgpu_active(engine->i915)) {
3691 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3692 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3693 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3694 	}
3695 
3696 	if (INTEL_GEN(engine->i915) >= 12)
3697 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3698 }
3699 
3700 static void execlists_destroy(struct intel_engine_cs *engine)
3701 {
3702 	intel_engine_cleanup_common(engine);
3703 	lrc_destroy_wa_ctx(engine);
3704 	kfree(engine);
3705 }
3706 
3707 static void
3708 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3709 {
3710 	/* Default vfuncs which can be overriden by each engine. */
3711 	/* Default vfuncs which can be overridden by each engine. */
3712 	engine->destroy = execlists_destroy;
3713 	engine->resume = execlists_resume;
3714 
3715 	engine->reset.prepare = execlists_reset_prepare;
3716 	engine->reset.reset = execlists_reset;
3717 	engine->reset.finish = execlists_reset_finish;
3718 
3719 	engine->cops = &execlists_context_ops;
3720 	engine->request_alloc = execlists_request_alloc;
3721 
3722 	engine->emit_flush = gen8_emit_flush;
3723 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3724 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3725 	if (INTEL_GEN(engine->i915) >= 12)
3726 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3727 
3728 	engine->set_default_submission = intel_execlists_set_default_submission;
3729 
3730 	if (INTEL_GEN(engine->i915) < 11) {
3731 		engine->irq_enable = gen8_logical_ring_enable_irq;
3732 		engine->irq_disable = gen8_logical_ring_disable_irq;
3733 	} else {
3734 		/*
3735 		 * TODO: On Gen11 interrupt masks need to be clear
3736 		 * to allow C6 entry. Keep interrupts enabled
3737 		 * and take the hit of generating extra interrupts
3738 		 * until a more refined solution exists.
3739 		 */
3740 	}
3741 	if (IS_GEN(engine->i915, 8))
3742 		engine->emit_bb_start = gen8_emit_bb_start;
3743 	else
3744 		engine->emit_bb_start = gen9_emit_bb_start;
3745 }
3746 
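/*
 * Gen8/9 pack the interrupt bits for all engines into shared registers,
 * so each engine needs its own shift; gen11+ use the unshifted bits.
 */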
3747 static inline void
3748 logical_ring_default_irqs(struct intel_engine_cs *engine)
3749 {
3750 	unsigned int shift = 0;
3751 
3752 	if (INTEL_GEN(engine->i915) < 11) {
3753 		const u8 irq_shifts[] = {
3754 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
3755 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
3756 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3757 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3758 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
3759 		};
3760 
3761 		shift = irq_shifts[engine->id];
3762 	}
3763 
3764 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3765 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3766 }
3767 
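/*
 * Replace the default engine flush and breadcrumb emitters with the
 * PIPE_CONTROL based render variants appropriate for this generation.
 */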
3768 static void rcs_submission_override(struct intel_engine_cs *engine)
3769 {
3770 	switch (INTEL_GEN(engine->i915)) {
3771 	case 12:
3772 		engine->emit_flush = gen12_emit_flush_render;
3773 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3774 		break;
3775 	case 11:
3776 		engine->emit_flush = gen11_emit_flush_render;
3777 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3778 		break;
3779 	default:
3780 		engine->emit_flush = gen8_emit_flush_render;
3781 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3782 		break;
3783 	}
3784 }
3785 
3786 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3787 {
3788 	tasklet_init(&engine->execlists.tasklet,
3789 		     execlists_submission_tasklet, (unsigned long)engine);
3790 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3791 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3792 
3793 	logical_ring_default_vfuncs(engine);
3794 	logical_ring_default_irqs(engine);
3795 
3796 	if (engine->class == RENDER_CLASS)
3797 		rcs_submission_override(engine);
3798 
3799 	return 0;
3800 }
3801 
3802 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3803 {
3804 	struct intel_engine_execlists * const execlists = &engine->execlists;
3805 	struct drm_i915_private *i915 = engine->i915;
3806 	struct intel_uncore *uncore = engine->uncore;
3807 	u32 base = engine->mmio_base;
3808 	int ret;
3809 
3810 	ret = intel_engine_init_common(engine);
3811 	if (ret)
3812 		return ret;
3813 
3814 	if (intel_init_workaround_bb(engine))
3815 		/*
3816 		 * We continue even if we fail to initialize WA batch
3817 		 * because we only expect rare glitches and nothing
3818 		 * critical enough to prevent us from using the GPU
3819 		 */
3820 		DRM_ERROR("WA batch buffer initialization failed\n");
3821 
3822 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3823 		execlists->submit_reg = uncore->regs +
3824 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3825 		execlists->ctrl_reg = uncore->regs +
3826 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3827 	} else {
3828 		execlists->submit_reg = uncore->regs +
3829 			i915_mmio_reg_offset(RING_ELSP(base));
3830 	}
3831 
3832 	execlists->csb_status =
3833 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3834 
3835 	execlists->csb_write =
3836 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
3837 
3838 	if (INTEL_GEN(i915) < 11)
3839 		execlists->csb_size = GEN8_CSB_ENTRIES;
3840 	else
3841 		execlists->csb_size = GEN11_CSB_ENTRIES;
3842 
3843 	reset_csb_pointers(engine);
3844 
3845 	return 0;
3846 }
3847 
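/*
 * The default value programmed into the RCS indirect context offset
 * field differs per generation; pick the one matching this engine.
 */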
3848 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
3849 {
3850 	u32 indirect_ctx_offset;
3851 
3852 	switch (INTEL_GEN(engine->i915)) {
3853 	default:
3854 		MISSING_CASE(INTEL_GEN(engine->i915));
3855 		/* fall through */
3856 	case 12:
3857 		indirect_ctx_offset =
3858 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3859 		break;
3860 	case 11:
3861 		indirect_ctx_offset =
3862 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3863 		break;
3864 	case 10:
3865 		indirect_ctx_offset =
3866 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3867 		break;
3868 	case 9:
3869 		indirect_ctx_offset =
3870 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3871 		break;
3872 	case 8:
3873 		indirect_ctx_offset =
3874 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3875 		break;
3876 	}
3877 
3878 	return indirect_ctx_offset;
3879 }
3880 
3881 
3882 static void init_common_reg_state(u32 * const regs,
3883 				  const struct intel_engine_cs *engine,
3884 				  const struct intel_ring *ring)
3885 {
3886 	regs[CTX_CONTEXT_CONTROL] =
3887 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3888 		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
3889 	if (INTEL_GEN(engine->i915) < 11)
3890 		regs[CTX_CONTEXT_CONTROL] |=
3891 			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3892 					    CTX_CTRL_RS_CTX_ENABLE);
3893 
3894 	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3895 	regs[CTX_BB_STATE] = RING_BB_PPGTT;
3896 }
3897 
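/*
 * Point the per-context and indirect context batch buffer entries of the
 * context image at the workaround batches, if any were built for this
 * engine.
 */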
3898 static void init_wa_bb_reg_state(u32 * const regs,
3899 				 const struct intel_engine_cs *engine,
3900 				 u32 pos_bb_per_ctx)
3901 {
3902 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
3903 
3904 	if (wa_ctx->per_ctx.size) {
3905 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3906 
3907 		regs[pos_bb_per_ctx] =
3908 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3909 	}
3910 
3911 	if (wa_ctx->indirect_ctx.size) {
3912 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3913 
3914 		regs[pos_bb_per_ctx + 2] =
3915 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
3916 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3917 
3918 		regs[pos_bb_per_ctx + 4] =
3919 			intel_lr_indirect_ctx_offset(engine) << 6;
3920 	}
3921 }
3922 
3923 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
3924 {
3925 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
3926 		/* 64b PPGTT (48bit canonical)
3927 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
3928 		 * other PDP Descriptors are ignored.
3929 		 */
3930 		ASSIGN_CTX_PML4(ppgtt, regs);
3931 	} else {
3932 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
3933 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
3934 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
3935 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
3936 	}
3937 }
3938 
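/*
 * A context using the GGTT directly takes its PPGTT state from the
 * aliasing PPGTT behind the global GTT; otherwise the address space is
 * already a PPGTT.
 */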
3939 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
3940 {
3941 	if (i915_is_ggtt(vm))
3942 		return i915_vm_to_ggtt(vm)->alias;
3943 	else
3944 		return i915_vm_to_ppgtt(vm);
3945 }
3946 
3947 static void execlists_init_reg_state(u32 *regs,
3948 				     const struct intel_context *ce,
3949 				     const struct intel_engine_cs *engine,
3950 				     const struct intel_ring *ring,
3951 				     bool close)
3952 {
3953 	/*
3954 	 * A context is actually a big batch buffer with several
3955 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3956 	 * values we are setting here are only for the first context restore:
3957 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
3958 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3959 	 * we are not initializing here).
3960 	 *
3961 	 * Must keep consistent with virtual_update_register_offsets().
3962 	 */
3963 	u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);
3964 
3965 	if (close) { /* Close the batch; used mainly by live_lrc_layout() */
3966 		*bbe = MI_BATCH_BUFFER_END;
3967 		if (INTEL_GEN(engine->i915) >= 10)
3968 			*bbe |= BIT(0);
3969 	}
3970 
3971 	init_common_reg_state(regs, engine, ring);
3972 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
3973 
3974 	init_wa_bb_reg_state(regs, engine,
3975 			     INTEL_GEN(engine->i915) >= 12 ?
3976 			     GEN12_CTX_BB_PER_CTX_PTR :
3977 			     CTX_BB_PER_CTX_PTR);
3978 }
3979 
3980 static int
3981 populate_lr_context(struct intel_context *ce,
3982 		    struct drm_i915_gem_object *ctx_obj,
3983 		    struct intel_engine_cs *engine,
3984 		    struct intel_ring *ring)
3985 {
3986 	bool inhibit = true;
3987 	void *vaddr;
3988 	u32 *regs;
3989 	int ret;
3990 
3991 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3992 	if (IS_ERR(vaddr)) {
3993 		ret = PTR_ERR(vaddr);
3994 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3995 		return ret;
3996 	}
3997 
3998 	set_redzone(vaddr, engine);
3999 
4000 	if (engine->default_state) {
4001 		void *defaults;
4002 
4003 		defaults = i915_gem_object_pin_map(engine->default_state,
4004 						   I915_MAP_WB);
4005 		if (IS_ERR(defaults)) {
4006 			ret = PTR_ERR(defaults);
4007 			goto err_unpin_ctx;
4008 		}
4009 
4010 		memcpy(vaddr, defaults, engine->context_size);
4011 		i915_gem_object_unpin_map(engine->default_state);
4012 		inhibit = false;
4013 	}
4014 
4015 	/* The second page of the context object contains some fields which must
4016 	 * be set up prior to the first execution. */
4017 	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
4018 	execlists_init_reg_state(regs, ce, engine, ring, inhibit);
4019 	if (inhibit)
4020 		regs[CTX_CONTEXT_CONTROL] |=
4021 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4022 
4023 	ret = 0;
4024 err_unpin_ctx:
4025 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4026 	i915_gem_object_unpin_map(ctx_obj);
4027 	return ret;
4028 }
4029 
4030 static int __execlists_context_alloc(struct intel_context *ce,
4031 				     struct intel_engine_cs *engine)
4032 {
4033 	struct drm_i915_gem_object *ctx_obj;
4034 	struct intel_ring *ring;
4035 	struct i915_vma *vma;
4036 	u32 context_size;
4037 	int ret;
4038 
4039 	GEM_BUG_ON(ce->state);
4040 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4041 
4042 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4043 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4044 
4045 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4046 	if (IS_ERR(ctx_obj))
4047 		return PTR_ERR(ctx_obj);
4048 
4049 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4050 	if (IS_ERR(vma)) {
4051 		ret = PTR_ERR(vma);
4052 		goto error_deref_obj;
4053 	}
4054 
4055 	if (!ce->timeline) {
4056 		struct intel_timeline *tl;
4057 
4058 		tl = intel_timeline_create(engine->gt, NULL);
4059 		if (IS_ERR(tl)) {
4060 			ret = PTR_ERR(tl);
4061 			goto error_deref_obj;
4062 		}
4063 
4064 		ce->timeline = tl;
4065 	}
4066 
4067 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4068 	if (IS_ERR(ring)) {
4069 		ret = PTR_ERR(ring);
4070 		goto error_deref_obj;
4071 	}
4072 
4073 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4074 	if (ret) {
4075 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4076 		goto error_ring_free;
4077 	}
4078 
4079 	ce->ring = ring;
4080 	ce->state = vma;
4081 
4082 	return 0;
4083 
4084 error_ring_free:
4085 	intel_ring_put(ring);
4086 error_deref_obj:
4087 	i915_gem_object_put(ctx_obj);
4088 	return ret;
4089 }
4090 
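/*
 * A virtual engine carries at most one ready request, parked on its
 * default priolist; reuse that list head as the queue.
 */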
4091 static struct list_head *virtual_queue(struct virtual_engine *ve)
4092 {
4093 	return &ve->base.execlists.default_priolist.requests[0];
4094 }
4095 
4096 static void virtual_context_destroy(struct kref *kref)
4097 {
4098 	struct virtual_engine *ve =
4099 		container_of(kref, typeof(*ve), context.ref);
4100 	unsigned int n;
4101 
4102 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4103 	GEM_BUG_ON(ve->request);
4104 	GEM_BUG_ON(ve->context.inflight);
4105 
4106 	for (n = 0; n < ve->num_siblings; n++) {
4107 		struct intel_engine_cs *sibling = ve->siblings[n];
4108 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4109 
4110 		if (RB_EMPTY_NODE(node))
4111 			continue;
4112 
4113 		spin_lock_irq(&sibling->active.lock);
4114 
4115 		/* Detachment is lazily performed in the execlists tasklet */
4116 		if (!RB_EMPTY_NODE(node))
4117 			rb_erase_cached(node, &sibling->execlists.virtual);
4118 
4119 		spin_unlock_irq(&sibling->active.lock);
4120 	}
4121 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4122 
4123 	if (ve->context.state)
4124 		__execlists_context_fini(&ve->context);
4125 	intel_context_fini(&ve->context);
4126 
4127 	kfree(ve->bonds);
4128 	kfree(ve);
4129 }
4130 
4131 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4132 {
4133 	int swp;
4134 
4135 	/*
4136 	 * Pick a random sibling on starting to help spread the load around.
4137 	 *
4138 	 * New contexts are typically created with exactly the same order
4139 	 * of siblings, and often started in batches. Due to the way we iterate
4140 	 * the array of siblings when submitting requests, sibling[0] is
4141 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4142 	 * randomised across the system, we also help spread the load by the
4143 	 * first engine we inspect being different each time.
4144 	 *
4145 	 * NB This does not force us to execute on this engine, it will just
4146 	 * typically be the first we inspect for submission.
4147 	 */
4148 	swp = prandom_u32_max(ve->num_siblings);
4149 	if (!swp)
4150 		return;
4151 
4152 	swap(ve->siblings[swp], ve->siblings[0]);
4153 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4154 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4155 						ve->siblings[0]);
4156 }
4157 
4158 static int virtual_context_pin(struct intel_context *ce)
4159 {
4160 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4161 	int err;
4162 
4163 	/* Note: we must use a real engine class for setting up reg state */
4164 	err = __execlists_context_pin(ce, ve->siblings[0]);
4165 	if (err)
4166 		return err;
4167 
4168 	virtual_engine_initial_hint(ve);
4169 	return 0;
4170 }
4171 
4172 static void virtual_context_enter(struct intel_context *ce)
4173 {
4174 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4175 	unsigned int n;
4176 
4177 	for (n = 0; n < ve->num_siblings; n++)
4178 		intel_engine_pm_get(ve->siblings[n]);
4179 
4180 	intel_timeline_enter(ce->timeline);
4181 }
4182 
4183 static void virtual_context_exit(struct intel_context *ce)
4184 {
4185 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4186 	unsigned int n;
4187 
4188 	intel_timeline_exit(ce->timeline);
4189 
4190 	for (n = 0; n < ve->num_siblings; n++)
4191 		intel_engine_pm_put(ve->siblings[n]);
4192 }
4193 
4194 static const struct intel_context_ops virtual_context_ops = {
4195 	.pin = virtual_context_pin,
4196 	.unpin = execlists_context_unpin,
4197 
4198 	.enter = virtual_context_enter,
4199 	.exit = virtual_context_exit,
4200 
4201 	.destroy = virtual_context_destroy,
4202 };
4203 
4204 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4205 {
4206 	struct i915_request *rq;
4207 	intel_engine_mask_t mask;
4208 
4209 	rq = READ_ONCE(ve->request);
4210 	if (!rq)
4211 		return 0;
4212 
4213 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4214 	mask = rq->execution_mask;
4215 	if (unlikely(!mask)) {
4216 		/* Invalid selection, submit to a random engine in error */
4217 		i915_request_skip(rq, -ENODEV);
4218 		mask = ve->siblings[0]->mask;
4219 	}
4220 
4221 	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
4222 		  ve->base.name,
4223 		  rq->fence.context, rq->fence.seqno,
4224 		  mask, ve->base.execlists.queue_priority_hint);
4225 
4226 	return mask;
4227 }
4228 
4229 static void virtual_submission_tasklet(unsigned long data)
4230 {
4231 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4232 	const int prio = ve->base.execlists.queue_priority_hint;
4233 	intel_engine_mask_t mask;
4234 	unsigned int n;
4235 
4236 	rcu_read_lock();
4237 	mask = virtual_submission_mask(ve);
4238 	rcu_read_unlock();
4239 	if (unlikely(!mask))
4240 		return;
4241 
4242 	local_irq_disable();
4243 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4244 		struct intel_engine_cs *sibling = ve->siblings[n];
4245 		struct ve_node * const node = &ve->nodes[sibling->id];
4246 		struct rb_node **parent, *rb;
4247 		bool first;
4248 
4249 		if (unlikely(!(mask & sibling->mask))) {
4250 			if (!RB_EMPTY_NODE(&node->rb)) {
4251 				spin_lock(&sibling->active.lock);
4252 				rb_erase_cached(&node->rb,
4253 						&sibling->execlists.virtual);
4254 				RB_CLEAR_NODE(&node->rb);
4255 				spin_unlock(&sibling->active.lock);
4256 			}
4257 			continue;
4258 		}
4259 
4260 		spin_lock(&sibling->active.lock);
4261 
4262 		if (!RB_EMPTY_NODE(&node->rb)) {
4263 			/*
4264 			 * Cheat and avoid rebalancing the tree if we can
4265 			 * reuse this node in situ.
4266 			 */
4267 			first = rb_first_cached(&sibling->execlists.virtual) ==
4268 				&node->rb;
4269 			if (prio == node->prio || (prio > node->prio && first))
4270 				goto submit_engine;
4271 
4272 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4273 		}
4274 
4275 		rb = NULL;
4276 		first = true;
4277 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4278 		while (*parent) {
4279 			struct ve_node *other;
4280 
4281 			rb = *parent;
4282 			other = rb_entry(rb, typeof(*other), rb);
4283 			if (prio > other->prio) {
4284 				parent = &rb->rb_left;
4285 			} else {
4286 				parent = &rb->rb_right;
4287 				first = false;
4288 			}
4289 		}
4290 
4291 		rb_link_node(&node->rb, rb, parent);
4292 		rb_insert_color_cached(&node->rb,
4293 				       &sibling->execlists.virtual,
4294 				       first);
4295 
4296 submit_engine:
4297 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4298 		node->prio = prio;
4299 		if (first && prio > sibling->execlists.queue_priority_hint) {
4300 			sibling->execlists.queue_priority_hint = prio;
4301 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4302 		}
4303 
4304 		spin_unlock(&sibling->active.lock);
4305 	}
4306 	local_irq_enable();
4307 }
4308 
4309 static void virtual_submit_request(struct i915_request *rq)
4310 {
4311 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4312 	struct i915_request *old;
4313 	unsigned long flags;
4314 
4315 	GEM_TRACE("%s: rq=%llx:%lld\n",
4316 		  ve->base.name,
4317 		  rq->fence.context,
4318 		  rq->fence.seqno);
4319 
4320 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4321 
4322 	spin_lock_irqsave(&ve->base.active.lock, flags);
4323 
4324 	old = ve->request;
4325 	if (old) { /* background completion event from preempt-to-busy */
4326 		GEM_BUG_ON(!i915_request_completed(old));
4327 		__i915_request_submit(old);
4328 		i915_request_put(old);
4329 	}
4330 
4331 	if (i915_request_completed(rq)) {
4332 		__i915_request_submit(rq);
4333 
4334 		ve->base.execlists.queue_priority_hint = INT_MIN;
4335 		ve->request = NULL;
4336 	} else {
4337 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4338 		ve->request = i915_request_get(rq);
4339 
4340 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4341 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4342 
4343 		tasklet_schedule(&ve->base.execlists.tasklet);
4344 	}
4345 
4346 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4347 }
4348 
4349 static struct ve_bond *
4350 virtual_find_bond(struct virtual_engine *ve,
4351 		  const struct intel_engine_cs *master)
4352 {
4353 	int i;
4354 
4355 	for (i = 0; i < ve->num_bonds; i++) {
4356 		if (ve->bonds[i].master == master)
4357 			return &ve->bonds[i];
4358 	}
4359 
4360 	return NULL;
4361 }
4362 
4363 static void
4364 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4365 {
4366 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4367 	intel_engine_mask_t allowed, exec;
4368 	struct ve_bond *bond;
4369 
4370 	allowed = ~to_request(signal)->engine->mask;
4371 
4372 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4373 	if (bond)
4374 		allowed &= bond->sibling_mask;
4375 
4376 	/* Restrict the bonded request to run on only the available engines */
4377 	exec = READ_ONCE(rq->execution_mask);
4378 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4379 		;
4380 
4381 	/* Prevent the master from being re-run on the bonded engines */
4382 	to_request(signal)->execution_mask &= ~allowed;
4383 }
4384 
4385 struct intel_context *
4386 intel_execlists_create_virtual(struct i915_gem_context *ctx,
4387 			       struct intel_engine_cs **siblings,
4388 			       unsigned int count)
4389 {
4390 	struct virtual_engine *ve;
4391 	unsigned int n;
4392 	int err;
4393 
4394 	if (count == 0)
4395 		return ERR_PTR(-EINVAL);
4396 
4397 	if (count == 1)
4398 		return intel_context_create(ctx, siblings[0]);
4399 
4400 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4401 	if (!ve)
4402 		return ERR_PTR(-ENOMEM);
4403 
4404 	ve->base.i915 = ctx->i915;
4405 	ve->base.gt = siblings[0]->gt;
4406 	ve->base.uncore = siblings[0]->uncore;
4407 	ve->base.id = -1;
4408 	ve->base.class = OTHER_CLASS;
4409 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4410 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4411 
4412 	/*
4413 	 * The decision on whether to submit a request using semaphores
4414 	 * depends on the saturated state of the engine. We only compute
4415 	 * this during HW submission of the request, and we need this
4416 	 * state to be globally applied to all requests being submitted
4417 	 * to this engine. Virtual engines encompass more than one physical
4418 	 * engine and so we cannot accurately tell in advance if one of those
4419 	 * engines is already saturated and so cannot afford to use a semaphore
4420 	 * and be pessimized in priority for doing so -- if we are the only
4421 	 * context using semaphores after all other clients have stopped, we
4422 	 * will be starved on the saturated system. Such a global switch for
4423 	 * semaphores is less than ideal, but alas is the current compromise.
4424 	 */
4425 	ve->base.saturated = ALL_ENGINES;
4426 
4427 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4428 
4429 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4430 	intel_engine_init_breadcrumbs(&ve->base);
4431 
4432 	intel_engine_init_execlists(&ve->base);
4433 
4434 	ve->base.cops = &virtual_context_ops;
4435 	ve->base.request_alloc = execlists_request_alloc;
4436 
4437 	ve->base.schedule = i915_schedule;
4438 	ve->base.submit_request = virtual_submit_request;
4439 	ve->base.bond_execute = virtual_bond_execute;
4440 
4441 	INIT_LIST_HEAD(virtual_queue(ve));
4442 	ve->base.execlists.queue_priority_hint = INT_MIN;
4443 	tasklet_init(&ve->base.execlists.tasklet,
4444 		     virtual_submission_tasklet,
4445 		     (unsigned long)ve);
4446 
4447 	intel_context_init(&ve->context, ctx, &ve->base);
4448 
4449 	for (n = 0; n < count; n++) {
4450 		struct intel_engine_cs *sibling = siblings[n];
4451 
4452 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4453 		if (sibling->mask & ve->base.mask) {
4454 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4455 				  sibling->name);
4456 			err = -EINVAL;
4457 			goto err_put;
4458 		}
4459 
4460 		/*
4461 		 * The virtual engine implementation is tightly coupled to
4462 		 * the execlists backend -- we push out requests directly
4463 		 * into a tree inside each physical engine. We could support
4464 		 * layering if we handle cloning of the requests and
4465 		 * submitting a copy into each backend.
4466 		 */
4467 		if (sibling->execlists.tasklet.func !=
4468 		    execlists_submission_tasklet) {
4469 			err = -ENODEV;
4470 			goto err_put;
4471 		}
4472 
4473 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4474 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4475 
4476 		ve->siblings[ve->num_siblings++] = sibling;
4477 		ve->base.mask |= sibling->mask;
4478 
4479 		/*
4480 		 * All physical engines must be compatible for their emission
4481 		 * functions (as we build the instructions during request
4482 		 * construction and do not alter them before submission
4483 		 * on the physical engine). We use the engine class as a guide
4484 		 * here, although that could be refined.
4485 		 */
4486 		if (ve->base.class != OTHER_CLASS) {
4487 			if (ve->base.class != sibling->class) {
4488 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4489 					  sibling->class, ve->base.class);
4490 				err = -EINVAL;
4491 				goto err_put;
4492 			}
4493 			continue;
4494 		}
4495 
4496 		ve->base.class = sibling->class;
4497 		ve->base.uabi_class = sibling->uabi_class;
4498 		snprintf(ve->base.name, sizeof(ve->base.name),
4499 			 "v%dx%d", ve->base.class, count);
4500 		ve->base.context_size = sibling->context_size;
4501 
4502 		ve->base.emit_bb_start = sibling->emit_bb_start;
4503 		ve->base.emit_flush = sibling->emit_flush;
4504 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4505 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4506 		ve->base.emit_fini_breadcrumb_dw =
4507 			sibling->emit_fini_breadcrumb_dw;
4508 
4509 		ve->base.flags = sibling->flags;
4510 	}
4511 
4512 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4513 
4514 	err = __execlists_context_alloc(&ve->context, siblings[0]);
4515 	if (err)
4516 		goto err_put;
4517 
4518 	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
4519 
4520 	return &ve->context;
4521 
4522 err_put:
4523 	intel_context_put(&ve->context);
4524 	return ERR_PTR(err);
4525 }
4526 
4527 struct intel_context *
4528 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
4529 			      struct intel_engine_cs *src)
4530 {
4531 	struct virtual_engine *se = to_virtual_engine(src);
4532 	struct intel_context *dst;
4533 
4534 	dst = intel_execlists_create_virtual(ctx,
4535 					     se->siblings,
4536 					     se->num_siblings);
4537 	if (IS_ERR(dst))
4538 		return dst;
4539 
4540 	if (se->num_bonds) {
4541 		struct virtual_engine *de = to_virtual_engine(dst->engine);
4542 
4543 		de->bonds = kmemdup(se->bonds,
4544 				    sizeof(*se->bonds) * se->num_bonds,
4545 				    GFP_KERNEL);
4546 		if (!de->bonds) {
4547 			intel_context_put(dst);
4548 			return ERR_PTR(-ENOMEM);
4549 		}
4550 
4551 		de->num_bonds = se->num_bonds;
4552 	}
4553 
4554 	return dst;
4555 }
4556 
4557 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4558 				     const struct intel_engine_cs *master,
4559 				     const struct intel_engine_cs *sibling)
4560 {
4561 	struct virtual_engine *ve = to_virtual_engine(engine);
4562 	struct ve_bond *bond;
4563 	int n;
4564 
4565 	/* Sanity check the sibling is part of the virtual engine */
4566 	for (n = 0; n < ve->num_siblings; n++)
4567 		if (sibling == ve->siblings[n])
4568 			break;
4569 	if (n == ve->num_siblings)
4570 		return -EINVAL;
4571 
4572 	bond = virtual_find_bond(ve, master);
4573 	if (bond) {
4574 		bond->sibling_mask |= sibling->mask;
4575 		return 0;
4576 	}
4577 
4578 	bond = krealloc(ve->bonds,
4579 			sizeof(*bond) * (ve->num_bonds + 1),
4580 			GFP_KERNEL);
4581 	if (!bond)
4582 		return -ENOMEM;
4583 
4584 	bond[ve->num_bonds].master = master;
4585 	bond[ve->num_bonds].sibling_mask = sibling->mask;
4586 
4587 	ve->bonds = bond;
4588 	ve->num_bonds++;
4589 
4590 	return 0;
4591 }
4592 
4593 struct intel_engine_cs *
4594 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4595 				 unsigned int sibling)
4596 {
4597 	struct virtual_engine *ve = to_virtual_engine(engine);
4598 
4599 	if (sibling >= ve->num_siblings)
4600 		return NULL;
4601 
4602 	return ve->siblings[sibling];
4603 }
4604 
4605 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4606 				   struct drm_printer *m,
4607 				   void (*show_request)(struct drm_printer *m,
4608 							struct i915_request *rq,
4609 							const char *prefix),
4610 				   unsigned int max)
4611 {
4612 	const struct intel_engine_execlists *execlists = &engine->execlists;
4613 	struct i915_request *rq, *last;
4614 	unsigned long flags;
4615 	unsigned int count;
4616 	struct rb_node *rb;
4617 
4618 	spin_lock_irqsave(&engine->active.lock, flags);
4619 
4620 	last = NULL;
4621 	count = 0;
4622 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
4623 		if (count++ < max - 1)
4624 			show_request(m, rq, "\t\tE ");
4625 		else
4626 			last = rq;
4627 	}
4628 	if (last) {
4629 		if (count > max) {
4630 			drm_printf(m,
4631 				   "\t\t...skipping %d executing requests...\n",
4632 				   count - max);
4633 		}
4634 		show_request(m, last, "\t\tE ");
4635 	}
4636 
4637 	last = NULL;
4638 	count = 0;
4639 	if (execlists->queue_priority_hint != INT_MIN)
4640 		drm_printf(m, "\t\tQueue priority hint: %d\n",
4641 			   execlists->queue_priority_hint);
4642 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4643 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4644 		int i;
4645 
4646 		priolist_for_each_request(rq, p, i) {
4647 			if (count++ < max - 1)
4648 				show_request(m, rq, "\t\tQ ");
4649 			else
4650 				last = rq;
4651 		}
4652 	}
4653 	if (last) {
4654 		if (count > max) {
4655 			drm_printf(m,
4656 				   "\t\t...skipping %d queued requests...\n",
4657 				   count - max);
4658 		}
4659 		show_request(m, last, "\t\tQ ");
4660 	}
4661 
4662 	last = NULL;
4663 	count = 0;
4664 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4665 		struct virtual_engine *ve =
4666 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4667 		struct i915_request *rq = READ_ONCE(ve->request);
4668 
4669 		if (rq) {
4670 			if (count++ < max - 1)
4671 				show_request(m, rq, "\t\tV ");
4672 			else
4673 				last = rq;
4674 		}
4675 	}
4676 	if (last) {
4677 		if (count > max) {
4678 			drm_printf(m,
4679 				   "\t\t...skipping %d virtual requests...\n",
4680 				   count - max);
4681 		}
4682 		show_request(m, last, "\t\tV ");
4683 	}
4684 
4685 	spin_unlock_irqrestore(&engine->active.lock, flags);
4686 }
4687 
4688 void intel_lr_context_reset(struct intel_engine_cs *engine,
4689 			    struct intel_context *ce,
4690 			    u32 head,
4691 			    bool scrub)
4692 {
4693 	GEM_BUG_ON(!intel_context_is_pinned(ce));
4694 
4695 	/*
4696 	 * We want a simple context + ring to execute the breadcrumb update.
4697 	 * We cannot rely on the context being intact across the GPU hang,
4698 	 * so clear it and rebuild just what we need for the breadcrumb.
4699 	 * All pending requests for this context will be zapped, and any
4700 	 * future request will be after userspace has had the opportunity
4701 	 * to recreate its own state.
4702 	 */
4703 	if (scrub)
4704 		restore_default_state(ce, engine);
4705 
4706 	/* Rerun the request; its payload has been neutered (if guilty). */
4707 	ce->ring->head = head;
4708 	intel_ring_update_space(ce->ring);
4709 
4710 	__execlists_update_reg_state(ce, engine);
4711 }
4712 
4713 bool
4714 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4715 {
4716 	return engine->set_default_submission ==
4717 	       intel_execlists_set_default_submission;
4718 }
4719 
4720 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4721 #include "selftest_lrc.c"
4722 #endif
4723