xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 7b73a9c8e26ce5769c41d4b787767c10fe7269db)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "gem/i915_gem_context.h"
137 
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 
/*
 * Bit masks with the RING_EXECLIST_* prefix.
 * NOTE(review): presumably these describe the per-engine execlist status
 * register -- confirm against the hardware documentation.
 */
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
/*
 * Event bits with the GEN8_CTX_STATUS_* prefix.
 * NOTE(review): presumably the per-event flags found in a context status
 * buffer entry -- confirm against the hardware documentation.
 */
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
/* Matches an event that is either COMPLETE or PREEMPTED. */
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
/* Bit 2 of the 64-bit context descriptor; OR'ed into ce->lrc_desc. */
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
/*
 * True unless the SW context ID field (bits 25:15) of csb_dw holds the
 * reserved GEN12_IDLE_CTX_ID value.
 */
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 #define WA_TAIL_DWORDS 2
/* WA_TAIL_DWORDS expressed in bytes (each dword is a u32). */
181 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
182 
/*
 * struct virtual_engine - an engine that forwards work to physical siblings
 *
 * Embeds a struct intel_engine_cs (@base) so that it can be used wherever
 * an engine is expected; to_virtual_engine() recovers the container from
 * @base. The structure ends in a flexible array of @num_siblings physical
 * engines, so it must be allocated with room for that trailing array.
 */
183 struct virtual_engine {
184 	struct intel_engine_cs base;
185 	struct intel_context context;
186 
187 	/*
188 	 * We allow only a single request through the virtual engine at a time
189 	 * (each request in the timeline waits for the completion fence of
190 	 * the previous before being submitted). By restricting ourselves to
191 	 * only submitting a single request, each request is placed on to a
192 	 * physical engine to maximise load spreading (by virtue of the late
193 	 * greedy scheduling -- each real engine takes the next available
194 	 * request upon idling).
195 	 */
196 	struct i915_request *request;
197 
198 	/*
199 	 * We keep a rbtree of available virtual engines inside each physical
200 	 * engine, sorted by priority. Here we preallocate the nodes we need
201 	 * for the virtual engine, indexed by physical_engine->id.
202 	 */
203 	struct ve_node {
204 		struct rb_node rb;
205 		int prio;
206 	} nodes[I915_NUM_ENGINES];
207 
208 	/*
209 	 * Keep track of bonded pairs -- restrictions upon our selection
210 	 * of physical engines any particular request may be submitted to.
211 	 * If we receive a submit-fence from a master engine, we will only
212 	 * use one of sibling_mask physical engines.
213 	 */
214 	struct ve_bond {
215 		const struct intel_engine_cs *master;
216 		intel_engine_mask_t sibling_mask;
217 	} *bonds;
218 	unsigned int num_bonds;
219 
220 	/* And finally, which physical engines this virtual engine maps onto. */
221 	unsigned int num_siblings;
222 	struct intel_engine_cs *siblings[]; /* flexible array, num_siblings entries */
223 };
224 
/*
 * Recover the enclosing struct virtual_engine from a pointer to its
 * embedded base engine. The caller must pass a virtual engine; this is
 * asserted with GEM_BUG_ON().
 */
225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
226 {
227 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
228 	return container_of(engine, struct virtual_engine, base);
229 }
230 
/*
 * Forward declarations for helpers that are used before the point where
 * they are defined (their definitions are not in this part of the file).
 */
231 static int __execlists_context_alloc(struct intel_context *ce,
232 				     struct intel_engine_cs *engine);
233 
234 static void execlists_init_reg_state(u32 *reg_state,
235 				     const struct intel_context *ce,
236 				     const struct intel_engine_cs *engine,
237 				     const struct intel_ring *ring,
238 				     bool close);
239 static void
240 __execlists_update_reg_state(const struct intel_context *ce,
241 			     const struct intel_engine_cs *engine);
242 
/*
 * Fail a request that has not yet completed: record -EIO on its fence and
 * then mark the request complete. A request that has already completed is
 * left untouched.
 */
243 static void mark_eio(struct i915_request *rq)
244 {
245 	if (i915_request_completed(rq))
246 		return;
247 
	/* An incomplete request is not expected to have been signaled yet. */
248 	GEM_BUG_ON(i915_request_signaled(rq));
249 
250 	dma_fence_set_error(&rq->fence, -EIO);
251 	i915_request_mark_complete(rq);
252 }
253 
/*
 * Starting from the entry preceding @rq, walk @tl->requests backwards and
 * stop at the first request that has completed. Returns the oldest request
 * visited that has not completed, or @rq itself if none of its
 * predecessors qualify. @rq itself is never tested for completion here.
 * The walk is performed under rcu_read_lock().
 */
254 static struct i915_request *
255 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
256 {
257 	struct i915_request *active = rq;
258 
259 	rcu_read_lock();
260 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
261 		if (i915_request_completed(rq))
262 			break;
263 
264 		active = rq;
265 	}
266 	rcu_read_unlock();
267 
268 	return active;
269 }
270 
/*
 * Return the GGTT offset of the engine's status page plus
 * I915_GEM_HWS_PREEMPT_ADDR, i.e. the GGTT address of the preempt slot
 * within the status page.
 */
271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
272 {
273 	return (i915_ggtt_offset(engine->status_page.vma) +
274 		I915_GEM_HWS_PREEMPT_ADDR);
275 }
276 
/*
 * Store @state into the I915_GEM_HWS_PREEMPT dword of the engine's status
 * page. A write barrier is issued only when @state is non-zero (pausing).
 * NOTE(review): presumably the barrier makes the pause visible before any
 * following submission -- confirm against the callers.
 */
277 static inline void
278 ring_set_paused(const struct intel_engine_cs *engine, int state)
279 {
280 	/*
281 	 * We inspect HWS_PREEMPT with a semaphore inside
282 	 * engine->emit_fini_breadcrumb. If the dword is true,
283 	 * the ring is paused as the semaphore will busywait
284 	 * until the dword is false.
285 	 */
286 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
287 	if (state)
288 		wmb();
289 }
290 
/* Convert an rbtree node into the struct i915_priolist that embeds it. */
291 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
292 {
293 	return rb_entry(rb, struct i915_priolist, node);
294 }
295 
/* Return the scheduling priority stored in the request's sched attributes. */
296 static inline int rq_prio(const struct i915_request *rq)
297 {
298 	return rq->sched.attr.priority;
299 }
300 
/*
 * Compute the priority of @rq as used for preemption decisions.
 *
 * Starts from rq_prio(), replaces it with I915_PRIORITY_UNPREEMPTABLE for
 * a nopreempt request, ORs in I915_PRIORITY_NOSEMAPHORE once the request
 * has started, and finally ORs in the internal __NO_PREEMPTION bits.
 */
301 static int effective_prio(const struct i915_request *rq)
302 {
303 	int prio = rq_prio(rq);
304 
305 	/*
306 	 * If this request is special and must not be interrupted at any
307 	 * cost, so be it. Note we are only checking the most recent request
308 	 * in the context and so may be masking an earlier vip request. It
309 	 * is hoped that under the conditions where nopreempt is used, this
310 	 * will not matter (i.e. all requests to that context will be
311 	 * nopreempt for as long as desired).
312 	 */
313 	if (i915_request_has_nopreempt(rq))
314 		prio = I915_PRIORITY_UNPREEMPTABLE;
315 
316 	/*
317 	 * On unwinding the active request, we give it a priority bump
318 	 * if it has completed waiting on any semaphore. If we know that
319 	 * the request has already started, we can prevent an unwanted
320 	 * preempt-to-idle cycle by taking that into account now.
321 	 */
322 	if (__i915_request_has_started(rq))
323 		prio |= I915_PRIORITY_NOSEMAPHORE;
324 
325 	/* Restrict mere WAIT boosts from triggering preemption */
326 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
327 	return prio | __NO_PREEMPTION;
328 }
329 
/*
 * Return the priority of the first entry in the execlists queue, or
 * INT_MIN when the queue is empty. The value is derived from the first
 * priolist's priority and the lowest set bit of its 'used' mask.
 */
330 static int queue_prio(const struct intel_engine_execlists *execlists)
331 {
332 	struct i915_priolist *p;
333 	struct rb_node *rb;
334 
335 	rb = rb_first_cached(&execlists->queue);
336 	if (!rb)
337 		return INT_MIN;
338 
339 	/*
340 	 * As the priolist[] are inverted, with the highest priority in [0],
341 	 * we have to flip the index value to become priority.
342 	 */
343 	p = to_priolist(rb);
344 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
345 }
346 
/*
 * Decide whether the running request @rq should be preempted.
 *
 * @engine: the physical engine being considered
 * @rq:     the request whose effective priority is the baseline
 * @rb:     optional node from the engine's tree of virtual engines; when
 *          non-NULL, the pending request of that virtual engine is also
 *          considered
 *
 * Returns false immediately if the engine has no semaphores or if the
 * queue priority hint does not exceed the baseline. Otherwise returns
 * true when the next request on engine->active.requests, the virtual
 * engine's request, or the head of the priority queue outranks the
 * baseline.
 */
347 static inline bool need_preempt(const struct intel_engine_cs *engine,
348 				const struct i915_request *rq,
349 				struct rb_node *rb)
350 {
351 	int last_prio;
352 
353 	if (!intel_engine_has_semaphores(engine))
354 		return false;
355 
356 	/*
357 	 * Check if the current priority hint merits a preemption attempt.
358 	 *
359 	 * We record the highest value priority we saw during rescheduling
360 	 * prior to this dequeue, therefore we know that if it is strictly
361 	 * less than the current tail of ELSP[0], we do not need to force
362 	 * a preempt-to-idle cycle.
363 	 *
364 	 * However, the priority hint is a mere hint that we may need to
365 	 * preempt. If that hint is stale or we may be trying to preempt
366 	 * ourselves, ignore the request.
367 	 *
368 	 * More naturally we would write
369 	 *      prio >= max(0, last);
370 	 * except that we wish to prevent triggering preemption at the same
371 	 * priority level: the task that is running should remain running
372 	 * to preserve FIFO ordering of dependencies.
373 	 */
374 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
375 	if (engine->execlists.queue_priority_hint <= last_prio)
376 		return false;
377 
378 	/*
379 	 * Check against the first request in ELSP[1], it will, thanks to the
380 	 * power of PI, be the highest priority of that context.
381 	 */
382 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
383 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
384 		return true;
385 
386 	if (rb) {
387 		struct virtual_engine *ve =
388 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
389 		bool preempt = false;
390 
391 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
392 			struct i915_request *next;
393 
394 			rcu_read_lock();
395 			next = READ_ONCE(ve->request);
396 			if (next)
397 				preempt = rq_prio(next) > last_prio;
398 			rcu_read_unlock();
399 		}
400 
401 		if (preempt)
402 			return preempt;
403 	}
404 
405 	/*
406 	 * If the inflight context did not trigger the preemption, then maybe
407 	 * it was the set of queued requests? Pick the highest priority in
408 	 * the queue (the first active priolist) and see if it deserves to be
409 	 * running instead of ELSP[0].
410 	 *
411 	 * The highest priority request in the queue can not be either
412 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
413 	 * context, its priority would not exceed ELSP[0] aka last_prio.
414 	 */
415 	return queue_prio(&engine->execlists) > last_prio;
416 }
417 
/*
 * Sanity check for the ordering of two consecutive requests: returns true
 * when @prev is still active, or when @prev's priority is not lower than
 * @next's. Marked __maybe_unused, so it may have no callers in some
 * configurations.
 */
418 __maybe_unused static inline bool
419 assert_priority_queue(const struct i915_request *prev,
420 		      const struct i915_request *next)
421 {
422 	/*
423 	 * Without preemption, the prev may refer to the still active element
424 	 * which we refuse to let go.
425 	 *
426 	 * Even with preemption, there are times when we think it is better not
427 	 * to preempt and leave an ostensibly lower priority request in flight.
428 	 */
429 	if (i915_request_is_active(prev))
430 		return true;
431 
432 	return rq_prio(prev) >= rq_prio(next);
433 }
434 
435 /*
436  * The context descriptor encodes various attributes of a context,
437  * including its GTT address and some flags. Because it's fairly
438  * expensive to calculate, we'll just do it once and cache the result,
439  * which remains valid until the context is unpinned.
440  *
441  * This is what a descriptor looks like, from LSB to MSB::
442  *
443  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
444  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
445  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
446  *      bits 53-54:    mbz, reserved for use by hardware
447  *      bits 55-63:    group ID, currently unused and set to 0
448  *
449  * Starting from Gen11, the upper dword of the descriptor has a new format:
450  *
451  *      bits 32-36:    reserved
452  *      bits 37-47:    SW context ID
453  *      bits 48-53:    engine instance
454  *      bit 54:        mbz, reserved for use by hardware
455  *      bits 55-60:    SW counter
456  *      bits 61-63:    engine class
457  *
458  * engine info, SW context ID and SW counter need to form a unique number
459  * (Context ID) per lrc.
460  */
/*
 * Build the static part of the context descriptor for @ce on @engine.
 *
 * The addressing mode (64b when the context's vm is 4-level, 32b
 * otherwise) is shifted into place, the VALID and PRIVILEGE flags are set
 * (plus L3LLC_COHERENT on gen8), and the GGTT offset of the context state
 * is OR'ed in. On gen11+ the engine instance and class are also encoded
 * in the upper dword. No SW context ID is filled in by this function.
 */
461 static u64
462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
463 {
464 	u64 desc;
465 
466 	desc = INTEL_LEGACY_32B_CONTEXT;
467 	if (i915_vm_is_4lvl(ce->vm))
468 		desc = INTEL_LEGACY_64B_CONTEXT;
469 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
470 
471 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
472 	if (IS_GEN(engine->i915, 8))
473 		desc |= GEN8_CTX_L3LLC_COHERENT;
474 
475 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
476 	/*
477 	 * The following 32bits are copied into the OA reports (dword 2).
478 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
479 	 * anything below.
480 	 */
481 	if (INTEL_GEN(engine->i915) >= 11) {
482 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
483 								/* bits 48-53 */
484 
485 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
486 								/* bits 61-63 */
487 	}
488 
489 	return desc;
490 }
491 
/*
 * Expand a byte-encoded register table into the register state at @regs.
 *
 * @regs:   destination dwords
 * @data:   table built from the NOP()/LRI()/REG()/REG16()/END() macros
 * @engine: supplies the mmio base added to every register offset
 *
 * Table encoding, as decoded below:
 *   - a byte with bit 7 set skips (byte & 0x7f) dwords of @regs;
 *   - any other non-zero byte starts an MI_LOAD_REGISTER_IMM header with
 *     the count in bits 5:0 and flags in bits 7:6 (POSTED adds
 *     MI_LRI_FORCE_POSTED; gen11+ always adds MI_LRI_CS_MMIO);
 *   - each of the 'count' registers follows as 7-bit groups, most
 *     significant first, with bit 7 marking "more bytes follow"; the
 *     decoded value is a dword index, stored as base + (index << 2);
 *   - a zero byte (END) terminates the table.
 *
 * Only the register-address dword of each (address, value) pair is
 * written; the value dword is stepped over untouched.
 *
 * Returns the position in @regs just past the last pair processed.
 */
492 static u32 *set_offsets(u32 *regs,
493 			const u8 *data,
494 			const struct intel_engine_cs *engine)
495 #define NOP(x) (BIT(7) | (x))
496 #define LRI(count, flags) ((flags) << 6 | (count))
497 #define POSTED BIT(0)
498 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
499 #define REG16(x) \
500 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
501 	(((x) >> 2) & 0x7f)
502 #define END() 0
503 {
504 	const u32 base = engine->mmio_base;
505 
506 	while (*data) {
507 		u8 count, flags;
508 
509 		if (*data & BIT(7)) { /* skip */
510 			regs += *data++ & ~BIT(7);
511 			continue;
512 		}
513 
514 		count = *data & 0x3f;
515 		flags = *data >> 6;
516 		data++;
517 
518 		*regs = MI_LOAD_REGISTER_IMM(count);
519 		if (flags & POSTED)
520 			*regs |= MI_LRI_FORCE_POSTED;
521 		if (INTEL_GEN(engine->i915) >= 11)
522 			*regs |= MI_LRI_CS_MMIO;
523 		regs++;
524 
525 		GEM_BUG_ON(!count);
526 		do {
527 			u32 offset = 0;
528 			u8 v;
529 
			/* Accumulate 7 bits per byte until bit 7 is clear. */
530 			do {
531 				v = *data++;
532 				offset <<= 7;
533 				offset |= v & ~BIT(7);
534 			} while (v & BIT(7));
535 
536 			*regs = base + (offset << 2);
			/* Step over the value dword that pairs with this address. */
537 			regs += 2;
538 		} while (--count);
539 	}
540 
541 	return regs;
542 }
543 
/*
 * Encoded register table for set_offsets(); returned by reg_offsets() for
 * engines outside RENDER_CLASS when the gen is below 9.
 */
544 static const u8 gen8_xcs_offsets[] = {
545 	NOP(1),
546 	LRI(11, 0),
547 	REG16(0x244),
548 	REG(0x034),
549 	REG(0x030),
550 	REG(0x038),
551 	REG(0x03c),
552 	REG(0x168),
553 	REG(0x140),
554 	REG(0x110),
555 	REG(0x11c),
556 	REG(0x114),
557 	REG(0x118),
558 
559 	NOP(9),
560 	LRI(9, 0),
561 	REG16(0x3a8),
562 	REG16(0x28c),
563 	REG16(0x288),
564 	REG16(0x284),
565 	REG16(0x280),
566 	REG16(0x27c),
567 	REG16(0x278),
568 	REG16(0x274),
569 	REG16(0x270),
570 
571 	NOP(13),
572 	LRI(2, 0),
573 	REG16(0x200),
574 	REG(0x028),
575 
576 	END(),
577 };
578 
/*
 * Encoded register table for set_offsets(); returned by reg_offsets() for
 * engines outside RENDER_CLASS when the gen is at least 9 but below 12.
 */
579 static const u8 gen9_xcs_offsets[] = {
580 	NOP(1),
581 	LRI(14, POSTED),
582 	REG16(0x244),
583 	REG(0x034),
584 	REG(0x030),
585 	REG(0x038),
586 	REG(0x03c),
587 	REG(0x168),
588 	REG(0x140),
589 	REG(0x110),
590 	REG(0x11c),
591 	REG(0x114),
592 	REG(0x118),
593 	REG(0x1c0),
594 	REG(0x1c4),
595 	REG(0x1c8),
596 
597 	NOP(3),
598 	LRI(9, POSTED),
599 	REG16(0x3a8),
600 	REG16(0x28c),
601 	REG16(0x288),
602 	REG16(0x284),
603 	REG16(0x280),
604 	REG16(0x27c),
605 	REG16(0x278),
606 	REG16(0x274),
607 	REG16(0x270),
608 
609 	NOP(13),
610 	LRI(1, POSTED),
611 	REG16(0x200),
612 
613 	NOP(13),
614 	LRI(44, POSTED),
615 	REG(0x028),
616 	REG(0x09c),
617 	REG(0x0c0),
618 	REG(0x178),
619 	REG(0x17c),
620 	REG16(0x358),
621 	REG(0x170),
622 	REG(0x150),
623 	REG(0x154),
624 	REG(0x158),
625 	REG16(0x41c),
626 	REG16(0x600),
627 	REG16(0x604),
628 	REG16(0x608),
629 	REG16(0x60c),
630 	REG16(0x610),
631 	REG16(0x614),
632 	REG16(0x618),
633 	REG16(0x61c),
634 	REG16(0x620),
635 	REG16(0x624),
636 	REG16(0x628),
637 	REG16(0x62c),
638 	REG16(0x630),
639 	REG16(0x634),
640 	REG16(0x638),
641 	REG16(0x63c),
642 	REG16(0x640),
643 	REG16(0x644),
644 	REG16(0x648),
645 	REG16(0x64c),
646 	REG16(0x650),
647 	REG16(0x654),
648 	REG16(0x658),
649 	REG16(0x65c),
650 	REG16(0x660),
651 	REG16(0x664),
652 	REG16(0x668),
653 	REG16(0x66c),
654 	REG16(0x670),
655 	REG16(0x674),
656 	REG16(0x678),
657 	REG16(0x67c),
658 	REG(0x068),
659 
660 	END(),
661 };
662 
/*
 * Encoded register table for set_offsets(); returned by reg_offsets() for
 * engines outside RENDER_CLASS when the gen is 12 or later.
 */
663 static const u8 gen12_xcs_offsets[] = {
664 	NOP(1),
665 	LRI(13, POSTED),
666 	REG16(0x244),
667 	REG(0x034),
668 	REG(0x030),
669 	REG(0x038),
670 	REG(0x03c),
671 	REG(0x168),
672 	REG(0x140),
673 	REG(0x110),
674 	REG(0x1c0),
675 	REG(0x1c4),
676 	REG(0x1c8),
677 	REG(0x180),
678 	REG16(0x2b4),
679 
680 	NOP(5),
681 	LRI(9, POSTED),
682 	REG16(0x3a8),
683 	REG16(0x28c),
684 	REG16(0x288),
685 	REG16(0x284),
686 	REG16(0x280),
687 	REG16(0x27c),
688 	REG16(0x278),
689 	REG16(0x274),
690 	REG16(0x270),
691 
692 	END(),
693 };
694 
/*
 * Encoded register table for set_offsets(); returned by reg_offsets() for
 * RENDER_CLASS engines when the gen is below 11.
 */
695 static const u8 gen8_rcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, 0),
727 	REG(0x0c8),
728 
729 	END(),
730 };
731 
/*
 * Encoded register table for set_offsets(); returned by reg_offsets() for
 * RENDER_CLASS engines when the gen is at least 11 but below 12.
 */
732 static const u8 gen11_rcs_offsets[] = {
733 	NOP(1),
734 	LRI(15, POSTED),
735 	REG16(0x244),
736 	REG(0x034),
737 	REG(0x030),
738 	REG(0x038),
739 	REG(0x03c),
740 	REG(0x168),
741 	REG(0x140),
742 	REG(0x110),
743 	REG(0x11c),
744 	REG(0x114),
745 	REG(0x118),
746 	REG(0x1c0),
747 	REG(0x1c4),
748 	REG(0x1c8),
749 	REG(0x180),
750 
751 	NOP(1),
752 	LRI(9, POSTED),
753 	REG16(0x3a8),
754 	REG16(0x28c),
755 	REG16(0x288),
756 	REG16(0x284),
757 	REG16(0x280),
758 	REG16(0x27c),
759 	REG16(0x278),
760 	REG16(0x274),
761 	REG16(0x270),
762 
763 	LRI(1, POSTED),
764 	REG(0x1b0),
765 
766 	NOP(10),
767 	LRI(1, 0),
768 	REG(0x0c8),
769 
770 	END(),
771 };
772 
/*
 * Encoded register table for set_offsets(); returned by reg_offsets() for
 * RENDER_CLASS engines when the gen is 12 or later.
 */
773 static const u8 gen12_rcs_offsets[] = {
774 	NOP(1),
775 	LRI(13, POSTED),
776 	REG16(0x244),
777 	REG(0x034),
778 	REG(0x030),
779 	REG(0x038),
780 	REG(0x03c),
781 	REG(0x168),
782 	REG(0x140),
783 	REG(0x110),
784 	REG(0x1c0),
785 	REG(0x1c4),
786 	REG(0x1c8),
787 	REG(0x180),
788 	REG16(0x2b4),
789 
790 	NOP(5),
791 	LRI(9, POSTED),
792 	REG16(0x3a8),
793 	REG16(0x28c),
794 	REG16(0x288),
795 	REG16(0x284),
796 	REG16(0x280),
797 	REG16(0x27c),
798 	REG16(0x278),
799 	REG16(0x274),
800 	REG16(0x270),
801 
802 	LRI(3, POSTED),
803 	REG(0x1b0),
804 	REG16(0x5a8),
805 	REG16(0x5ac),
806 
807 	NOP(6),
808 	LRI(1, 0),
809 	REG(0x0c8),
810 
811 	END(),
812 };
813 
/*
 * Retire the table-encoding helper macros now that the offset tables have
 * been defined. Note that POSTED is not in this list and stays defined.
 */
814 #undef END
815 #undef REG16
816 #undef REG
817 #undef LRI
818 #undef NOP
819 
/*
 * Select the encoded register table that matches @engine.
 *
 * The choice depends on whether the engine is RENDER_CLASS and on the
 * device gen. Render engines use the gen12, gen11 or gen8 table; all
 * other engines use the gen12, gen9 or gen8 table. In each case the
 * newest table whose gen does not exceed the device's is returned.
 */
820 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
821 {
822 	/*
823 	 * The gen12+ lists only have the registers we program in the basic
824 	 * default state. We rely on the context image using relative
825 	 * addressing to automatically fix up the register state between the
826 	 * physical engines for virtual engine.
827 	 */
828 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
829 		   !intel_engine_has_relative_mmio(engine));
830 
831 	if (engine->class == RENDER_CLASS) {
832 		if (INTEL_GEN(engine->i915) >= 12)
833 			return gen12_rcs_offsets;
834 		else if (INTEL_GEN(engine->i915) >= 11)
835 			return gen11_rcs_offsets;
836 		else
837 			return gen8_rcs_offsets;
838 	} else {
839 		if (INTEL_GEN(engine->i915) >= 12)
840 			return gen12_xcs_offsets;
841 		else if (INTEL_GEN(engine->i915) >= 9)
842 			return gen9_xcs_offsets;
843 		else
844 			return gen8_xcs_offsets;
845 	}
846 }
847 
/*
 * Unsubmit every request on engine->active.requests that has not
 * completed, walking the list from newest to oldest.
 *
 * Requests whose execution_mask equals this engine's mask are moved back
 * onto the engine's priority list for their priority. Any other request
 * is handed back to the engine that owns its hw_context by calling that
 * engine's submit_request().
 *
 * The caller must hold engine->active.lock.
 *
 * Returns the last request moved back onto this engine's own priority
 * lists. Returns NULL if no request was unwound, or if the most recently
 * processed request was handed to another engine.
 */
848 static struct i915_request *
849 __unwind_incomplete_requests(struct intel_engine_cs *engine)
850 {
851 	struct i915_request *rq, *rn, *active = NULL;
	/*
	 * pl is set before its first use: prio starts as INVALID and no
	 * request may carry that priority, so the first lookup always runs.
	 */
852 	struct list_head *uninitialized_var(pl);
853 	int prio = I915_PRIORITY_INVALID;
854 
855 	lockdep_assert_held(&engine->active.lock);
856 
857 	list_for_each_entry_safe_reverse(rq, rn,
858 					 &engine->active.requests,
859 					 sched.link) {
860 		if (i915_request_completed(rq))
861 			continue; /* XXX */
862 
863 		__i915_request_unsubmit(rq);
864 
865 		/*
866 		 * Push the request back into the queue for later resubmission.
867 		 * If this request is not native to this physical engine (i.e.
868 		 * it came from a virtual source), push it back onto the virtual
869 		 * engine so that it can be moved across onto another physical
870 		 * engine as load dictates.
871 		 */
872 		if (likely(rq->execution_mask == engine->mask)) {
873 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
			/* Only look up the priolist again when the priority changes. */
874 			if (rq_prio(rq) != prio) {
875 				prio = rq_prio(rq);
876 				pl = i915_sched_lookup_priolist(engine, prio);
877 			}
878 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
879 
880 			list_move(&rq->sched.link, pl);
881 			active = rq;
882 		} else {
883 			struct intel_engine_cs *owner = rq->hw_context->engine;
884 
885 			/*
886 			 * Decouple the virtual breadcrumb before moving it
887 			 * back to the virtual engine -- we don't want the
888 			 * request to complete in the background and try
889 			 * and cancel the breadcrumb on the virtual engine
890 			 * (instead of the old engine where it is linked)!
891 			 */
892 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
893 				     &rq->fence.flags)) {
894 				spin_lock_nested(&rq->lock,
895 						 SINGLE_DEPTH_NESTING);
896 				i915_request_cancel_breadcrumb(rq);
897 				spin_unlock(&rq->lock);
898 			}
899 			rq->engine = owner;
900 			owner->submit_request(rq);
901 			active = NULL;
902 		}
903 	}
904 
905 	return active;
906 }
907 
/*
 * Non-static entry point for unwinding: resolves the engine that embeds
 * @execlists and forwards to __unwind_incomplete_requests(), returning
 * its result unchanged.
 */
908 struct i915_request *
909 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
910 {
911 	struct intel_engine_cs *engine =
912 		container_of(execlists, typeof(*engine), execlists);
913 
914 	return __unwind_incomplete_requests(engine);
915 }
916 
/*
 * Notify listeners on the request's engine context_status_notifier chain
 * of @status, passing @rq as the payload. Does nothing unless
 * CONFIG_DRM_I915_GVT is enabled.
 */
917 static inline void
918 execlists_context_status_change(struct i915_request *rq, unsigned long status)
919 {
920 	/*
921 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
922 	 * The compiler should eliminate this function as dead-code.
923 	 */
924 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
925 		return;
926 
927 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
928 				   status, rq);
929 }
930 
/*
 * Busyness accounting hook for a context being scheduled in on @engine.
 *
 * When stats are enabled, increments stats.active; on the 0 -> 1
 * transition the current time is recorded in stats.start. The update is
 * made under the stats seqlock with interrupts disabled.
 */
931 static void intel_engine_context_in(struct intel_engine_cs *engine)
932 {
933 	unsigned long flags;
934 
	/* Unlocked peek to skip the seqlock entirely while stats are off. */
935 	if (READ_ONCE(engine->stats.enabled) == 0)
936 		return;
937 
938 	write_seqlock_irqsave(&engine->stats.lock, flags);
939 
	/* Check again under the lock. */
940 	if (engine->stats.enabled > 0) {
941 		if (engine->stats.active++ == 0)
942 			engine->stats.start = ktime_get();
943 		GEM_BUG_ON(engine->stats.active == 0);
944 	}
945 
946 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
947 }
948 
/*
 * Busyness accounting hook for a context being scheduled out of @engine.
 *
 * When stats are enabled and stats.active drops from 1 to 0, the time
 * elapsed since stats.start is added to stats.total. If stats.active was
 * already 0 on entry, no decrement happens and the time since
 * stats.enabled_at is added instead. The update is made under the stats
 * seqlock with interrupts disabled.
 */
949 static void intel_engine_context_out(struct intel_engine_cs *engine)
950 {
951 	unsigned long flags;
952 
	/* Unlocked peek to skip the seqlock entirely while stats are off. */
953 	if (READ_ONCE(engine->stats.enabled) == 0)
954 		return;
955 
956 	write_seqlock_irqsave(&engine->stats.lock, flags);
957 
958 	if (engine->stats.enabled > 0) {
959 		ktime_t last;
960 
961 		if (engine->stats.active && --engine->stats.active == 0) {
962 			/*
963 			 * Decrement the active context count and in case GPU
964 			 * is now idle add up to the running total.
965 			 */
966 			last = ktime_sub(ktime_get(), engine->stats.start);
967 
968 			engine->stats.total = ktime_add(engine->stats.total,
969 							last);
970 		} else if (engine->stats.active == 0) {
971 			/*
972 			 * After turning on engine stats, context out might be
973 			 * the first event in which case we account from the
974 			 * time stats gathering was turned on.
975 			 */
976 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
977 
978 			engine->stats.total = ktime_add(engine->stats.total,
979 							last);
980 		}
981 	}
982 
983 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
984 }
985 
/*
 * Reset the register state of @ce to the engine defaults.
 *
 * If the engine has a pinned default state, context_size - PAGE_SIZE
 * bytes are copied from it into ce->lrc_reg_state, starting LRC_STATE_PN
 * pages into the default image. The register state is then
 * re-initialised for @ce and its ring via execlists_init_reg_state(),
 * with the 'close' argument false.
 */
986 static void restore_default_state(struct intel_context *ce,
987 				  struct intel_engine_cs *engine)
988 {
989 	u32 *regs = ce->lrc_reg_state;
990 
991 	if (engine->pinned_default_state)
992 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
993 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
994 		       engine->context_size - PAGE_SIZE);
995 
996 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
997 }
998 
/*
 * Rewind and scrub the context of @rq so that it does not resume in the
 * middle of a cancelled batch.
 *
 * The ring head is moved to rq->tail if @rq has completed, otherwise to
 * the head of the oldest incomplete request found by active_request().
 * The context image is then restored to the default state, its register
 * state is refreshed, and CTX_DESC_FORCE_RESTORE is set in the descriptor.
 */
999 static void reset_active(struct i915_request *rq,
1000 			 struct intel_engine_cs *engine)
1001 {
1002 	struct intel_context * const ce = rq->hw_context;
1003 	u32 head;
1004 
1005 	/*
1006 	 * The executing context has been cancelled. We want to prevent
1007 	 * further execution along this context and propagate the error on
1008 	 * to anything depending on its results.
1009 	 *
1010 	 * In __i915_request_submit(), we apply the -EIO and remove the
1011 	 * requests' payloads for any banned requests. But first, we must
1012 	 * rewind the context back to the start of the incomplete request so
1013 	 * that we do not jump back into the middle of the batch.
1014 	 *
1015 	 * We preserve the breadcrumbs and semaphores of the incomplete
1016 	 * requests so that inter-timeline dependencies (i.e other timelines)
1017 	 * remain correctly ordered. And we defer to __i915_request_submit()
1018 	 * so that all asynchronous waits are correctly handled.
1019 	 */
1020 	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
1021 		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
1022 
1023 	/* On resubmission of the active request, payload will be scrubbed */
1024 	if (i915_request_completed(rq))
1025 		head = rq->tail;
1026 	else
1027 		head = active_request(ce->timeline, rq)->head;
1028 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1029 	intel_ring_update_space(ce->ring);
1030 
1031 	/* Scrub the context image to prevent replaying the previous batch */
1032 	restore_default_state(ce, engine);
1033 	__execlists_update_reg_state(ce, engine);
1034 
1035 	/* We've switched away, so this should be a no-op, but intent matters */
1036 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1037 }
1038 
1039 static inline struct intel_engine_cs *
1040 __execlists_schedule_in(struct i915_request *rq)
1041 {
1042 	struct intel_engine_cs * const engine = rq->engine;
1043 	struct intel_context * const ce = rq->hw_context;
1044 
1045 	intel_context_get(ce);
1046 
1047 	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
1048 		reset_active(rq, engine);
1049 
1050 	if (ce->tag) {
1051 		/* Use a fixed tag for OA and friends */
1052 		ce->lrc_desc |= (u64)ce->tag << 32;
1053 	} else {
1054 		/* We don't need a strict matching tag, just different values */
1055 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1056 		ce->lrc_desc |=
1057 			(u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
1058 			GEN11_SW_CTX_ID_SHIFT;
1059 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1060 	}
1061 
1062 	intel_gt_pm_get(engine->gt);
1063 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1064 	intel_engine_context_in(engine);
1065 
1066 	return engine;
1067 }
1068 
1069 static inline struct i915_request *
1070 execlists_schedule_in(struct i915_request *rq, int idx)
1071 {
1072 	struct intel_context * const ce = rq->hw_context;
1073 	struct intel_engine_cs *old;
1074 
1075 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1076 	trace_i915_request_in(rq, idx);
1077 
1078 	old = READ_ONCE(ce->inflight);
1079 	do {
1080 		if (!old) {
1081 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1082 			break;
1083 		}
1084 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1085 
1086 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1087 	return i915_request_get(rq);
1088 }
1089 
1090 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1091 {
1092 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1093 	struct i915_request *next = READ_ONCE(ve->request);
1094 
1095 	if (next && next->execution_mask & ~rq->execution_mask)
1096 		tasklet_schedule(&ve->base.execlists.tasklet);
1097 }
1098 
1099 static inline void
1100 __execlists_schedule_out(struct i915_request *rq,
1101 			 struct intel_engine_cs * const engine)
1102 {
1103 	struct intel_context * const ce = rq->hw_context;
1104 
1105 	/*
1106 	 * NB process_csb() is not under the engine->active.lock and hence
1107 	 * schedule_out can race with schedule_in meaning that we should
1108 	 * refrain from doing non-trivial work here.
1109 	 */
1110 
1111 	/*
1112 	 * If we have just completed this context, the engine may now be
1113 	 * idle and we want to re-enter powersaving.
1114 	 */
1115 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1116 	    i915_request_completed(rq))
1117 		intel_engine_add_retire(engine, ce->timeline);
1118 
1119 	intel_engine_context_out(engine);
1120 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1121 	intel_gt_pm_put_async(engine->gt);
1122 
1123 	/*
1124 	 * If this is part of a virtual engine, its next request may
1125 	 * have been blocked waiting for access to the active context.
1126 	 * We have to kick all the siblings again in case we need to
1127 	 * switch (e.g. the next request is not runnable on this
1128 	 * engine). Hopefully, we will already have submitted the next
1129 	 * request before the tasklet runs and do not need to rebuild
1130 	 * each virtual tree and kick everyone again.
1131 	 */
1132 	if (ce->engine != engine)
1133 		kick_siblings(rq, ce);
1134 
1135 	intel_context_put(ce);
1136 }
1137 
1138 static inline void
1139 execlists_schedule_out(struct i915_request *rq)
1140 {
1141 	struct intel_context * const ce = rq->hw_context;
1142 	struct intel_engine_cs *cur, *old;
1143 
1144 	trace_i915_request_out(rq);
1145 
1146 	old = READ_ONCE(ce->inflight);
1147 	do
1148 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1149 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1150 	if (!cur)
1151 		__execlists_schedule_out(rq, old);
1152 
1153 	i915_request_put(rq);
1154 }
1155 
1156 static u64 execlists_update_context(struct i915_request *rq)
1157 {
1158 	struct intel_context *ce = rq->hw_context;
1159 	u64 desc = ce->lrc_desc;
1160 	u32 tail;
1161 
1162 	/*
1163 	 * WaIdleLiteRestore:bdw,skl
1164 	 *
1165 	 * We should never submit the context with the same RING_TAIL twice
1166 	 * just in case we submit an empty ring, which confuses the HW.
1167 	 *
1168 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1169 	 * the normal request to be able to always advance the RING_TAIL on
1170 	 * subsequent resubmissions (for lite restore). Should that fail us,
1171 	 * and we try and submit the same tail again, force the context
1172 	 * reload.
1173 	 */
1174 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1175 	if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
1176 		desc |= CTX_DESC_FORCE_RESTORE;
1177 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1178 	rq->tail = rq->wa_tail;
1179 
1180 	/*
1181 	 * Make sure the context image is complete before we submit it to HW.
1182 	 *
1183 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1184 	 * an uncached write such as our mmio register access, the empirical
1185 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1186 	 * may not be visible to the HW prior to the completion of the UC
1187 	 * register write and that we may begin execution from the context
1188 	 * before its image is complete leading to invalid PD chasing.
1189 	 *
1190 	 * Furthermore, Braswell, at least, wants a full mb to be sure that
1191 	 * the writes are coherent in memory (visible to the GPU) prior to
1192 	 * execution, and not just visible to other CPUs (as is the result of
1193 	 * wmb).
1194 	 */
1195 	mb();
1196 
1197 	/* Wa_1607138340:tgl */
1198 	if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
1199 		desc |= CTX_DESC_FORCE_RESTORE;
1200 
1201 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1202 	return desc;
1203 }
1204 
1205 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1206 {
1207 	if (execlists->ctrl_reg) {
1208 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1209 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1210 	} else {
1211 		writel(upper_32_bits(desc), execlists->submit_reg);
1212 		writel(lower_32_bits(desc), execlists->submit_reg);
1213 	}
1214 }
1215 
1216 static __maybe_unused void
1217 trace_ports(const struct intel_engine_execlists *execlists,
1218 	    const char *msg,
1219 	    struct i915_request * const *ports)
1220 {
1221 	const struct intel_engine_cs *engine =
1222 		container_of(execlists, typeof(*engine), execlists);
1223 
1224 	if (!ports[0])
1225 		return;
1226 
1227 	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
1228 		  engine->name, msg,
1229 		  ports[0]->fence.context,
1230 		  ports[0]->fence.seqno,
1231 		  i915_request_completed(ports[0]) ? "!" :
1232 		  i915_request_started(ports[0]) ? "*" :
1233 		  "",
1234 		  ports[1] ? ports[1]->fence.context : 0,
1235 		  ports[1] ? ports[1]->fence.seqno : 0);
1236 }
1237 
1238 static __maybe_unused bool
1239 assert_pending_valid(const struct intel_engine_execlists *execlists,
1240 		     const char *msg)
1241 {
1242 	struct i915_request * const *port, *rq;
1243 	struct intel_context *ce = NULL;
1244 
1245 	trace_ports(execlists, msg, execlists->pending);
1246 
1247 	if (!execlists->pending[0]) {
1248 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1249 		return false;
1250 	}
1251 
1252 	if (execlists->pending[execlists_num_ports(execlists)]) {
1253 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1254 			      execlists_num_ports(execlists));
1255 		return false;
1256 	}
1257 
1258 	for (port = execlists->pending; (rq = *port); port++) {
1259 		if (ce == rq->hw_context) {
1260 			GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
1261 				      port - execlists->pending);
1262 			return false;
1263 		}
1264 
1265 		ce = rq->hw_context;
1266 		if (i915_request_completed(rq))
1267 			continue;
1268 
1269 		if (i915_active_is_idle(&ce->active)) {
1270 			GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
1271 				      port - execlists->pending);
1272 			return false;
1273 		}
1274 
1275 		if (!i915_vma_is_pinned(ce->state)) {
1276 			GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
1277 				      port - execlists->pending);
1278 			return false;
1279 		}
1280 
1281 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1282 			GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
1283 				      port - execlists->pending);
1284 			return false;
1285 		}
1286 	}
1287 
1288 	return ce;
1289 }
1290 
1291 static void execlists_submit_ports(struct intel_engine_cs *engine)
1292 {
1293 	struct intel_engine_execlists *execlists = &engine->execlists;
1294 	unsigned int n;
1295 
1296 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1297 
1298 	/*
1299 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1300 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1301 	 * not be relinquished until the device is idle (see
1302 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1303 	 * that all ELSP are drained i.e. we have processed the CSB,
1304 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1305 	 */
1306 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1307 
1308 	/*
1309 	 * ELSQ note: the submit queue is not cleared after being submitted
1310 	 * to the HW so we need to make sure we always clean it up. This is
1311 	 * currently ensured by the fact that we always write the same number
1312 	 * of elsq entries, keep this in mind before changing the loop below.
1313 	 */
1314 	for (n = execlists_num_ports(execlists); n--; ) {
1315 		struct i915_request *rq = execlists->pending[n];
1316 
1317 		write_desc(execlists,
1318 			   rq ? execlists_update_context(rq) : 0,
1319 			   n);
1320 	}
1321 
1322 	/* we need to manually load the submit queue */
1323 	if (execlists->ctrl_reg)
1324 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1325 }
1326 
1327 static bool ctx_single_port_submission(const struct intel_context *ce)
1328 {
1329 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1330 		i915_gem_context_force_single_submission(ce->gem_context));
1331 }
1332 
1333 static bool can_merge_ctx(const struct intel_context *prev,
1334 			  const struct intel_context *next)
1335 {
1336 	if (prev != next)
1337 		return false;
1338 
1339 	if (ctx_single_port_submission(prev))
1340 		return false;
1341 
1342 	return true;
1343 }
1344 
1345 static bool can_merge_rq(const struct i915_request *prev,
1346 			 const struct i915_request *next)
1347 {
1348 	GEM_BUG_ON(prev == next);
1349 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1350 
1351 	/*
1352 	 * We do not submit known completed requests. Therefore if the next
1353 	 * request is already completed, we can pretend to merge it in
1354 	 * with the previous context (and we will skip updating the ELSP
1355 	 * and tracking). Thus hopefully keeping the ELSP full with active
1356 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1357 	 * us.
1358 	 */
1359 	if (i915_request_completed(next))
1360 		return true;
1361 
1362 	if (unlikely((prev->flags ^ next->flags) &
1363 		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
1364 		return false;
1365 
1366 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
1367 		return false;
1368 
1369 	return true;
1370 }
1371 
/*
 * Apply the register offsets of @engine (as returned by reg_offsets()) to
 * the context register state @regs via set_offsets().
 *
 * In this file it is called from execlists_dequeue() when a virtual
 * engine request moves to a different sibling that lacks relative mmio.
 */
static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine);
}
1377 
1378 static bool virtual_matches(const struct virtual_engine *ve,
1379 			    const struct i915_request *rq,
1380 			    const struct intel_engine_cs *engine)
1381 {
1382 	const struct intel_engine_cs *inflight;
1383 
1384 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1385 		return false;
1386 
1387 	/*
1388 	 * We track when the HW has completed saving the context image
1389 	 * (i.e. when we have seen the final CS event switching out of
1390 	 * the context) and must not overwrite the context image before
1391 	 * then. This restricts us to only using the active engine
1392 	 * while the previous virtualized request is inflight (so
1393 	 * we reuse the register offsets). This is a very small
1394 	 * hystersis on the greedy seelction algorithm.
1395 	 */
1396 	inflight = intel_context_inflight(&ve->context);
1397 	if (inflight && inflight != engine)
1398 		return false;
1399 
1400 	return true;
1401 }
1402 
1403 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1404 				     struct intel_engine_cs *engine)
1405 {
1406 	struct intel_engine_cs *old = ve->siblings[0];
1407 
1408 	/* All unattached (rq->engine == old) must already be completed */
1409 
1410 	spin_lock(&old->breadcrumbs.irq_lock);
1411 	if (!list_empty(&ve->context.signal_link)) {
1412 		list_move_tail(&ve->context.signal_link,
1413 			       &engine->breadcrumbs.signalers);
1414 		intel_engine_queue_breadcrumbs(engine);
1415 	}
1416 	spin_unlock(&old->breadcrumbs.irq_lock);
1417 }
1418 
1419 static struct i915_request *
1420 last_active(const struct intel_engine_execlists *execlists)
1421 {
1422 	struct i915_request * const *last = READ_ONCE(execlists->active);
1423 
1424 	while (*last && i915_request_completed(*last))
1425 		last++;
1426 
1427 	return *last;
1428 }
1429 
1430 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1431 {
1432 	LIST_HEAD(list);
1433 
1434 	/*
1435 	 * We want to move the interrupted request to the back of
1436 	 * the round-robin list (i.e. its priority level), but
1437 	 * in doing so, we must then move all requests that were in
1438 	 * flight and were waiting for the interrupted request to
1439 	 * be run after it again.
1440 	 */
1441 	do {
1442 		struct i915_dependency *p;
1443 
1444 		GEM_BUG_ON(i915_request_is_active(rq));
1445 		list_move_tail(&rq->sched.link, pl);
1446 
1447 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1448 			struct i915_request *w =
1449 				container_of(p->waiter, typeof(*w), sched);
1450 
1451 			/* Leave semaphores spinning on the other engines */
1452 			if (w->engine != rq->engine)
1453 				continue;
1454 
1455 			/* No waiter should start before its signaler */
1456 			GEM_BUG_ON(i915_request_started(w) &&
1457 				   !i915_request_completed(rq));
1458 
1459 			GEM_BUG_ON(i915_request_is_active(w));
1460 			if (list_empty(&w->sched.link))
1461 				continue; /* Not yet submitted; unready */
1462 
1463 			if (rq_prio(w) < rq_prio(rq))
1464 				continue;
1465 
1466 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1467 			list_move_tail(&w->sched.link, &list);
1468 		}
1469 
1470 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1471 	} while (rq);
1472 }
1473 
/*
 * Unwind the incomplete requests of @engine and, if that yields a
 * request, defer it to the back of its priority list.
 */
static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request * const rq = __unwind_incomplete_requests(engine);

	if (rq)
		defer_request(rq,
			      i915_sched_lookup_priolist(engine, rq_prio(rq)));
}
1484 
1485 static bool
1486 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1487 {
1488 	int hint;
1489 
1490 	if (!intel_engine_has_timeslices(engine))
1491 		return false;
1492 
1493 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1494 		return false;
1495 
1496 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1497 		   engine->execlists.queue_priority_hint);
1498 
1499 	return hint >= effective_prio(rq);
1500 }
1501 
1502 static int
1503 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1504 {
1505 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1506 		return INT_MIN;
1507 
1508 	return rq_prio(list_next_entry(rq, sched.link));
1509 }
1510 
1511 static inline unsigned long
1512 timeslice(const struct intel_engine_cs *engine)
1513 {
1514 	return READ_ONCE(engine->props.timeslice_duration_ms);
1515 }
1516 
1517 static unsigned long
1518 active_timeslice(const struct intel_engine_cs *engine)
1519 {
1520 	const struct i915_request *rq = *engine->execlists.active;
1521 
1522 	if (i915_request_completed(rq))
1523 		return 0;
1524 
1525 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1526 		return 0;
1527 
1528 	return timeslice(engine);
1529 }
1530 
1531 static void set_timeslice(struct intel_engine_cs *engine)
1532 {
1533 	if (!intel_engine_has_timeslices(engine))
1534 		return;
1535 
1536 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1537 }
1538 
/*
 * Count a preemption in execlists->preempt_hang.count.
 *
 * NOTE(review): the increment is wrapped in I915_SELFTEST_ONLY(), which
 * presumably drops it when selftests are not built - confirm against the
 * macro definition. The (void) cast discards the expression's value.
 */
static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}
1543 
1544 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1545 {
1546 	struct i915_request *rq;
1547 
1548 	rq = last_active(&engine->execlists);
1549 	if (!rq)
1550 		return 0;
1551 
1552 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1553 	if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
1554 		return 1;
1555 
1556 	return READ_ONCE(engine->props.preempt_timeout_ms);
1557 }
1558 
1559 static void set_preempt_timeout(struct intel_engine_cs *engine)
1560 {
1561 	if (!intel_engine_has_preempt_reset(engine))
1562 		return;
1563 
1564 	set_timer_ms(&engine->execlists.preempt,
1565 		     active_preempt_timeout(engine));
1566 }
1567 
/*
 * Pick the next set of requests for @engine and, if anything changed,
 * submit them to the hardware.
 *
 * The function proceeds in stages:
 *  1. Find the first virtual engine request that may run on @engine,
 *     erasing tree nodes whose request has already gone.
 *  2. Inspect the currently active request: preempt it, timeslice it, or
 *     return early if something is already queued behind it.
 *  3. Consume at most one submittable virtual engine request, rebinding
 *     the virtual engine to @engine if it was last bound elsewhere.
 *  4. Drain the priority queue, merging consecutive requests of the same
 *     context into one port and stopping when the ports are full.
 *  5. Update the priority hints, fill execlists->pending[] and either
 *     submit the ports or, if nothing new is to be submitted, unpause the
 *     ring.
 */
static void execlists_dequeue(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct i915_request **port = execlists->pending;
	struct i915_request ** const last_port = port + execlists->port_mask;
	struct i915_request *last;
	struct rb_node *rb;
	bool submit = false;

	/*
	 * Hardware submission is through 2 ports. Conceptually each port
	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
	 * static for a context, and unique to each, so we only execute
	 * requests belonging to a single context from each ring. RING_HEAD
	 * is maintained by the CS in the context image, it marks the place
	 * where it got up to last time, and through RING_TAIL we tell the CS
	 * where we want to execute up to this time.
	 *
	 * In this list the requests are in order of execution. Consecutive
	 * requests from the same context are adjacent in the ringbuffer. We
	 * can combine these requests into a single RING_TAIL update:
	 *
	 *              RING_HEAD...req1...req2
	 *                                    ^- RING_TAIL
	 * since to execute req2 the CS must first execute req1.
	 *
	 * Our goal then is to point each port to the end of a consecutive
	 * sequence of requests as being the most optimal (fewest wake ups
	 * and context switches) submission.
	 */

	/*
	 * Stage 1: leave rb pointing at the first virtual engine node whose
	 * request can run here (or NULL if there is none).
	 */
	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (!rq) { /* lazily cleanup after another engine handled rq */
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		if (!virtual_matches(ve, rq, engine)) {
			rb = rb_next(rb);
			continue;
		}

		break;
	}

	/*
	 * If the queue is higher priority than the last
	 * request in the currently active context, submit afresh.
	 * We will resubmit again afterwards in case we need to split
	 * the active context to interject the preemption request,
	 * i.e. we will retrigger preemption following the ack in case
	 * of trouble.
	 */
	last = last_active(execlists);
	if (last) {
		if (need_preempt(engine, last, rb)) {
			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
				  engine->name,
				  last->fence.context,
				  last->fence.seqno,
				  last->sched.attr.priority,
				  execlists->queue_priority_hint);
			record_preemption(execlists);

			/*
			 * Don't let the RING_HEAD advance past the breadcrumb
			 * as we unwind (and until we resubmit) so that we do
			 * not accidentally tell it to go backwards.
			 */
			ring_set_paused(engine, 1);

			/*
			 * Note that we have not stopped the GPU at this point,
			 * so we are unwinding the incomplete requests as they
			 * remain inflight and so by the time we do complete
			 * the preemption, some of the unwound requests may
			 * complete!
			 */
			__unwind_incomplete_requests(engine);

			/*
			 * If we need to return to the preempted context, we
			 * need to skip the lite-restore and force it to
			 * reload the RING_TAIL. Otherwise, the HW has a
			 * tendency to ignore us rewinding the TAIL to the
			 * end of an earlier request.
			 */
			last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
			last = NULL;
		} else if (need_timeslice(engine, last) &&
			   timer_expired(&engine->execlists.timer)) {
			GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
				  engine->name,
				  last->fence.context,
				  last->fence.seqno,
				  last->sched.attr.priority,
				  execlists->queue_priority_hint);

			ring_set_paused(engine, 1);
			defer_active(engine);

			/*
			 * Unlike for preemption, if we rewind and continue
			 * executing the same context as previously active,
			 * the order of execution will remain the same and
			 * the tail will only advance. We do not need to
			 * force a full context restore, as a lite-restore
			 * is sufficient to resample the monotonic TAIL.
			 *
			 * If we switch to any other context, similarly we
			 * will not rewind TAIL of current context, and
			 * normal save/restore will preserve state and allow
			 * us to later continue executing the same request.
			 */
			last = NULL;
		} else {
			/*
			 * Otherwise if we already have a request pending
			 * for execution after the current one, we can
			 * just wait until the next CS event before
			 * queuing more. In either case we will force a
			 * lite-restore preemption event, but if we wait
			 * we hopefully coalesce several updates into a single
			 * submission.
			 */
			if (!list_is_last(&last->sched.link,
					  &engine->active.requests)) {
				/*
				 * Even if ELSP[1] is occupied and not worthy
				 * of timeslices, our queue might be.
				 */
				if (!execlists->timer.expires &&
				    need_timeslice(engine, last))
					set_timer_ms(&execlists->timer,
						     timeslice(engine));

				return;
			}
		}
	}

	/*
	 * Stage 3: try to take one request from the virtual engines. The
	 * virtual engine's own active.lock is held while its request is
	 * examined and claimed.
	 */
	while (rb) { /* XXX virtual is always taking precedence */
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq;

		spin_lock(&ve->base.active.lock);

		rq = ve->request;
		if (unlikely(!rq)) { /* lost the race to a sibling */
			spin_unlock(&ve->base.active.lock);
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&execlists->virtual);
			continue;
		}

		GEM_BUG_ON(rq != ve->request);
		GEM_BUG_ON(rq->engine != &ve->base);
		GEM_BUG_ON(rq->hw_context != &ve->context);

		/* Only take it if it is not outranked by the normal queue */
		if (rq_prio(rq) >= queue_prio(execlists)) {
			if (!virtual_matches(ve, rq, engine)) {
				spin_unlock(&ve->base.active.lock);
				rb = rb_next(rb);
				continue;
			}

			if (last && !can_merge_rq(last, rq)) {
				spin_unlock(&ve->base.active.lock);
				return; /* leave this for another */
			}

			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  i915_request_completed(rq) ? "!" :
				  i915_request_started(rq) ? "*" :
				  "",
				  yesno(engine != ve->siblings[0]));

			/* Claim the request and drop the node from our tree */
			ve->request = NULL;
			ve->base.execlists.queue_priority_hint = INT_MIN;
			rb_erase_cached(rb, &execlists->virtual);
			RB_CLEAR_NODE(rb);

			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
			rq->engine = engine;

			/* siblings[0] is the engine the context was last bound to */
			if (engine != ve->siblings[0]) {
				u32 *regs = ve->context.lrc_reg_state;
				unsigned int n;

				GEM_BUG_ON(READ_ONCE(ve->context.inflight));

				if (!intel_engine_has_relative_mmio(engine))
					virtual_update_register_offsets(regs,
									engine);

				if (!list_empty(&ve->context.signals))
					virtual_xfer_breadcrumbs(ve, engine);

				/*
				 * Move the bound engine to the top of the list
				 * for future execution. We then kick this
				 * tasklet first before checking others, so that
				 * we preferentially reuse this set of bound
				 * registers.
				 */
				for (n = 1; n < ve->num_siblings; n++) {
					if (ve->siblings[n] == engine) {
						swap(ve->siblings[n],
						     ve->siblings[0]);
						break;
					}
				}

				GEM_BUG_ON(ve->siblings[0] != engine);
			}

			if (__i915_request_submit(rq)) {
				submit = true;
				last = rq;
			}
			i915_request_put(rq);

			/*
			 * Hmm, we have a bunch of virtual engine requests,
			 * but the first one was already completed (thanks
			 * preempt-to-busy!). Keep looking at the veng queue
			 * until we have no more relevant requests (i.e.
			 * the normal submit queue has higher priority).
			 */
			if (!submit) {
				spin_unlock(&ve->base.active.lock);
				rb = rb_first_cached(&execlists->virtual);
				continue;
			}
		}

		spin_unlock(&ve->base.active.lock);
		break;
	}

	/*
	 * Stage 4: drain the priority queue, one priolist at a time. Each
	 * fully consumed priolist is erased from the tree and freed.
	 */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		struct i915_request *rq, *rn;
		int i;

		priolist_for_each_request_consume(rq, rn, p, i) {
			bool merge = true;

			/*
			 * Can we combine this request with the current port?
			 * It has to be the same context/ringbuffer and not
			 * have any exceptions (e.g. GVT saying never to
			 * combine contexts).
			 *
			 * If we can combine the requests, we can execute both
			 * by updating the RING_TAIL to point to the end of the
			 * second request, and so we never need to tell the
			 * hardware about the first.
			 */
			if (last && !can_merge_rq(last, rq)) {
				/*
				 * If we are on the second port and cannot
				 * combine this request with the last, then we
				 * are done.
				 */
				if (port == last_port)
					goto done;

				/*
				 * We must not populate both ELSP[] with the
				 * same LRCA, i.e. we must submit 2 different
				 * contexts if we submit 2 ELSP.
				 */
				if (last->hw_context == rq->hw_context)
					goto done;

				if (i915_request_has_sentinel(last))
					goto done;

				/*
				 * If GVT overrides us we only ever submit
				 * port[0], leaving port[1] empty. Note that we
				 * also have to be careful that we don't queue
				 * the same context (even though a different
				 * request) to the second port.
				 */
				if (ctx_single_port_submission(last->hw_context) ||
				    ctx_single_port_submission(rq->hw_context))
					goto done;

				merge = false;
			}

			if (__i915_request_submit(rq)) {
				/*
				 * Not mergeable: close the current port with
				 * the previous request and advance to the next.
				 */
				if (!merge) {
					*port = execlists_schedule_in(last, port - execlists->pending);
					port++;
					last = NULL;
				}

				GEM_BUG_ON(last &&
					   !can_merge_ctx(last->hw_context,
							  rq->hw_context));

				submit = true;
				last = rq;
			}
		}

		rb_erase_cached(&p->node, &execlists->queue);
		i915_priolist_free(p);
	}

done:
	/*
	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
	 *
	 * We choose the priority hint such that if we add a request of greater
	 * priority than this, we kick the submission tasklet to decide on
	 * the right order of submitting the requests to hardware. We must
	 * also be prepared to reorder requests as they are in-flight on the
	 * HW. We derive the priority hint then as the first "hole" in
	 * the HW submission ports and if there are no available slots,
	 * the priority of the lowest executing request, i.e. last.
	 *
	 * When we do receive a higher priority request ready to run from the
	 * user, see queue_request(), the priority hint is bumped to that
	 * request triggering preemption on the next dequeue (or subsequent
	 * interrupt for secondary ports).
	 */
	execlists->queue_priority_hint = queue_prio(execlists);
	GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
		  engine->name, execlists->queue_priority_hint,
		  yesno(submit));

	if (submit) {
		/* Close the final port with the last request gathered */
		*port = execlists_schedule_in(last, port - execlists->pending);
		execlists->switch_priority_hint =
			switch_prio(engine, *execlists->pending);

		/*
		 * Skip if we ended up with exactly the same set of requests,
		 * e.g. trying to timeslice a pair of ordered contexts
		 */
		if (!memcmp(execlists->active, execlists->pending,
			    (port - execlists->pending + 1) * sizeof(*port))) {
			/* Undo each schedule_in and clear the pending slots */
			do
				execlists_schedule_out(fetch_and_zero(port));
			while (port-- != execlists->pending);

			goto skip_submit;
		}

		/* Zero any ports after the last one filled */
		memset(port + 1, 0, (last_port - port) * sizeof(*port));
		execlists_submit_ports(engine);

		set_preempt_timeout(engine);
	} else {
skip_submit:
		/* Nothing new for the HW: undo any ring pause set above */
		ring_set_paused(engine, 0);
	}
}
1941 
/*
 * cancel_port_requests - schedule out everything tracked in the ports
 * @execlists: execlists state whose pending[] and active requests are dropped
 *
 * Calls execlists_schedule_out() on every request found in pending[] and in
 * the array currently referenced by ->active, then leaves pending[] and
 * inflight[] zeroed with ->active pointing at inflight[].
 */
static void
cancel_port_requests(struct intel_engine_execlists * const execlists)
{
	struct i915_request * const *port;

	/* Walk pending[] up to its first NULL entry, then wipe the array. */
	for (port = execlists->pending; *port; port++)
		execlists_schedule_out(*port);
	memset(execlists->pending, 0, sizeof(execlists->pending));

	/* Mark the end of active before we overwrite *active */
	/*
	 * xchg() hands back the old ->active array for us to walk while
	 * ->active itself is redirected at the pending[] we have just zeroed.
	 */
	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
		execlists_schedule_out(*port);
	/* memset() returns inflight[], which becomes the new ->active. */
	WRITE_ONCE(execlists->active,
		   memset(execlists->inflight, 0, sizeof(execlists->inflight)));
}
1957 
1958 static inline void
1959 invalidate_csb_entries(const u32 *first, const u32 *last)
1960 {
1961 	clflush((void *)first);
1962 	clflush((void *)last);
1963 }
1964 
1965 static inline bool
1966 reset_in_progress(const struct intel_engine_execlists *execlists)
1967 {
1968 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1969 }
1970 
1971 /*
1972  * Starting with Gen12, the status has a new format:
1973  *
1974  *     bit  0:     switched to new queue
1975  *     bit  1:     reserved
1976  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1977  *                 switch detail is set to "wait on semaphore"
1978  *     bits 3-5:   engine class
1979  *     bits 6-11:  engine instance
1980  *     bits 12-14: reserved
1981  *     bits 15-25: sw context id of the lrc the GT switched to
1982  *     bits 26-31: sw counter of the lrc the GT switched to
1983  *     bits 32-35: context switch detail
1984  *                  - 0: ctx complete
1985  *                  - 1: wait on sync flip
1986  *                  - 2: wait on vblank
1987  *                  - 3: wait on scanline
1988  *                  - 4: wait on semaphore
1989  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1990  *                       WAIT_FOR_EVENT)
1991  *     bit  36:    reserved
1992  *     bits 37-43: wait detail (for switch detail 1 to 4)
1993  *     bits 44-46: reserved
1994  *     bits 47-57: sw context id of the lrc the GT switched away from
1995  *     bits 58-63: sw counter of the lrc the GT switched away from
1996  */
1997 static inline bool
1998 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1999 {
2000 	u32 lower_dw = csb[0];
2001 	u32 upper_dw = csb[1];
2002 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2003 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2004 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2005 
2006 	/*
2007 	 * The context switch detail is not guaranteed to be 5 when a preemption
2008 	 * occurs, so we can't just check for that. The check below works for
2009 	 * all the cases we care about, including preemptions of WAIT
2010 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2011 	 * would require some extra handling, but we don't support that.
2012 	 */
2013 	if (!ctx_away_valid || new_queue) {
2014 		GEM_BUG_ON(!ctx_to_valid);
2015 		return true;
2016 	}
2017 
2018 	/*
2019 	 * switch detail = 5 is covered by the case above and we do not expect a
2020 	 * context switch on an unsuccessful wait instruction since we always
2021 	 * use polling mode.
2022 	 */
2023 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2024 	return false;
2025 }
2026 
2027 static inline bool
2028 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2029 {
2030 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2031 }
2032 
/*
 * process_csb - consume the unread context status buffer (CSB) entries
 * @engine: engine whose execlists tracking is updated
 *
 * Walks the CSB from the cached csb_head up to the write pointer sampled
 * from *csb_write. Every entry is classified by gen12_csb_parse() or
 * gen8_csb_parse(): a "promote" event schedules out the whole old active set
 * and copies pending[] into inflight[]; any other event schedules out the
 * request at the front of active and advances active by one. On exit the new
 * head is cached and the first and last CSB entries are handed to
 * invalidate_csb_entries().
 *
 * Must be called either from the execlists tasklet or with the tasklet
 * disabled; this is asserted on entry.
 */
static void process_csb(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	const u32 * const buf = execlists->csb_status;
	const u8 num_entries = execlists->csb_size;
	u8 head, tail;

	/*
	 * As we modify our execlists state tracking we require exclusive
	 * access. Either we are inside the tasklet, or the tasklet is disabled
	 * and we assume that is only inside the reset paths and so serialised.
	 */
	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
		   !reset_in_progress(execlists));
	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));

	/*
	 * Note that csb_write, csb_status may be either in HWSP or mmio.
	 * When reading from the csb_write mmio register, we have to be
	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
	 * the low 4bits. As it happens we know the next 4bits are always
	 * zero and so we can simply mask off the low u8 of the register
	 * and treat it identically to reading from the HWSP (without having
	 * to use explicit shifting and masking, and probably bifurcating
	 * the code to handle the legacy mmio read).
	 */
	head = execlists->csb_head;
	tail = READ_ONCE(*execlists->csb_write);
	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
	/* Nothing new has been written since the last pass. */
	if (unlikely(head == tail))
		return;

	/*
	 * Hopefully paired with a wmb() in HW!
	 *
	 * We must complete the read of the write pointer before any reads
	 * from the CSB, so that we do not see stale values. Without an rmb
	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
	 * we perform the READ_ONCE(*csb_write).
	 */
	rmb();

	do {
		bool promote;

		/* Advance to the next entry, wrapping at the end of the buffer. */
		if (++head == num_entries)
			head = 0;

		/*
		 * We are flying near dragons again.
		 *
		 * We hold a reference to the request in execlist_port[]
		 * but no more than that. We are operating in softirq
		 * context and so cannot hold any mutex or sleep. That
		 * prevents us stopping the requests we are processing
		 * in port[] from being retired simultaneously (the
		 * breadcrumb will be complete before we see the
		 * context-switch). As we only hold the reference to the
		 * request, any pointer chasing underneath the request
		 * is subject to a potential use-after-free. Thus we
		 * store all of the bookkeeping within port[] as
		 * required, and avoid using unguarded pointers beneath
		 * request itself. The same applies to the atomic
		 * status notifier.
		 */

		/* Each CSB entry is a pair of dwords: buf[2 * head + {0,1}]. */
		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
			  engine->name, head,
			  buf[2 * head + 0], buf[2 * head + 1]);

		if (INTEL_GEN(engine->i915) >= 12)
			promote = gen12_csb_parse(execlists, buf + 2 * head);
		else
			promote = gen8_csb_parse(execlists, buf + 2 * head);
		if (promote) {
			/* Remember the outgoing set so it can be scheduled out below. */
			struct i915_request * const *old = execlists->active;

			/* Point active to the new ELSP; prevent overwriting */
			WRITE_ONCE(execlists->active, execlists->pending);
			set_timeslice(engine);

			if (!inject_preempt_hang(execlists))
				ring_set_paused(engine, 0);

			/* cancel old inflight, prepare for switch */
			trace_ports(execlists, "preempted", old);
			while (*old)
				execlists_schedule_out(*old++);

			/* switch pending to inflight */
			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
			/* memcpy() returns inflight[], which becomes ->active. */
			WRITE_ONCE(execlists->active,
				   memcpy(execlists->inflight,
					  execlists->pending,
					  execlists_num_ports(execlists) *
					  sizeof(*execlists->pending)));

			/* An empty pending[0] marks pending[] as consumed. */
			WRITE_ONCE(execlists->pending[0], NULL);
		} else {
			GEM_BUG_ON(!*execlists->active);

			/* port0 completed, advanced to port1 */
			trace_ports(execlists, "completed", execlists->active);

			/*
			 * We rely on the hardware being strongly
			 * ordered, that the breadcrumb write is
			 * coherent (visible from the CPU) before the
			 * user interrupt and CSB is processed.
			 */
			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
				   !reset_in_progress(execlists));
			execlists_schedule_out(*execlists->active++);

			GEM_BUG_ON(execlists->active - execlists->inflight >
				   execlists_num_ports(execlists));
		}
	} while (head != tail);

	/* Cache how far we got for the next invocation. */
	execlists->csb_head = head;

	/*
	 * Gen11 has proven to fail wrt global observation point between
	 * entry and tail update, failing on the ordering and thus
	 * we see an old entry in the context status buffer.
	 *
	 * Forcibly evict the entries ahead of the next gpu csb update,
	 * to increase the odds that we get fresh entries with non
	 * working hardware. The cost for doing so comes out mostly in
	 * the wash as hardware, working or not, will need to do the
	 * invalidation before.
	 */
	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
}
2167 
2168 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2169 {
2170 	lockdep_assert_held(&engine->active.lock);
2171 	if (!engine->execlists.pending[0]) {
2172 		rcu_read_lock(); /* protect peeking at execlists->active */
2173 		execlists_dequeue(engine);
2174 		rcu_read_unlock();
2175 	}
2176 }
2177 
2178 static noinline void preempt_reset(struct intel_engine_cs *engine)
2179 {
2180 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2181 	unsigned long *lock = &engine->gt->reset.flags;
2182 
2183 	if (i915_modparams.reset < 3)
2184 		return;
2185 
2186 	if (test_and_set_bit(bit, lock))
2187 		return;
2188 
2189 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2190 	tasklet_disable_nosync(&engine->execlists.tasklet);
2191 
2192 	GEM_TRACE("%s: preempt timeout %lu+%ums\n",
2193 		  engine->name,
2194 		  READ_ONCE(engine->props.preempt_timeout_ms),
2195 		  jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2196 	intel_engine_reset(engine, "preemption time out");
2197 
2198 	tasklet_enable(&engine->execlists.tasklet);
2199 	clear_and_wake_up_bit(bit, lock);
2200 }
2201 
2202 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2203 {
2204 	const struct timer_list *t = &engine->execlists.preempt;
2205 
2206 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2207 		return false;
2208 
2209 	if (!timer_expired(t))
2210 		return false;
2211 
2212 	return READ_ONCE(engine->execlists.pending[0]);
2213 }
2214 
2215 /*
2216  * Check the unread Context Status Buffers and manage the submission of new
2217  * contexts to the ELSP accordingly.
2218  */
2219 static void execlists_submission_tasklet(unsigned long data)
2220 {
2221 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2222 	bool timeout = preempt_timeout(engine);
2223 
2224 	process_csb(engine);
2225 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2226 		unsigned long flags;
2227 
2228 		spin_lock_irqsave(&engine->active.lock, flags);
2229 		__execlists_submission_tasklet(engine);
2230 		spin_unlock_irqrestore(&engine->active.lock, flags);
2231 
2232 		/* Recheck after serialising with direct-submission */
2233 		if (timeout && preempt_timeout(engine))
2234 			preempt_reset(engine);
2235 	}
2236 }
2237 
2238 static void __execlists_kick(struct intel_engine_execlists *execlists)
2239 {
2240 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2241 	tasklet_hi_schedule(&execlists->tasklet);
2242 }
2243 
/*
 * execlists_kick - kick the execlists that embeds @t as its field @member
 *
 * Resolves the enclosing struct intel_engine_execlists with container_of()
 * and passes it to __execlists_kick().
 */
#define execlists_kick(t, member) \
	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2246 
2247 static void execlists_timeslice(struct timer_list *timer)
2248 {
2249 	execlists_kick(timer, timer);
2250 }
2251 
2252 static void execlists_preempt(struct timer_list *timer)
2253 {
2254 	execlists_kick(timer, preempt);
2255 }
2256 
2257 static void queue_request(struct intel_engine_cs *engine,
2258 			  struct i915_sched_node *node,
2259 			  int prio)
2260 {
2261 	GEM_BUG_ON(!list_empty(&node->link));
2262 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2263 }
2264 
2265 static void __submit_queue_imm(struct intel_engine_cs *engine)
2266 {
2267 	struct intel_engine_execlists * const execlists = &engine->execlists;
2268 
2269 	if (reset_in_progress(execlists))
2270 		return; /* defer until we restart the engine following reset */
2271 
2272 	if (execlists->tasklet.func == execlists_submission_tasklet)
2273 		__execlists_submission_tasklet(engine);
2274 	else
2275 		tasklet_hi_schedule(&execlists->tasklet);
2276 }
2277 
2278 static void submit_queue(struct intel_engine_cs *engine,
2279 			 const struct i915_request *rq)
2280 {
2281 	struct intel_engine_execlists *execlists = &engine->execlists;
2282 
2283 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2284 		return;
2285 
2286 	execlists->queue_priority_hint = rq_prio(rq);
2287 	__submit_queue_imm(engine);
2288 }
2289 
2290 static void execlists_submit_request(struct i915_request *request)
2291 {
2292 	struct intel_engine_cs *engine = request->engine;
2293 	unsigned long flags;
2294 
2295 	/* Will be called from irq-context when using foreign fences. */
2296 	spin_lock_irqsave(&engine->active.lock, flags);
2297 
2298 	queue_request(engine, &request->sched, rq_prio(request));
2299 
2300 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2301 	GEM_BUG_ON(list_empty(&request->sched.link));
2302 
2303 	submit_queue(engine, request);
2304 
2305 	spin_unlock_irqrestore(&engine->active.lock, flags);
2306 }
2307 
2308 static void __execlists_context_fini(struct intel_context *ce)
2309 {
2310 	intel_ring_put(ce->ring);
2311 	i915_vma_put(ce->state);
2312 }
2313 
2314 static void execlists_context_destroy(struct kref *kref)
2315 {
2316 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2317 
2318 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2319 	GEM_BUG_ON(intel_context_is_pinned(ce));
2320 
2321 	if (ce->state)
2322 		__execlists_context_fini(ce);
2323 
2324 	intel_context_fini(ce);
2325 	intel_context_free(ce);
2326 }
2327 
2328 static void
2329 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2330 {
2331 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2332 		return;
2333 
2334 	vaddr += engine->context_size;
2335 
2336 	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
2337 }
2338 
2339 static void
2340 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2341 {
2342 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2343 		return;
2344 
2345 	vaddr += engine->context_size;
2346 
2347 	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
2348 		dev_err_once(engine->i915->drm.dev,
2349 			     "%s context redzone overwritten!\n",
2350 			     engine->name);
2351 }
2352 
2353 static void execlists_context_unpin(struct intel_context *ce)
2354 {
2355 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2356 		      ce->engine);
2357 
2358 	i915_gem_object_unpin_map(ce->state->obj);
2359 	intel_ring_reset(ce->ring, ce->ring->tail);
2360 }
2361 
2362 static void
2363 __execlists_update_reg_state(const struct intel_context *ce,
2364 			     const struct intel_engine_cs *engine)
2365 {
2366 	struct intel_ring *ring = ce->ring;
2367 	u32 *regs = ce->lrc_reg_state;
2368 
2369 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2370 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2371 
2372 	regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma);
2373 	regs[CTX_RING_HEAD] = ring->head;
2374 	regs[CTX_RING_TAIL] = ring->tail;
2375 
2376 	/* RPCS */
2377 	if (engine->class == RENDER_CLASS) {
2378 		regs[CTX_R_PWR_CLK_STATE] =
2379 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2380 
2381 		i915_oa_init_reg_state(ce, engine);
2382 	}
2383 }
2384 
2385 static int
2386 __execlists_context_pin(struct intel_context *ce,
2387 			struct intel_engine_cs *engine)
2388 {
2389 	void *vaddr;
2390 	int ret;
2391 
2392 	GEM_BUG_ON(!ce->state);
2393 
2394 	ret = intel_context_active_acquire(ce);
2395 	if (ret)
2396 		goto err;
2397 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2398 
2399 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2400 					i915_coherent_map_type(engine->i915) |
2401 					I915_MAP_OVERRIDE);
2402 	if (IS_ERR(vaddr)) {
2403 		ret = PTR_ERR(vaddr);
2404 		goto unpin_active;
2405 	}
2406 
2407 	ce->lrc_desc = lrc_descriptor(ce, engine);
2408 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2409 	__execlists_update_reg_state(ce, engine);
2410 
2411 	return 0;
2412 
2413 unpin_active:
2414 	intel_context_active_release(ce);
2415 err:
2416 	return ret;
2417 }
2418 
2419 static int execlists_context_pin(struct intel_context *ce)
2420 {
2421 	return __execlists_context_pin(ce, ce->engine);
2422 }
2423 
2424 static int execlists_context_alloc(struct intel_context *ce)
2425 {
2426 	return __execlists_context_alloc(ce, ce->engine);
2427 }
2428 
2429 static void execlists_context_reset(struct intel_context *ce)
2430 {
2431 	/*
2432 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2433 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2434 	 * that stored in context. As we only write new commands from
2435 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2436 	 * starts reading from its RING_HEAD from the context, it may try to
2437 	 * execute that junk and die.
2438 	 *
2439 	 * The contexts that are stilled pinned on resume belong to the
2440 	 * kernel, and are local to each engine. All other contexts will
2441 	 * have their head/tail sanitized upon pinning before use, so they
2442 	 * will never see garbage,
2443 	 *
2444 	 * So to avoid that we reset the context images upon resume. For
2445 	 * simplicity, we just zero everything out.
2446 	 */
2447 	intel_ring_reset(ce->ring, 0);
2448 	__execlists_update_reg_state(ce, ce->engine);
2449 }
2450 
/*
 * intel_context operations for execlists submission. The alloc, pin, unpin,
 * reset and destroy hooks are the execlists_context_*() functions in this
 * file; enter and exit use the generic intel_context_*_engine() helpers.
 */
static const struct intel_context_ops execlists_context_ops = {
	.alloc = execlists_context_alloc,

	.pin = execlists_context_pin,
	.unpin = execlists_context_unpin,

	.enter = intel_context_enter_engine,
	.exit = intel_context_exit_engine,

	.reset = execlists_context_reset,
	.destroy = execlists_context_destroy,
};
2463 
2464 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2465 {
2466 	u32 *cs;
2467 
2468 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2469 
2470 	cs = intel_ring_begin(rq, 6);
2471 	if (IS_ERR(cs))
2472 		return PTR_ERR(cs);
2473 
2474 	/*
2475 	 * Check if we have been preempted before we even get started.
2476 	 *
2477 	 * After this point i915_request_started() reports true, even if
2478 	 * we get preempted and so are no longer running.
2479 	 */
2480 	*cs++ = MI_ARB_CHECK;
2481 	*cs++ = MI_NOOP;
2482 
2483 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2484 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2485 	*cs++ = 0;
2486 	*cs++ = rq->fence.seqno - 1;
2487 
2488 	intel_ring_advance(rq, cs);
2489 
2490 	/* Record the updated position of the request's payload */
2491 	rq->infix = intel_ring_offset(rq, cs);
2492 
2493 	return 0;
2494 }
2495 
2496 static int execlists_request_alloc(struct i915_request *request)
2497 {
2498 	int ret;
2499 
2500 	GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
2501 
2502 	/*
2503 	 * Flush enough space to reduce the likelihood of waiting after
2504 	 * we start building the request - in which case we will just
2505 	 * have to repeat work.
2506 	 */
2507 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2508 
2509 	/*
2510 	 * Note that after this point, we have committed to using
2511 	 * this request as it is being used to both track the
2512 	 * state of engine initialisation and liveness of the
2513 	 * golden renderstate above. Think twice before you try
2514 	 * to cancel/unwind this request now.
2515 	 */
2516 
2517 	/* Unconditionally invalidate GPU caches and TLBs. */
2518 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2519 	if (ret)
2520 		return ret;
2521 
2522 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2523 	return 0;
2524 }
2525 
2526 /*
2527  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2528  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2529  * but there is a slight complication as this is applied in WA batch where the
2530  * values are only initialized once so we cannot take register value at the
2531  * beginning and reuse it further; hence we save its value to memory, upload a
2532  * constant value with bit21 set and then we restore it back with the saved value.
2533  * To simplify the WA, a constant value is formed by using the default value
2534  * of this register. This shouldn't be a problem because we are only modifying
2535  * it for a short period and this batch in non-premptible. We can ofcourse
2536  * use additional instructions that read the actual value of the register
2537  * at that time and set our bit of interest but it makes the WA complicated.
2538  *
2539  * This WA is also required for Gen9 so extracting as a function avoids
2540  * code duplication.
2541  */
2542 static u32 *
2543 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2544 {
2545 	/* NB no one else is allowed to scribble over scratch + 256! */
2546 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2547 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2548 	*batch++ = intel_gt_scratch_offset(engine->gt,
2549 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2550 	*batch++ = 0;
2551 
2552 	*batch++ = MI_LOAD_REGISTER_IMM(1);
2553 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2554 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2555 
2556 	batch = gen8_emit_pipe_control(batch,
2557 				       PIPE_CONTROL_CS_STALL |
2558 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
2559 				       0);
2560 
2561 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2562 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2563 	*batch++ = intel_gt_scratch_offset(engine->gt,
2564 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2565 	*batch++ = 0;
2566 
2567 	return batch;
2568 }
2569 
2570 /*
2571  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2572  * initialized at the beginning and shared across all contexts but this field
2573  * helps us to have multiple batches at different offsets and select them based
2574  * on a criteria. At the moment this batch always start at the beginning of the page
2575  * and at this point we don't have multiple wa_ctx batch buffers.
2576  *
2577  * The number of WA applied are not known at the beginning; we use this field
2578  * to return the no of DWORDS written.
2579  *
2580  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2581  * so it adds NOOPs as padding to make it cacheline aligned.
2582  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
2583  * makes a complete batch buffer.
2584  */
2585 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2586 {
2587 	/* WaDisableCtxRestoreArbitration:bdw,chv */
2588 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2589 
2590 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2591 	if (IS_BROADWELL(engine->i915))
2592 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2593 
2594 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2595 	/* Actual scratch location is at 128 bytes offset */
2596 	batch = gen8_emit_pipe_control(batch,
2597 				       PIPE_CONTROL_FLUSH_L3 |
2598 				       PIPE_CONTROL_STORE_DATA_INDEX |
2599 				       PIPE_CONTROL_CS_STALL |
2600 				       PIPE_CONTROL_QW_WRITE,
2601 				       LRC_PPHWSP_SCRATCH_ADDR);
2602 
2603 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2604 
2605 	/* Pad to end of cacheline */
2606 	while ((unsigned long)batch % CACHELINE_BYTES)
2607 		*batch++ = MI_NOOP;
2608 
2609 	/*
2610 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2611 	 * execution depends on the length specified in terms of cache lines
2612 	 * in the register CTX_RCS_INDIRECT_CTX
2613 	 */
2614 
2615 	return batch;
2616 }
2617 
/* One register/value pair, as consumed by emit_lri(). */
struct lri {
	i915_reg_t reg;	/* register whose mmio offset is emitted */
	u32 value;	/* dword emitted right after the offset */
};
2622 
2623 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2624 {
2625 	GEM_BUG_ON(!count || count > 63);
2626 
2627 	*batch++ = MI_LOAD_REGISTER_IMM(count);
2628 	do {
2629 		*batch++ = i915_mmio_reg_offset(lri->reg);
2630 		*batch++ = lri->value;
2631 	} while (lri++, --count);
2632 	*batch++ = MI_NOOP;
2633 
2634 	return batch;
2635 }
2636 
2637 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2638 {
2639 	static const struct lri lri[] = {
2640 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2641 		{
2642 			COMMON_SLICE_CHICKEN2,
2643 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2644 				       0),
2645 		},
2646 
2647 		/* BSpec: 11391 */
2648 		{
2649 			FF_SLICE_CHICKEN,
2650 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2651 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2652 		},
2653 
2654 		/* BSpec: 11299 */
2655 		{
2656 			_3D_CHICKEN3,
2657 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2658 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2659 		}
2660 	};
2661 
2662 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2663 
2664 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2665 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2666 
2667 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2668 
2669 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
2670 	if (HAS_POOLED_EU(engine->i915)) {
2671 		/*
2672 		 * EU pool configuration is setup along with golden context
2673 		 * during context initialization. This value depends on
2674 		 * device type (2x6 or 3x6) and needs to be updated based
2675 		 * on which subslice is disabled especially for 2x6
2676 		 * devices, however it is safe to load default
2677 		 * configuration of 3x6 device instead of masking off
2678 		 * corresponding bits because HW ignores bits of a disabled
2679 		 * subslice and drops down to appropriate config. Please
2680 		 * see render_state_setup() in i915_gem_render_state.c for
2681 		 * possible configurations, to avoid duplication they are
2682 		 * not shown here again.
2683 		 */
2684 		*batch++ = GEN9_MEDIA_POOL_STATE;
2685 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
2686 		*batch++ = 0x00777000;
2687 		*batch++ = 0;
2688 		*batch++ = 0;
2689 		*batch++ = 0;
2690 	}
2691 
2692 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2693 
2694 	/* Pad to end of cacheline */
2695 	while ((unsigned long)batch % CACHELINE_BYTES)
2696 		*batch++ = MI_NOOP;
2697 
2698 	return batch;
2699 }
2700 
2701 static u32 *
2702 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2703 {
2704 	int i;
2705 
2706 	/*
2707 	 * WaPipeControlBefore3DStateSamplePattern: cnl
2708 	 *
2709 	 * Ensure the engine is idle prior to programming a
2710 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2711 	 */
2712 	batch = gen8_emit_pipe_control(batch,
2713 				       PIPE_CONTROL_CS_STALL,
2714 				       0);
2715 	/*
2716 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2717 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2718 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2719 	 * confusing. Since gen8_emit_pipe_control() already advances the
2720 	 * batch by 6 dwords, we advance the other 10 here, completing a
2721 	 * cacheline. It's not clear if the workaround requires this padding
2722 	 * before other commands, or if it's just the regular padding we would
2723 	 * already have for the workaround bb, so leave it here for now.
2724 	 */
2725 	for (i = 0; i < 10; i++)
2726 		*batch++ = MI_NOOP;
2727 
2728 	/* Pad to end of cacheline */
2729 	while ((unsigned long)batch % CACHELINE_BYTES)
2730 		*batch++ = MI_NOOP;
2731 
2732 	return batch;
2733 }
2734 
/* Size of the object backing the context workaround batch buffers. */
#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2736 
2737 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2738 {
2739 	struct drm_i915_gem_object *obj;
2740 	struct i915_vma *vma;
2741 	int err;
2742 
2743 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2744 	if (IS_ERR(obj))
2745 		return PTR_ERR(obj);
2746 
2747 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2748 	if (IS_ERR(vma)) {
2749 		err = PTR_ERR(vma);
2750 		goto err;
2751 	}
2752 
2753 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2754 	if (err)
2755 		goto err;
2756 
2757 	engine->wa_ctx.vma = vma;
2758 	return 0;
2759 
2760 err:
2761 	i915_gem_object_put(obj);
2762 	return err;
2763 }
2764 
2765 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2766 {
2767 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2768 }
2769 
/*
 * Emitter for one workaround batch: writes dwords starting at @batch for
 * @engine and returns the pointer just past what it wrote.
 */
typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2771 
2772 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2773 {
2774 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2775 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2776 					    &wa_ctx->per_ctx };
2777 	wa_bb_func_t wa_bb_fn[2];
2778 	struct page *page;
2779 	void *batch, *batch_ptr;
2780 	unsigned int i;
2781 	int ret;
2782 
2783 	if (engine->class != RENDER_CLASS)
2784 		return 0;
2785 
2786 	switch (INTEL_GEN(engine->i915)) {
2787 	case 12:
2788 	case 11:
2789 		return 0;
2790 	case 10:
2791 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
2792 		wa_bb_fn[1] = NULL;
2793 		break;
2794 	case 9:
2795 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
2796 		wa_bb_fn[1] = NULL;
2797 		break;
2798 	case 8:
2799 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
2800 		wa_bb_fn[1] = NULL;
2801 		break;
2802 	default:
2803 		MISSING_CASE(INTEL_GEN(engine->i915));
2804 		return 0;
2805 	}
2806 
2807 	ret = lrc_setup_wa_ctx(engine);
2808 	if (ret) {
2809 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2810 		return ret;
2811 	}
2812 
2813 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2814 	batch = batch_ptr = kmap_atomic(page);
2815 
2816 	/*
2817 	 * Emit the two workaround batch buffers, recording the offset from the
2818 	 * start of the workaround batch buffer object for each and their
2819 	 * respective sizes.
2820 	 */
2821 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2822 		wa_bb[i]->offset = batch_ptr - batch;
2823 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2824 						  CACHELINE_BYTES))) {
2825 			ret = -EINVAL;
2826 			break;
2827 		}
2828 		if (wa_bb_fn[i])
2829 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2830 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2831 	}
2832 
2833 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2834 
2835 	kunmap_atomic(batch);
2836 	if (ret)
2837 		lrc_destroy_wa_ctx(engine);
2838 
2839 	return ret;
2840 }
2841 
2842 static void enable_execlists(struct intel_engine_cs *engine)
2843 {
2844 	u32 mode;
2845 
2846 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2847 
2848 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2849 
2850 	if (INTEL_GEN(engine->i915) >= 11)
2851 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2852 	else
2853 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2854 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2855 
2856 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2857 
2858 	ENGINE_WRITE_FW(engine,
2859 			RING_HWS_PGA,
2860 			i915_ggtt_offset(engine->status_page.vma));
2861 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2862 }
2863 
2864 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2865 {
2866 	bool unexpected = false;
2867 
2868 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2869 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2870 		unexpected = true;
2871 	}
2872 
2873 	return unexpected;
2874 }
2875 
2876 static int execlists_resume(struct intel_engine_cs *engine)
2877 {
2878 	intel_engine_apply_workarounds(engine);
2879 	intel_engine_apply_whitelist(engine);
2880 
2881 	intel_mocs_init_engine(engine);
2882 
2883 	intel_engine_reset_breadcrumbs(engine);
2884 
2885 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2886 		struct drm_printer p = drm_debug_printer(__func__);
2887 
2888 		intel_engine_dump(engine, &p, NULL);
2889 	}
2890 
2891 	enable_execlists(engine);
2892 
2893 	return 0;
2894 }
2895 
2896 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2897 {
2898 	struct intel_engine_execlists * const execlists = &engine->execlists;
2899 	unsigned long flags;
2900 
2901 	GEM_TRACE("%s: depth<-%d\n", engine->name,
2902 		  atomic_read(&execlists->tasklet.count));
2903 
2904 	/*
2905 	 * Prevent request submission to the hardware until we have
2906 	 * completed the reset in i915_gem_reset_finish(). If a request
2907 	 * is completed by one engine, it may then queue a request
2908 	 * to a second via its execlists->tasklet *just* as we are
2909 	 * calling engine->resume() and also writing the ELSP.
2910 	 * Turning off the execlists->tasklet until the reset is over
2911 	 * prevents the race.
2912 	 */
2913 	__tasklet_disable_sync_once(&execlists->tasklet);
2914 	GEM_BUG_ON(!reset_in_progress(execlists));
2915 
2916 	/* And flush any current direct submission. */
2917 	spin_lock_irqsave(&engine->active.lock, flags);
2918 	spin_unlock_irqrestore(&engine->active.lock, flags);
2919 
2920 	/*
2921 	 * We stop engines, otherwise we might get failed reset and a
2922 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
2923 	 * from system hang if batchbuffer is progressing when
2924 	 * the reset is issued, regardless of READY_TO_RESET ack.
2925 	 * Thus assume it is best to stop engines on all gens
2926 	 * where we have a gpu reset.
2927 	 *
2928 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2929 	 *
2930 	 * FIXME: Wa for more modern gens needs to be validated
2931 	 */
2932 	intel_engine_stop_cs(engine);
2933 }
2934 
2935 static void reset_csb_pointers(struct intel_engine_cs *engine)
2936 {
2937 	struct intel_engine_execlists * const execlists = &engine->execlists;
2938 	const unsigned int reset_value = execlists->csb_size - 1;
2939 
2940 	ring_set_paused(engine, 0);
2941 
2942 	/*
2943 	 * After a reset, the HW starts writing into CSB entry [0]. We
2944 	 * therefore have to set our HEAD pointer back one entry so that
2945 	 * the *first* entry we check is entry 0. To complicate this further,
2946 	 * as we don't wait for the first interrupt after reset, we have to
2947 	 * fake the HW write to point back to the last entry so that our
2948 	 * inline comparison of our cached head position against the last HW
2949 	 * write works even before the first interrupt.
2950 	 */
2951 	execlists->csb_head = reset_value;
2952 	WRITE_ONCE(*execlists->csb_write, reset_value);
2953 	wmb(); /* Make sure this is visible to HW (paranoia?) */
2954 
2955 	invalidate_csb_entries(&execlists->csb_status[0],
2956 			       &execlists->csb_status[reset_value]);
2957 }
2958 
2959 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
2960 {
2961 	if (INTEL_GEN(engine->i915) >= 12)
2962 		return 0x60;
2963 	else if (INTEL_GEN(engine->i915) >= 9)
2964 		return 0x54;
2965 	else if (engine->class == RENDER_CLASS)
2966 		return 0x58;
2967 	else
2968 		return -1;
2969 }
2970 
2971 static void __execlists_reset_reg_state(const struct intel_context *ce,
2972 					const struct intel_engine_cs *engine)
2973 {
2974 	u32 *regs = ce->lrc_reg_state;
2975 	int x;
2976 
2977 	x = lrc_ring_mi_mode(engine);
2978 	if (x != -1) {
2979 		regs[x + 1] &= ~STOP_RING;
2980 		regs[x + 1] |= STOP_RING << 16;
2981 	}
2982 }
2983 
/*
 * Resynchronise the software submission state of @engine after a GPU reset.
 *
 * Steps, in order:
 *  - drain outstanding CSB events via process_csb() and reset the CSB
 *    pointers;
 *  - look up the request that was active (execlists_active()); if there is
 *    none, skip straight to unwinding;
 *  - fix up the context's ring head: to the request's tail if that request
 *    had already completed, otherwise to the head of the request returned
 *    by active_request();
 *  - if the request had started, call __i915_request_reset(rq, stalled) and,
 *    when @stalled is true, rebuild the context image with
 *    restore_default_state();
 *  - refresh the ring space and the context register state and force a
 *    context restore on the next submission (CTX_DESC_FORCE_RESTORE);
 *  - cancel the port requests and unwind incomplete requests for replay.
 *
 * @stalled: forwarded to __i915_request_reset(); selects whether the
 *           context image is rebuilt from the default state.
 *
 * Both callers visible in this file (execlists_reset() and
 * execlists_cancel_requests()) hold engine->active.lock with interrupts
 * disabled around this call.
 */
2984 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2985 {
2986 	struct intel_engine_execlists * const execlists = &engine->execlists;
2987 	struct intel_context *ce;
2988 	struct i915_request *rq;
2989 
2990 	mb(); /* paranoia: read the CSB pointers from after the reset */
2991 	clflush(execlists->csb_write);
2992 	mb();
2993 
2994 	process_csb(engine); /* drain preemption events */
2995 
2996 	/* Following the reset, we need to reload the CSB read/write pointers */
2997 	reset_csb_pointers(engine);
2998 
2999 	/*
3000 	 * Save the currently executing context, even if we completed
3001 	 * its request, it was still running at the time of the
3002 	 * reset and will have been clobbered.
3003 	 */
3004 	rq = execlists_active(execlists);
3005 	if (!rq)
3006 		goto unwind;
3007 
3008 	/* We still have requests in-flight; the engine should be active */
3009 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3010 
3011 	ce = rq->hw_context;
3012 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3013 
3014 	if (i915_request_completed(rq)) {
3015 		/* Idle context; tidy up the ring so we can restart afresh */
3016 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3017 		goto out_replay;
3018 	}
3019 
3020 	/* Context has requests still in-flight; it should not be idle! */
3021 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
	/*
	 * NOTE(review): active_request() is defined elsewhere; presumably it
	 * returns the request on this timeline that was actually executing
	 * at the time of the reset -- confirm.
	 */
3022 	rq = active_request(ce->timeline, rq);
3023 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3024 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3025 
3026 	/*
3027 	 * If this request hasn't started yet, e.g. it is waiting on a
3028 	 * semaphore, we need to avoid skipping the request or else we
3029 	 * break the signaling chain. However, if the context is corrupt
3030 	 * the request will not restart and we will be stuck with a wedged
3031 	 * device. It is quite often the case that if we issue a reset
3032 	 * while the GPU is loading the context image, that the context
3033 	 * image becomes corrupt.
3034 	 *
3035 	 * Otherwise, if we have not started yet, the request should replay
3036 	 * perfectly and we do not need to flag the result as being erroneous.
3037 	 */
3038 	if (!i915_request_started(rq))
3039 		goto out_replay;
3040 
3041 	/*
3042 	 * If the request was innocent, we leave the request in the ELSP
3043 	 * and will try to replay it on restarting. The context image may
3044 	 * have been corrupted by the reset, in which case we may have
3045 	 * to service a new GPU hang, but more likely we can continue on
3046 	 * without impact.
3047 	 *
3048 	 * If the request was guilty, we presume the context is corrupt
3049 	 * and have to at least restore the RING register in the context
3050 	 * image back to the expected values to skip over the guilty request.
3051 	 */
3052 	__i915_request_reset(rq, stalled);
3053 	if (!stalled)
3054 		goto out_replay;
3055 
3056 	/*
3057 	 * We want a simple context + ring to execute the breadcrumb update.
3058 	 * We cannot rely on the context being intact across the GPU hang,
3059 	 * so clear it and rebuild just what we need for the breadcrumb.
3060 	 * All pending requests for this context will be zapped, and any
3061 	 * future request will be after userspace has had the opportunity
3062 	 * to recreate its own state.
3063 	 */
3064 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3065 	restore_default_state(ce, engine);
3066 
	/* Reached with ce valid: write the fixed-up ring state into the context. */
3067 out_replay:
3068 	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
3069 		  engine->name, ce->ring->head, ce->ring->tail);
3070 	intel_ring_update_space(ce->ring);
3071 	__execlists_reset_reg_state(ce, engine);
3072 	__execlists_update_reg_state(ce, engine);
3073 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3074 
	/* Also reached directly when no request was active (ce is unset then). */
3075 unwind:
3076 	/* Push back any incomplete requests for replay after the reset. */
3077 	cancel_port_requests(execlists);
3078 	__unwind_incomplete_requests(engine);
3079 }
3080 
3081 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
3082 {
3083 	unsigned long flags;
3084 
3085 	GEM_TRACE("%s\n", engine->name);
3086 
3087 	spin_lock_irqsave(&engine->active.lock, flags);
3088 
3089 	__execlists_reset(engine, stalled);
3090 
3091 	spin_unlock_irqrestore(&engine->active.lock, flags);
3092 }
3093 
/*
 * Tasklet callback installed by execlists_cancel_requests(): the driver is
 * wedged, so no further submission events are processed.
 */
static void nop_submission_tasklet(unsigned long data)
{
	(void)data; /* intentionally unused */
}
3098 
/*
 * engine->cancel_requests() hook: fail every request known to this engine.
 *
 * Under engine->active.lock (irqs off) this:
 *  - resets the submission state via __execlists_reset(engine, true);
 *  - calls mark_eio() on every request on engine->active.requests;
 *  - drains the priority queue, marking each request with mark_eio() and
 *    pushing it through __i915_request_submit();
 *  - does the same for the pending request of every attached virtual
 *    engine;
 *  - resets the queue and its priority hint, and replaces the tasklet
 *    callback with nop_submission_tasklet().
 *
 * The tasklet is expected to be disabled by the caller (asserted below).
 */
3099 static void execlists_cancel_requests(struct intel_engine_cs *engine)
3100 {
3101 	struct intel_engine_execlists * const execlists = &engine->execlists;
3102 	struct i915_request *rq, *rn;
3103 	struct rb_node *rb;
3104 	unsigned long flags;
3105 
3106 	GEM_TRACE("%s\n", engine->name);
3107 
3108 	/*
3109 	 * Before we call engine->cancel_requests(), we should have exclusive
3110 	 * access to the submission state. This is arranged for us by the
3111 	 * caller disabling the interrupt generation, the tasklet and other
3112 	 * threads that may then access the same state, giving us a free hand
3113 	 * to reset state. However, we still need to let lockdep be aware that
3114 	 * we know this state may be accessed in hardirq context, so we
3115 	 * disable the irq around this manipulation and we want to keep
3116 	 * the spinlock focused on its duties and not accidentally conflate
3117 	 * coverage to the submission's irq state. (Similarly, although we
3118 	 * shouldn't need to disable irq around the manipulation of the
3119 	 * submission's irq state, we also wish to remind ourselves that
3120 	 * it is irq state.)
3121 	 */
3122 	spin_lock_irqsave(&engine->active.lock, flags);
3123 
3124 	__execlists_reset(engine, true);
3125 
3126 	/* Mark all executing requests as skipped. */
3127 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3128 		mark_eio(rq);
3129 
3130 	/* Flush the queued requests to the timeline list (for retiring). */
3131 	while ((rb = rb_first_cached(&execlists->queue))) {
3132 		struct i915_priolist *p = to_priolist(rb);
3133 		int i;
3134 
3135 		priolist_for_each_request_consume(rq, rn, p, i) {
3136 			mark_eio(rq);
3137 			__i915_request_submit(rq);
3138 		}
3139 
3140 		rb_erase_cached(&p->node, &execlists->queue);
3141 		i915_priolist_free(p);
3142 	}
3143 
3144 	/* Cancel all attached virtual engines */
3145 	while ((rb = rb_first_cached(&execlists->virtual))) {
3146 		struct virtual_engine *ve =
3147 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3148 
3149 		rb_erase_cached(rb, &execlists->virtual);
3150 		RB_CLEAR_NODE(rb);
3151 
		/* Nested under engine->active.lock, irqs are already off. */
3152 		spin_lock(&ve->base.active.lock);
3153 		rq = fetch_and_zero(&ve->request);
3154 		if (rq) {
3155 			mark_eio(rq);
3156 
			/* Retarget the request at this engine before submitting it. */
3157 			rq->engine = engine;
3158 			__i915_request_submit(rq);
			/* NOTE(review): presumably drops the reference held via ve->request -- confirm. */
3159 			i915_request_put(rq);
3160 
3161 			ve->base.execlists.queue_priority_hint = INT_MIN;
3162 		}
3163 		spin_unlock(&ve->base.active.lock);
3164 	}
3165 
3166 	/* Remaining _unready_ requests will be nop'ed when submitted */
3167 
3168 	execlists->queue_priority_hint = INT_MIN;
3169 	execlists->queue = RB_ROOT_CACHED;
3170 
3171 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3172 	execlists->tasklet.func = nop_submission_tasklet;
3173 
3174 	spin_unlock_irqrestore(&engine->active.lock, flags);
3175 }
3176 
3177 static void execlists_reset_finish(struct intel_engine_cs *engine)
3178 {
3179 	struct intel_engine_execlists * const execlists = &engine->execlists;
3180 
3181 	/*
3182 	 * After a GPU reset, we may have requests to replay. Do so now while
3183 	 * we still have the forcewake to be sure that the GPU is not allowed
3184 	 * to sleep before we restart and reload a context.
3185 	 */
3186 	GEM_BUG_ON(!reset_in_progress(execlists));
3187 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3188 		execlists->tasklet.func(execlists->tasklet.data);
3189 
3190 	if (__tasklet_enable(&execlists->tasklet))
3191 		/* And kick in case we missed a new request submission. */
3192 		tasklet_hi_schedule(&execlists->tasklet);
3193 	GEM_TRACE("%s: depth->%d\n", engine->name,
3194 		  atomic_read(&execlists->tasklet.count));
3195 }
3196 
3197 static int gen8_emit_bb_start(struct i915_request *rq,
3198 			      u64 offset, u32 len,
3199 			      const unsigned int flags)
3200 {
3201 	u32 *cs;
3202 
3203 	cs = intel_ring_begin(rq, 4);
3204 	if (IS_ERR(cs))
3205 		return PTR_ERR(cs);
3206 
3207 	/*
3208 	 * WaDisableCtxRestoreArbitration:bdw,chv
3209 	 *
3210 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3211 	 * particular all the gen that do not need the w/a at all!), if we
3212 	 * took care to make sure that on every switch into this context
3213 	 * (both ordinary and for preemption) that arbitrartion was enabled
3214 	 * we would be fine.  However, for gen8 there is another w/a that
3215 	 * requires us to not preempt inside GPGPU execution, so we keep
3216 	 * arbitration disabled for gen8 batches. Arbitration will be
3217 	 * re-enabled before we close the request
3218 	 * (engine->emit_fini_breadcrumb).
3219 	 */
3220 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3221 
3222 	/* FIXME(BDW+): Address space and security selectors. */
3223 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3224 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3225 	*cs++ = lower_32_bits(offset);
3226 	*cs++ = upper_32_bits(offset);
3227 
3228 	intel_ring_advance(rq, cs);
3229 
3230 	return 0;
3231 }
3232 
3233 static int gen9_emit_bb_start(struct i915_request *rq,
3234 			      u64 offset, u32 len,
3235 			      const unsigned int flags)
3236 {
3237 	u32 *cs;
3238 
3239 	cs = intel_ring_begin(rq, 6);
3240 	if (IS_ERR(cs))
3241 		return PTR_ERR(cs);
3242 
3243 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3244 
3245 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3246 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3247 	*cs++ = lower_32_bits(offset);
3248 	*cs++ = upper_32_bits(offset);
3249 
3250 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3251 	*cs++ = MI_NOOP;
3252 
3253 	intel_ring_advance(rq, cs);
3254 
3255 	return 0;
3256 }
3257 
3258 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3259 {
3260 	ENGINE_WRITE(engine, RING_IMR,
3261 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3262 	ENGINE_POSTING_READ(engine, RING_IMR);
3263 }
3264 
3265 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3266 {
3267 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3268 }
3269 
3270 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3271 {
3272 	u32 cmd, *cs;
3273 
3274 	cs = intel_ring_begin(request, 4);
3275 	if (IS_ERR(cs))
3276 		return PTR_ERR(cs);
3277 
3278 	cmd = MI_FLUSH_DW + 1;
3279 
3280 	/* We always require a command barrier so that subsequent
3281 	 * commands, such as breadcrumb interrupts, are strictly ordered
3282 	 * wrt the contents of the write cache being flushed to memory
3283 	 * (and thus being coherent from the CPU).
3284 	 */
3285 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3286 
3287 	if (mode & EMIT_INVALIDATE) {
3288 		cmd |= MI_INVALIDATE_TLB;
3289 		if (request->engine->class == VIDEO_DECODE_CLASS)
3290 			cmd |= MI_INVALIDATE_BSD;
3291 	}
3292 
3293 	*cs++ = cmd;
3294 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3295 	*cs++ = 0; /* upper addr */
3296 	*cs++ = 0; /* value */
3297 	intel_ring_advance(request, cs);
3298 
3299 	return 0;
3300 }
3301 
3302 static int gen8_emit_flush_render(struct i915_request *request,
3303 				  u32 mode)
3304 {
3305 	bool vf_flush_wa = false, dc_flush_wa = false;
3306 	u32 *cs, flags = 0;
3307 	int len;
3308 
3309 	flags |= PIPE_CONTROL_CS_STALL;
3310 
3311 	if (mode & EMIT_FLUSH) {
3312 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3313 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3314 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3315 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3316 	}
3317 
3318 	if (mode & EMIT_INVALIDATE) {
3319 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3320 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3321 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3322 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3323 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3324 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3325 		flags |= PIPE_CONTROL_QW_WRITE;
3326 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3327 
3328 		/*
3329 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3330 		 * pipe control.
3331 		 */
3332 		if (IS_GEN(request->i915, 9))
3333 			vf_flush_wa = true;
3334 
3335 		/* WaForGAMHang:kbl */
3336 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3337 			dc_flush_wa = true;
3338 	}
3339 
3340 	len = 6;
3341 
3342 	if (vf_flush_wa)
3343 		len += 6;
3344 
3345 	if (dc_flush_wa)
3346 		len += 12;
3347 
3348 	cs = intel_ring_begin(request, len);
3349 	if (IS_ERR(cs))
3350 		return PTR_ERR(cs);
3351 
3352 	if (vf_flush_wa)
3353 		cs = gen8_emit_pipe_control(cs, 0, 0);
3354 
3355 	if (dc_flush_wa)
3356 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3357 					    0);
3358 
3359 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3360 
3361 	if (dc_flush_wa)
3362 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3363 
3364 	intel_ring_advance(request, cs);
3365 
3366 	return 0;
3367 }
3368 
3369 static int gen11_emit_flush_render(struct i915_request *request,
3370 				   u32 mode)
3371 {
3372 	if (mode & EMIT_FLUSH) {
3373 		u32 *cs;
3374 		u32 flags = 0;
3375 
3376 		flags |= PIPE_CONTROL_CS_STALL;
3377 
3378 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3379 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3380 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3381 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3382 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3383 		flags |= PIPE_CONTROL_QW_WRITE;
3384 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3385 
3386 		cs = intel_ring_begin(request, 6);
3387 		if (IS_ERR(cs))
3388 			return PTR_ERR(cs);
3389 
3390 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3391 		intel_ring_advance(request, cs);
3392 	}
3393 
3394 	if (mode & EMIT_INVALIDATE) {
3395 		u32 *cs;
3396 		u32 flags = 0;
3397 
3398 		flags |= PIPE_CONTROL_CS_STALL;
3399 
3400 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3401 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3402 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3403 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3404 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3405 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3406 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3407 		flags |= PIPE_CONTROL_QW_WRITE;
3408 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3409 
3410 		cs = intel_ring_begin(request, 6);
3411 		if (IS_ERR(cs))
3412 			return PTR_ERR(cs);
3413 
3414 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3415 		intel_ring_advance(request, cs);
3416 	}
3417 
3418 	return 0;
3419 }
3420 
3421 static u32 preparser_disable(bool state)
3422 {
3423 	return MI_ARB_CHECK | 1 << 8 | state;
3424 }
3425 
3426 static int gen12_emit_flush_render(struct i915_request *request,
3427 				   u32 mode)
3428 {
3429 	if (mode & EMIT_FLUSH) {
3430 		u32 flags = 0;
3431 		u32 *cs;
3432 
3433 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3434 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3435 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3436 		/* Wa_1409600907:tgl */
3437 		flags |= PIPE_CONTROL_DEPTH_STALL;
3438 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3439 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3440 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3441 
3442 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3443 		flags |= PIPE_CONTROL_QW_WRITE;
3444 
3445 		flags |= PIPE_CONTROL_CS_STALL;
3446 
3447 		cs = intel_ring_begin(request, 6);
3448 		if (IS_ERR(cs))
3449 			return PTR_ERR(cs);
3450 
3451 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3452 		intel_ring_advance(request, cs);
3453 	}
3454 
3455 	if (mode & EMIT_INVALIDATE) {
3456 		u32 flags = 0;
3457 		u32 *cs;
3458 
3459 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3460 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3461 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3462 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3463 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3464 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3465 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3466 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3467 
3468 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3469 		flags |= PIPE_CONTROL_QW_WRITE;
3470 
3471 		flags |= PIPE_CONTROL_CS_STALL;
3472 
3473 		cs = intel_ring_begin(request, 8);
3474 		if (IS_ERR(cs))
3475 			return PTR_ERR(cs);
3476 
3477 		/*
3478 		 * Prevent the pre-parser from skipping past the TLB
3479 		 * invalidate and loading a stale page for the batch
3480 		 * buffer / request payload.
3481 		 */
3482 		*cs++ = preparser_disable(true);
3483 
3484 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3485 
3486 		*cs++ = preparser_disable(false);
3487 		intel_ring_advance(request, cs);
3488 
3489 		/*
3490 		 * Wa_1604544889:tgl
3491 		 */
3492 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3493 			flags = 0;
3494 			flags |= PIPE_CONTROL_CS_STALL;
3495 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3496 
3497 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3498 			flags |= PIPE_CONTROL_QW_WRITE;
3499 
3500 			cs = intel_ring_begin(request, 6);
3501 			if (IS_ERR(cs))
3502 				return PTR_ERR(cs);
3503 
3504 			cs = gen8_emit_pipe_control(cs, flags,
3505 						    LRC_PPHWSP_SCRATCH_ADDR);
3506 			intel_ring_advance(request, cs);
3507 		}
3508 	}
3509 
3510 	return 0;
3511 }
3512 
3513 /*
3514  * Reserve space for 2 NOOPs at the end of each request to be
3515  * used as a workaround for not being allowed to do lite
3516  * restore with HEAD==TAIL (WaIdleLiteRestore).
3517  */
3518 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3519 {
3520 	/* Ensure there's always at least one preemption point per-request. */
3521 	*cs++ = MI_ARB_CHECK;
3522 	*cs++ = MI_NOOP;
3523 	request->wa_tail = intel_ring_offset(request, cs);
3524 
3525 	return cs;
3526 }
3527 
3528 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3529 {
3530 	*cs++ = MI_SEMAPHORE_WAIT |
3531 		MI_SEMAPHORE_GLOBAL_GTT |
3532 		MI_SEMAPHORE_POLL |
3533 		MI_SEMAPHORE_SAD_EQ_SDD;
3534 	*cs++ = 0;
3535 	*cs++ = intel_hws_preempt_address(request->engine);
3536 	*cs++ = 0;
3537 
3538 	return cs;
3539 }
3540 
3541 static __always_inline u32*
3542 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3543 				 u32 *cs)
3544 {
3545 	*cs++ = MI_USER_INTERRUPT;
3546 
3547 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3548 	if (intel_engine_has_semaphores(request->engine))
3549 		cs = emit_preempt_busywait(request, cs);
3550 
3551 	request->tail = intel_ring_offset(request, cs);
3552 	assert_ring_tail_valid(request->ring, request->tail);
3553 
3554 	return gen8_emit_wa_tail(request, cs);
3555 }
3556 
3557 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3558 {
3559 	cs = gen8_emit_ggtt_write(cs,
3560 				  request->fence.seqno,
3561 				  i915_request_active_timeline(request)->hwsp_offset,
3562 				  0);
3563 
3564 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3565 }
3566 
3567 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3568 {
3569 	cs = gen8_emit_pipe_control(cs,
3570 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3571 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3572 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
3573 				    0);
3574 
3575 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3576 	cs = gen8_emit_ggtt_write_rcs(cs,
3577 				      request->fence.seqno,
3578 				      i915_request_active_timeline(request)->hwsp_offset,
3579 				      PIPE_CONTROL_FLUSH_ENABLE |
3580 				      PIPE_CONTROL_CS_STALL);
3581 
3582 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3583 }
3584 
3585 static u32 *
3586 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3587 {
3588 	cs = gen8_emit_ggtt_write_rcs(cs,
3589 				      request->fence.seqno,
3590 				      i915_request_active_timeline(request)->hwsp_offset,
3591 				      PIPE_CONTROL_CS_STALL |
3592 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3593 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3594 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3595 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3596 				      PIPE_CONTROL_FLUSH_ENABLE);
3597 
3598 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3599 }
3600 
3601 /*
3602  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3603  * flush and will continue pre-fetching the instructions after it before the
3604  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3605  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
3606  * of the next request before the memory has been flushed, we're guaranteed that
3607  * we won't access the batch itself too early.
3608  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3609  * so, if the current request is modifying an instruction in the next request on
3610  * the same intel_context, we might pre-fetch and then execute the pre-update
3611  * instruction. To avoid this, the users of self-modifying code should either
3612  * disable the parser around the code emitting the memory writes, via a new flag
3613  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3614  * the in-kernel use-cases we've opted to use a separate context, see
3615  * reloc_gpu() as an example.
3616  * All the above applies only to the instructions themselves. Non-inline data
3617  * used by the instructions is not pre-fetched.
3618  */
3619 
3620 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3621 {
3622 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3623 		MI_SEMAPHORE_GLOBAL_GTT |
3624 		MI_SEMAPHORE_POLL |
3625 		MI_SEMAPHORE_SAD_EQ_SDD;
3626 	*cs++ = 0;
3627 	*cs++ = intel_hws_preempt_address(request->engine);
3628 	*cs++ = 0;
3629 	*cs++ = 0;
3630 	*cs++ = MI_NOOP;
3631 
3632 	return cs;
3633 }
3634 
3635 static __always_inline u32*
3636 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3637 {
3638 	*cs++ = MI_USER_INTERRUPT;
3639 
3640 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3641 	if (intel_engine_has_semaphores(request->engine))
3642 		cs = gen12_emit_preempt_busywait(request, cs);
3643 
3644 	request->tail = intel_ring_offset(request, cs);
3645 	assert_ring_tail_valid(request->ring, request->tail);
3646 
3647 	return gen8_emit_wa_tail(request, cs);
3648 }
3649 
3650 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3651 {
3652 	cs = gen8_emit_ggtt_write(cs,
3653 				  request->fence.seqno,
3654 				  i915_request_active_timeline(request)->hwsp_offset,
3655 				  0);
3656 
3657 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3658 }
3659 
3660 static u32 *
3661 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3662 {
3663 	cs = gen8_emit_ggtt_write_rcs(cs,
3664 				      request->fence.seqno,
3665 				      i915_request_active_timeline(request)->hwsp_offset,
3666 				      PIPE_CONTROL_CS_STALL |
3667 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3668 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3669 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3670 				      /* Wa_1409600907:tgl */
3671 				      PIPE_CONTROL_DEPTH_STALL |
3672 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3673 				      PIPE_CONTROL_FLUSH_ENABLE |
3674 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3675 
3676 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3677 }
3678 
3679 static void execlists_park(struct intel_engine_cs *engine)
3680 {
3681 	cancel_timer(&engine->execlists.timer);
3682 	cancel_timer(&engine->execlists.preempt);
3683 }
3684 
3685 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3686 {
3687 	engine->submit_request = execlists_submit_request;
3688 	engine->cancel_requests = execlists_cancel_requests;
3689 	engine->schedule = i915_schedule;
3690 	engine->execlists.tasklet.func = execlists_submission_tasklet;
3691 
3692 	engine->reset.prepare = execlists_reset_prepare;
3693 	engine->reset.reset = execlists_reset;
3694 	engine->reset.finish = execlists_reset_finish;
3695 
3696 	engine->park = execlists_park;
3697 	engine->unpark = NULL;
3698 
3699 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3700 	if (!intel_vgpu_active(engine->i915)) {
3701 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3702 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3703 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3704 	}
3705 
3706 	if (INTEL_GEN(engine->i915) >= 12)
3707 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3708 }
3709 
/*
 * engine->destroy() hook: release the common engine state and the
 * workaround batch context, then free the engine structure itself.
 * @engine must not be used after this returns.
 */
static void execlists_destroy(struct intel_engine_cs *engine)
{
	intel_engine_cleanup_common(engine);
	lrc_destroy_wa_ctx(engine);

	kfree(engine);
}
3716 
3717 static void
3718 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3719 {
3720 	/* Default vfuncs which can be overriden by each engine. */
3721 
3722 	engine->destroy = execlists_destroy;
3723 	engine->resume = execlists_resume;
3724 
3725 	engine->reset.prepare = execlists_reset_prepare;
3726 	engine->reset.reset = execlists_reset;
3727 	engine->reset.finish = execlists_reset_finish;
3728 
3729 	engine->cops = &execlists_context_ops;
3730 	engine->request_alloc = execlists_request_alloc;
3731 
3732 	engine->emit_flush = gen8_emit_flush;
3733 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3734 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3735 	if (INTEL_GEN(engine->i915) >= 12)
3736 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3737 
3738 	engine->set_default_submission = intel_execlists_set_default_submission;
3739 
3740 	if (INTEL_GEN(engine->i915) < 11) {
3741 		engine->irq_enable = gen8_logical_ring_enable_irq;
3742 		engine->irq_disable = gen8_logical_ring_disable_irq;
3743 	} else {
3744 		/*
3745 		 * TODO: On Gen11 interrupt masks need to be clear
3746 		 * to allow C6 entry. Keep interrupts enabled at
3747 		 * and take the hit of generating extra interrupts
3748 		 * until a more refined solution exists.
3749 		 */
3750 	}
3751 	if (IS_GEN(engine->i915, 8))
3752 		engine->emit_bb_start = gen8_emit_bb_start;
3753 	else
3754 		engine->emit_bb_start = gen9_emit_bb_start;
3755 }
3756 
3757 static inline void
3758 logical_ring_default_irqs(struct intel_engine_cs *engine)
3759 {
3760 	unsigned int shift = 0;
3761 
3762 	if (INTEL_GEN(engine->i915) < 11) {
3763 		const u8 irq_shifts[] = {
3764 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
3765 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
3766 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3767 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3768 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
3769 		};
3770 
3771 		shift = irq_shifts[engine->id];
3772 	}
3773 
3774 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3775 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3776 }
3777 
3778 static void rcs_submission_override(struct intel_engine_cs *engine)
3779 {
3780 	switch (INTEL_GEN(engine->i915)) {
3781 	case 12:
3782 		engine->emit_flush = gen12_emit_flush_render;
3783 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3784 		break;
3785 	case 11:
3786 		engine->emit_flush = gen11_emit_flush_render;
3787 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3788 		break;
3789 	default:
3790 		engine->emit_flush = gen8_emit_flush_render;
3791 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3792 		break;
3793 	}
3794 }
3795 
3796 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3797 {
3798 	tasklet_init(&engine->execlists.tasklet,
3799 		     execlists_submission_tasklet, (unsigned long)engine);
3800 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3801 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3802 
3803 	logical_ring_default_vfuncs(engine);
3804 	logical_ring_default_irqs(engine);
3805 
3806 	if (engine->class == RENDER_CLASS)
3807 		rcs_submission_override(engine);
3808 
3809 	return 0;
3810 }
3811 
3812 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3813 {
3814 	struct intel_engine_execlists * const execlists = &engine->execlists;
3815 	struct drm_i915_private *i915 = engine->i915;
3816 	struct intel_uncore *uncore = engine->uncore;
3817 	u32 base = engine->mmio_base;
3818 	int ret;
3819 
3820 	ret = intel_engine_init_common(engine);
3821 	if (ret)
3822 		return ret;
3823 
3824 	if (intel_init_workaround_bb(engine))
3825 		/*
3826 		 * We continue even if we fail to initialize WA batch
3827 		 * because we only expect rare glitches but nothing
3828 		 * critical to prevent us from using GPU
3829 		 */
3830 		DRM_ERROR("WA batch buffer initialization failed\n");
3831 
3832 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3833 		execlists->submit_reg = uncore->regs +
3834 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3835 		execlists->ctrl_reg = uncore->regs +
3836 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3837 	} else {
3838 		execlists->submit_reg = uncore->regs +
3839 			i915_mmio_reg_offset(RING_ELSP(base));
3840 	}
3841 
3842 	execlists->csb_status =
3843 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3844 
3845 	execlists->csb_write =
3846 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
3847 
3848 	if (INTEL_GEN(i915) < 11)
3849 		execlists->csb_size = GEN8_CSB_ENTRIES;
3850 	else
3851 		execlists->csb_size = GEN11_CSB_ENTRIES;
3852 
3853 	reset_csb_pointers(engine);
3854 
3855 	return 0;
3856 }
3857 
3858 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
3859 {
3860 	u32 indirect_ctx_offset;
3861 
3862 	switch (INTEL_GEN(engine->i915)) {
3863 	default:
3864 		MISSING_CASE(INTEL_GEN(engine->i915));
3865 		/* fall through */
3866 	case 12:
3867 		indirect_ctx_offset =
3868 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3869 		break;
3870 	case 11:
3871 		indirect_ctx_offset =
3872 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3873 		break;
3874 	case 10:
3875 		indirect_ctx_offset =
3876 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3877 		break;
3878 	case 9:
3879 		indirect_ctx_offset =
3880 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3881 		break;
3882 	case 8:
3883 		indirect_ctx_offset =
3884 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3885 		break;
3886 	}
3887 
3888 	return indirect_ctx_offset;
3889 }
3890 
3891 
3892 static void init_common_reg_state(u32 * const regs,
3893 				  const struct intel_engine_cs *engine,
3894 				  const struct intel_ring *ring)
3895 {
3896 	regs[CTX_CONTEXT_CONTROL] =
3897 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3898 		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
3899 	if (INTEL_GEN(engine->i915) < 11)
3900 		regs[CTX_CONTEXT_CONTROL] |=
3901 			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3902 					    CTX_CTRL_RS_CTX_ENABLE);
3903 
3904 	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3905 	regs[CTX_BB_STATE] = RING_BB_PPGTT;
3906 }
3907 
3908 static void init_wa_bb_reg_state(u32 * const regs,
3909 				 const struct intel_engine_cs *engine,
3910 				 u32 pos_bb_per_ctx)
3911 {
3912 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
3913 
3914 	if (wa_ctx->per_ctx.size) {
3915 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3916 
3917 		regs[pos_bb_per_ctx] =
3918 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3919 	}
3920 
3921 	if (wa_ctx->indirect_ctx.size) {
3922 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3923 
3924 		regs[pos_bb_per_ctx + 2] =
3925 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
3926 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3927 
3928 		regs[pos_bb_per_ctx + 4] =
3929 			intel_lr_indirect_ctx_offset(engine) << 6;
3930 	}
3931 }
3932 
3933 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
3934 {
3935 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
3936 		/* 64b PPGTT (48bit canonical)
3937 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
3938 		 * other PDP Descriptors are ignored.
3939 		 */
3940 		ASSIGN_CTX_PML4(ppgtt, regs);
3941 	} else {
3942 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
3943 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
3944 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
3945 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
3946 	}
3947 }
3948 
3949 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
3950 {
3951 	if (i915_is_ggtt(vm))
3952 		return i915_vm_to_ggtt(vm)->alias;
3953 	else
3954 		return i915_vm_to_ppgtt(vm);
3955 }
3956 
3957 static void execlists_init_reg_state(u32 *regs,
3958 				     const struct intel_context *ce,
3959 				     const struct intel_engine_cs *engine,
3960 				     const struct intel_ring *ring,
3961 				     bool close)
3962 {
3963 	/*
3964 	 * A context is actually a big batch buffer with several
3965 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3966 	 * values we are setting here are only for the first context restore:
3967 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
3968 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3969 	 * we are not initializing here).
3970 	 *
3971 	 * Must keep consistent with virtual_update_register_offsets().
3972 	 */
3973 	u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);
3974 
3975 	if (close) { /* Close the batch; used mainly by live_lrc_layout() */
3976 		*bbe = MI_BATCH_BUFFER_END;
3977 		if (INTEL_GEN(engine->i915) >= 10)
3978 			*bbe |= BIT(0);
3979 	}
3980 
3981 	init_common_reg_state(regs, engine, ring);
3982 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
3983 
3984 	init_wa_bb_reg_state(regs, engine,
3985 			     INTEL_GEN(engine->i915) >= 12 ?
3986 			     GEN12_CTX_BB_PER_CTX_PTR :
3987 			     CTX_BB_PER_CTX_PTR);
3988 }
3989 
3990 static int
3991 populate_lr_context(struct intel_context *ce,
3992 		    struct drm_i915_gem_object *ctx_obj,
3993 		    struct intel_engine_cs *engine,
3994 		    struct intel_ring *ring)
3995 {
3996 	bool inhibit = true;
3997 	void *vaddr;
3998 	u32 *regs;
3999 	int ret;
4000 
4001 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4002 	if (IS_ERR(vaddr)) {
4003 		ret = PTR_ERR(vaddr);
4004 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4005 		return ret;
4006 	}
4007 
4008 	set_redzone(vaddr, engine);
4009 
4010 	if (engine->default_state) {
4011 		void *defaults;
4012 
4013 		defaults = i915_gem_object_pin_map(engine->default_state,
4014 						   I915_MAP_WB);
4015 		if (IS_ERR(defaults)) {
4016 			ret = PTR_ERR(defaults);
4017 			goto err_unpin_ctx;
4018 		}
4019 
4020 		memcpy(vaddr, defaults, engine->context_size);
4021 		i915_gem_object_unpin_map(engine->default_state);
4022 		inhibit = false;
4023 	}
4024 
4025 	/* The second page of the context object contains some fields which must
4026 	 * be set up prior to the first execution. */
4027 	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
4028 	execlists_init_reg_state(regs, ce, engine, ring, inhibit);
4029 	if (inhibit)
4030 		regs[CTX_CONTEXT_CONTROL] |=
4031 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4032 
4033 	ret = 0;
4034 err_unpin_ctx:
4035 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4036 	i915_gem_object_unpin_map(ctx_obj);
4037 	return ret;
4038 }
4039 
4040 static int __execlists_context_alloc(struct intel_context *ce,
4041 				     struct intel_engine_cs *engine)
4042 {
4043 	struct drm_i915_gem_object *ctx_obj;
4044 	struct intel_ring *ring;
4045 	struct i915_vma *vma;
4046 	u32 context_size;
4047 	int ret;
4048 
4049 	GEM_BUG_ON(ce->state);
4050 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4051 
4052 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4053 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4054 
4055 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4056 	if (IS_ERR(ctx_obj))
4057 		return PTR_ERR(ctx_obj);
4058 
4059 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4060 	if (IS_ERR(vma)) {
4061 		ret = PTR_ERR(vma);
4062 		goto error_deref_obj;
4063 	}
4064 
4065 	if (!ce->timeline) {
4066 		struct intel_timeline *tl;
4067 
4068 		tl = intel_timeline_create(engine->gt, NULL);
4069 		if (IS_ERR(tl)) {
4070 			ret = PTR_ERR(tl);
4071 			goto error_deref_obj;
4072 		}
4073 
4074 		ce->timeline = tl;
4075 	}
4076 
4077 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4078 	if (IS_ERR(ring)) {
4079 		ret = PTR_ERR(ring);
4080 		goto error_deref_obj;
4081 	}
4082 
4083 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4084 	if (ret) {
4085 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4086 		goto error_ring_free;
4087 	}
4088 
4089 	ce->ring = ring;
4090 	ce->state = vma;
4091 
4092 	return 0;
4093 
4094 error_ring_free:
4095 	intel_ring_put(ring);
4096 error_deref_obj:
4097 	i915_gem_object_put(ctx_obj);
4098 	return ret;
4099 }
4100 
4101 static struct list_head *virtual_queue(struct virtual_engine *ve)
4102 {
4103 	return &ve->base.execlists.default_priolist.requests[0];
4104 }
4105 
/*
 * kref release callback for a virtual engine's context.
 *
 * Asserts that the virtual engine is idle (empty queue, no pending
 * request, not inflight), unlinks its per-sibling nodes from each
 * sibling's tree of virtual engines, tears down the context state and
 * frees the bonds array and the virtual engine itself.
 */
static void virtual_context_destroy(struct kref *kref)
{
	struct virtual_engine *ve =
		container_of(kref, typeof(*ve), context.ref);
	unsigned int n;

	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
	GEM_BUG_ON(ve->request);
	GEM_BUG_ON(ve->context.inflight);

	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct rb_node *node = &ve->nodes[sibling->id].rb;
		unsigned long flags;

		/* Unlocked fast path: nothing to do if already detached. */
		if (RB_EMPTY_NODE(node))
			continue;

		spin_lock_irqsave(&sibling->active.lock, flags);

		/*
		 * Detachment is lazily performed in the execlists tasklet,
		 * so recheck under the lock before erasing.
		 */
		if (!RB_EMPTY_NODE(node))
			rb_erase_cached(node, &sibling->execlists.virtual);

		spin_unlock_irqrestore(&sibling->active.lock, flags);
	}
	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));

	/* The context image only exists if the context was ever allocated. */
	if (ve->context.state)
		__execlists_context_fini(&ve->context);
	intel_context_fini(&ve->context);

	kfree(ve->bonds);
	kfree(ve);
}
4141 
4142 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4143 {
4144 	int swp;
4145 
4146 	/*
4147 	 * Pick a random sibling on starting to help spread the load around.
4148 	 *
4149 	 * New contexts are typically created with exactly the same order
4150 	 * of siblings, and often started in batches. Due to the way we iterate
4151 	 * the array of sibling when submitting requests, sibling[0] is
4152 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4153 	 * randomised across the system, we also help spread the load by the
4154 	 * first engine we inspect being different each time.
4155 	 *
4156 	 * NB This does not force us to execute on this engine, it will just
4157 	 * typically be the first we inspect for submission.
4158 	 */
4159 	swp = prandom_u32_max(ve->num_siblings);
4160 	if (!swp)
4161 		return;
4162 
4163 	swap(ve->siblings[swp], ve->siblings[0]);
4164 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4165 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4166 						ve->siblings[0]);
4167 }
4168 
4169 static int virtual_context_pin(struct intel_context *ce)
4170 {
4171 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4172 	int err;
4173 
4174 	/* Note: we must use a real engine class for setting up reg state */
4175 	err = __execlists_context_pin(ce, ve->siblings[0]);
4176 	if (err)
4177 		return err;
4178 
4179 	virtual_engine_initial_hint(ve);
4180 	return 0;
4181 }
4182 
4183 static void virtual_context_enter(struct intel_context *ce)
4184 {
4185 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4186 	unsigned int n;
4187 
4188 	for (n = 0; n < ve->num_siblings; n++)
4189 		intel_engine_pm_get(ve->siblings[n]);
4190 
4191 	intel_timeline_enter(ce->timeline);
4192 }
4193 
4194 static void virtual_context_exit(struct intel_context *ce)
4195 {
4196 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4197 	unsigned int n;
4198 
4199 	intel_timeline_exit(ce->timeline);
4200 
4201 	for (n = 0; n < ve->num_siblings; n++)
4202 		intel_engine_pm_put(ve->siblings[n]);
4203 }
4204 
4205 static const struct intel_context_ops virtual_context_ops = {
4206 	.pin = virtual_context_pin,
4207 	.unpin = execlists_context_unpin,
4208 
4209 	.enter = virtual_context_enter,
4210 	.exit = virtual_context_exit,
4211 
4212 	.destroy = virtual_context_destroy,
4213 };
4214 
4215 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4216 {
4217 	struct i915_request *rq;
4218 	intel_engine_mask_t mask;
4219 
4220 	rq = READ_ONCE(ve->request);
4221 	if (!rq)
4222 		return 0;
4223 
4224 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4225 	mask = rq->execution_mask;
4226 	if (unlikely(!mask)) {
4227 		/* Invalid selection, submit to a random engine in error */
4228 		i915_request_skip(rq, -ENODEV);
4229 		mask = ve->siblings[0]->mask;
4230 	}
4231 
4232 	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
4233 		  ve->base.name,
4234 		  rq->fence.context, rq->fence.seqno,
4235 		  mask, ve->base.execlists.queue_priority_hint);
4236 
4237 	return mask;
4238 }
4239 
/*
 * Tasklet of a virtual engine: advertise the pending request to every
 * eligible sibling.
 *
 * For each sibling allowed by the request's execution mask, the virtual
 * engine's node is (re)inserted into that sibling's rbtree of virtual
 * engines, ordered by priority, and the sibling's own tasklet is kicked
 * if this node is now first and raises its priority hint. Siblings not in
 * the mask have the node removed instead. The loop stops early once
 * ve->request is observed to be NULL.
 */
static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	/* Sampled once; used for every sibling visited below. */
	const int prio = ve->base.execlists.queue_priority_hint;
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	/* Interrupts stay off so the plain spin_lock() variants can be used. */
	local_irq_disable();
	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		/* Not allowed on this sibling: make sure we are not queued. */
		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		/*
		 * Find the insertion point: higher priority sorts to the
		 * left, and 'first' stays true only if we never step right,
		 * i.e. the node becomes the leftmost entry.
		 */
		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		/* Only wake the sibling if we are now its best candidate. */
		if (first && prio > sibling->execlists.queue_priority_hint) {
			sibling->execlists.queue_priority_hint = prio;
			tasklet_hi_schedule(&sibling->execlists.tasklet);
		}

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}
4319 
/*
 * submit_request hook of a virtual engine.
 *
 * Under the virtual engine's active lock: retire any previously held
 * request (which must already be completed), then either submit @rq
 * straight away if it has completed, or hold a reference to it as the
 * single pending request, queue it on the virtual queue and schedule the
 * virtual tasklet to offer it to the siblings.
 */
static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	GEM_TRACE("%s: rq=%llx:%lld\n",
		  ve->base.name,
		  rq->fence.context,
		  rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		/* Drop the reference taken when it became ve->request. */
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		/* Nothing pending: reset the hint to its idle value. */
		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		/* The virtual queue holds at most this one request. */
		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}
4359 
4360 static struct ve_bond *
4361 virtual_find_bond(struct virtual_engine *ve,
4362 		  const struct intel_engine_cs *master)
4363 {
4364 	int i;
4365 
4366 	for (i = 0; i < ve->num_bonds; i++) {
4367 		if (ve->bonds[i].master == master)
4368 			return &ve->bonds[i];
4369 	}
4370 
4371 	return NULL;
4372 }
4373 
/*
 * bond_execute hook of a virtual engine: restrict where the bonded
 * request @rq may run based on the engine of the request behind @signal.
 *
 * The allowed set starts as every engine except the signaler's own, and
 * is narrowed to the bond's sibling mask when a bond exists for that
 * engine.
 */
static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/*
	 * Restrict the bonded request to run on only the available engines.
	 * The mask is updated with a cmpxchg retry loop; try_cmpxchg()
	 * refreshes 'exec' with the current value on failure.
	 */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}
4395 
/*
 * Create a context backed by a virtual engine that load-balances across
 * the @count physical engines in @siblings.
 *
 * A @count of 0 is rejected with -EINVAL, and a @count of 1 simply
 * creates an ordinary context on the single sibling. Otherwise a
 * virtual_engine is allocated and set up, every sibling is validated (no
 * duplicates, execlists backend only, all of the same engine class), and
 * the context image is allocated using the first sibling.
 *
 * Returns the new context, or an ERR_PTR() on failure.
 */
struct intel_context *
intel_execlists_create_virtual(struct i915_gem_context *ctx,
			       struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(ctx, siblings[0]);

	/* The siblings[] array is allocated inline at the end of the struct. */
	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = ctx->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	/* Placeholder class until the first sibling is seen in the loop below. */
	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need for this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	/* Provisional name; replaced once the engine class is known. */
	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_breadcrumbs(&ve->base);

	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	/*
	 * From here on, errors are unwound by dropping the context
	 * reference at err_put.
	 */
	intel_context_init(&ve->context, ctx, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out request directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		/*
		 * The zero-initialised node does not yet read as "empty";
		 * mark it so explicitly before it is ever inspected.
		 */
		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		/* First sibling: inherit its class, emitters and flags. */
		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	err = __execlists_context_alloc(&ve->context, siblings[0]);
	if (err)
		goto err_put;

	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);

	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}
4539 
4540 struct intel_context *
4541 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
4542 			      struct intel_engine_cs *src)
4543 {
4544 	struct virtual_engine *se = to_virtual_engine(src);
4545 	struct intel_context *dst;
4546 
4547 	dst = intel_execlists_create_virtual(ctx,
4548 					     se->siblings,
4549 					     se->num_siblings);
4550 	if (IS_ERR(dst))
4551 		return dst;
4552 
4553 	if (se->num_bonds) {
4554 		struct virtual_engine *de = to_virtual_engine(dst->engine);
4555 
4556 		de->bonds = kmemdup(se->bonds,
4557 				    sizeof(*se->bonds) * se->num_bonds,
4558 				    GFP_KERNEL);
4559 		if (!de->bonds) {
4560 			intel_context_put(dst);
4561 			return ERR_PTR(-ENOMEM);
4562 		}
4563 
4564 		de->num_bonds = se->num_bonds;
4565 	}
4566 
4567 	return dst;
4568 }
4569 
4570 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4571 				     const struct intel_engine_cs *master,
4572 				     const struct intel_engine_cs *sibling)
4573 {
4574 	struct virtual_engine *ve = to_virtual_engine(engine);
4575 	struct ve_bond *bond;
4576 	int n;
4577 
4578 	/* Sanity check the sibling is part of the virtual engine */
4579 	for (n = 0; n < ve->num_siblings; n++)
4580 		if (sibling == ve->siblings[n])
4581 			break;
4582 	if (n == ve->num_siblings)
4583 		return -EINVAL;
4584 
4585 	bond = virtual_find_bond(ve, master);
4586 	if (bond) {
4587 		bond->sibling_mask |= sibling->mask;
4588 		return 0;
4589 	}
4590 
4591 	bond = krealloc(ve->bonds,
4592 			sizeof(*bond) * (ve->num_bonds + 1),
4593 			GFP_KERNEL);
4594 	if (!bond)
4595 		return -ENOMEM;
4596 
4597 	bond[ve->num_bonds].master = master;
4598 	bond[ve->num_bonds].sibling_mask = sibling->mask;
4599 
4600 	ve->bonds = bond;
4601 	ve->num_bonds++;
4602 
4603 	return 0;
4604 }
4605 
4606 struct intel_engine_cs *
4607 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4608 				 unsigned int sibling)
4609 {
4610 	struct virtual_engine *ve = to_virtual_engine(engine);
4611 
4612 	if (sibling >= ve->num_siblings)
4613 		return NULL;
4614 
4615 	return ve->siblings[sibling];
4616 }
4617 
/*
 * Dump the requests known to @engine through @show_request, in three
 * groups: executing ("E"), queued ("Q") and pending on virtual engines
 * ("V"). The whole dump runs under the engine's active lock.
 *
 * Each group prints its first @max - 1 requests, then (if there were
 * more) a "...skipping N ..." line when more than @max were seen,
 * followed by the last request of the group.
 *
 * NOTE(review): @max is unsigned, so a @max of 0 makes "max - 1" wrap and
 * every request is printed; confirm callers never pass 0.
 */
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	/* Executing: requests on the engine's active list. */
	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	/* Queued: walk the priority tree, then each priolist's requests. */
	last = NULL;
	count = 0;
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   execlists->queue_priority_hint);
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	/* Virtual: the pending request of each virtual engine queued here. */
	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		/* Shadows the outer rq for the duration of this iteration. */
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}
4700 
4701 void intel_lr_context_reset(struct intel_engine_cs *engine,
4702 			    struct intel_context *ce,
4703 			    u32 head,
4704 			    bool scrub)
4705 {
4706 	GEM_BUG_ON(!intel_context_is_pinned(ce));
4707 
4708 	/*
4709 	 * We want a simple context + ring to execute the breadcrumb update.
4710 	 * We cannot rely on the context being intact across the GPU hang,
4711 	 * so clear it and rebuild just what we need for the breadcrumb.
4712 	 * All pending requests for this context will be zapped, and any
4713 	 * future request will be after userspace has had the opportunity
4714 	 * to recreate its own state.
4715 	 */
4716 	if (scrub)
4717 		restore_default_state(ce, engine);
4718 
4719 	/* Rerun the request; its payload has been neutered (if guilty). */
4720 	ce->ring->head = head;
4721 	intel_ring_update_space(ce->ring);
4722 
4723 	__execlists_update_reg_state(ce, engine);
4724 }
4725 
4726 bool
4727 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4728 {
4729 	return engine->set_default_submission ==
4730 	       intel_execlists_set_default_submission;
4731 }
4732 
4733 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4734 #include "selftest_lrc.c"
4735 #endif
4736