xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 55fd7e02)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
151 
/*
 * Execlist status bits. Note the shift counts are written in hex, so
 * (1 << 0x11) is bit 17 and (1 << 0x12) is bit 18, and (3 << 0xE) covers
 * bits 14-15.
 * NOTE(review): presumably fields of the per-engine execlist status
 * register -- confirm against the register definitions.
 */
#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

/* Gen8 context-status event flags (one bit per event). */
#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

/* Either of the two events (complete or preempted) covered by this mask. */
#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

/* Bit 2 of the 64-bit context descriptor. */
#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
/*
 * True when the SW context ID field (bits 25:15) of the given CSB dword
 * differs from the idle ID (0x7FF, i.e. all 11 bits of the field set).
 */
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 
/*
 * A virtual engine: an engine (base) with its own context that maps onto
 * the set of physical engines listed in siblings[].
 */
struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed onto a
	 * physical engine so as to maximise load spreading (by virtue of the
	 * late greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds; /* NOTE(review): presumably the length of bonds[] -- confirm */

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[]; /* flexible array, must stay last */
};
222 
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 	return container_of(engine, struct virtual_engine, base);
227 }
228 
/*
 * Forward declarations of static helpers that are referenced before their
 * definitions further down in this file.
 */
static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close);
static void
__execlists_update_reg_state(const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     u32 head);
241 
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
243 {
244 	if (INTEL_GEN(engine->i915) >= 12)
245 		return 0x60;
246 	else if (INTEL_GEN(engine->i915) >= 9)
247 		return 0x54;
248 	else if (engine->class == RENDER_CLASS)
249 		return 0x58;
250 	else
251 		return -1;
252 }
253 
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
255 {
256 	if (INTEL_GEN(engine->i915) >= 12)
257 		return 0x74;
258 	else if (INTEL_GEN(engine->i915) >= 9)
259 		return 0x68;
260 	else if (engine->class == RENDER_CLASS)
261 		return 0xd8;
262 	else
263 		return -1;
264 }
265 
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
267 {
268 	if (INTEL_GEN(engine->i915) >= 12)
269 		return 0x12;
270 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 		return 0x18;
272 	else
273 		return -1;
274 }
275 
/*
 * The indirect-context pointer entry sits two slots after the per-context
 * workaround batch entry. A negative value from the lookup is passed
 * through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	const int wa_bb = lrc_ring_wa_bb_per_ctx(engine);

	return wa_bb < 0 ? wa_bb : wa_bb + 2;
}
286 
/*
 * The indirect-context offset entry sits two slots after the
 * indirect-context pointer entry. A negative value from the lookup is
 * passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	const int ptr = lrc_ring_indirect_ptr(engine);

	return ptr < 0 ? ptr : ptr + 2;
}
297 
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
299 {
300 	if (engine->class != RENDER_CLASS)
301 		return -1;
302 
303 	if (INTEL_GEN(engine->i915) >= 12)
304 		return 0xb6;
305 	else if (INTEL_GEN(engine->i915) >= 11)
306 		return 0xaa;
307 	else
308 		return -1;
309 }
310 
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
313 {
314 	switch (INTEL_GEN(engine->i915)) {
315 	default:
316 		MISSING_CASE(INTEL_GEN(engine->i915));
317 		fallthrough;
318 	case 12:
319 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 	case 11:
321 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 	case 10:
323 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 	case 9:
325 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 	case 8:
327 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 	}
329 }
330 
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 			    const struct intel_engine_cs *engine,
334 			    u32 ctx_bb_ggtt_addr,
335 			    u32 size)
336 {
337 	GEM_BUG_ON(!size);
338 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 	regs[lrc_ring_indirect_ptr(engine) + 1] =
341 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
342 
343 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 	regs[lrc_ring_indirect_offset(engine) + 1] =
345 		lrc_ring_indirect_offset_default(engine) << 6;
346 }
347 
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
349 {
350 	/*
351 	 * We can use either ppHWSP[16] which is recorded before the context
352 	 * switch (and so excludes the cost of context switches) or use the
353 	 * value from the context image itself, which is saved/restored earlier
354 	 * and so includes the cost of the save.
355 	 */
356 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
357 }
358 
359 static void mark_eio(struct i915_request *rq)
360 {
361 	if (i915_request_completed(rq))
362 		return;
363 
364 	GEM_BUG_ON(i915_request_signaled(rq));
365 
366 	i915_request_set_error_once(rq, -EIO);
367 	i915_request_mark_complete(rq);
368 }
369 
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
372 {
373 	struct i915_request *active = rq;
374 
375 	rcu_read_lock();
376 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 		if (i915_request_completed(rq))
378 			break;
379 
380 		active = rq;
381 	}
382 	rcu_read_unlock();
383 
384 	return active;
385 }
386 
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
388 {
389 	return (i915_ggtt_offset(engine->status_page.vma) +
390 		I915_GEM_HWS_PREEMPT_ADDR);
391 }
392 
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
395 {
396 	/*
397 	 * We inspect HWS_PREEMPT with a semaphore inside
398 	 * engine->emit_fini_breadcrumb. If the dword is true,
399 	 * the ring is paused as the semaphore will busywait
400 	 * until the dword is false.
401 	 */
402 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 	if (state)
404 		wmb();
405 }
406 
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
408 {
409 	return rb_entry(rb, struct i915_priolist, node);
410 }
411 
412 static inline int rq_prio(const struct i915_request *rq)
413 {
414 	return READ_ONCE(rq->sched.attr.priority);
415 }
416 
417 static int effective_prio(const struct i915_request *rq)
418 {
419 	int prio = rq_prio(rq);
420 
421 	/*
422 	 * If this request is special and must not be interrupted at any
423 	 * cost, so be it. Note we are only checking the most recent request
424 	 * in the context and so may be masking an earlier vip request. It
425 	 * is hoped that under the conditions where nopreempt is used, this
426 	 * will not matter (i.e. all requests to that context will be
427 	 * nopreempt for as long as desired).
428 	 */
429 	if (i915_request_has_nopreempt(rq))
430 		prio = I915_PRIORITY_UNPREEMPTABLE;
431 
432 	return prio;
433 }
434 
435 static int queue_prio(const struct intel_engine_execlists *execlists)
436 {
437 	struct i915_priolist *p;
438 	struct rb_node *rb;
439 
440 	rb = rb_first_cached(&execlists->queue);
441 	if (!rb)
442 		return INT_MIN;
443 
444 	/*
445 	 * As the priolist[] are inverted, with the highest priority in [0],
446 	 * we have to flip the index value to become priority.
447 	 */
448 	p = to_priolist(rb);
449 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
450 }
451 
452 static inline bool need_preempt(const struct intel_engine_cs *engine,
453 				const struct i915_request *rq,
454 				struct rb_node *rb)
455 {
456 	int last_prio;
457 
458 	if (!intel_engine_has_semaphores(engine))
459 		return false;
460 
461 	/*
462 	 * Check if the current priority hint merits a preemption attempt.
463 	 *
464 	 * We record the highest value priority we saw during rescheduling
465 	 * prior to this dequeue, therefore we know that if it is strictly
466 	 * less than the current tail of ESLP[0], we do not need to force
467 	 * a preempt-to-idle cycle.
468 	 *
469 	 * However, the priority hint is a mere hint that we may need to
470 	 * preempt. If that hint is stale or we may be trying to preempt
471 	 * ourselves, ignore the request.
472 	 *
473 	 * More naturally we would write
474 	 *      prio >= max(0, last);
475 	 * except that we wish to prevent triggering preemption at the same
476 	 * priority level: the task that is running should remain running
477 	 * to preserve FIFO ordering of dependencies.
478 	 */
479 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
480 	if (engine->execlists.queue_priority_hint <= last_prio)
481 		return false;
482 
483 	/*
484 	 * Check against the first request in ELSP[1], it will, thanks to the
485 	 * power of PI, be the highest priority of that context.
486 	 */
487 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
488 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
489 		return true;
490 
491 	if (rb) {
492 		struct virtual_engine *ve =
493 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
494 		bool preempt = false;
495 
496 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
497 			struct i915_request *next;
498 
499 			rcu_read_lock();
500 			next = READ_ONCE(ve->request);
501 			if (next)
502 				preempt = rq_prio(next) > last_prio;
503 			rcu_read_unlock();
504 		}
505 
506 		if (preempt)
507 			return preempt;
508 	}
509 
510 	/*
511 	 * If the inflight context did not trigger the preemption, then maybe
512 	 * it was the set of queued requests? Pick the highest priority in
513 	 * the queue (the first active priolist) and see if it deserves to be
514 	 * running instead of ELSP[0].
515 	 *
516 	 * The highest priority request in the queue can not be either
517 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
518 	 * context, it's priority would not exceed ELSP[0] aka last_prio.
519 	 */
520 	return queue_prio(&engine->execlists) > last_prio;
521 }
522 
523 __maybe_unused static inline bool
524 assert_priority_queue(const struct i915_request *prev,
525 		      const struct i915_request *next)
526 {
527 	/*
528 	 * Without preemption, the prev may refer to the still active element
529 	 * which we refuse to let go.
530 	 *
531 	 * Even with preemption, there are times when we think it is better not
532 	 * to preempt and leave an ostensibly lower priority request in flight.
533 	 */
534 	if (i915_request_is_active(prev))
535 		return true;
536 
537 	return rq_prio(prev) >= rq_prio(next);
538 }
539 
540 /*
541  * The context descriptor encodes various attributes of a context,
542  * including its GTT address and some flags. Because it's fairly
543  * expensive to calculate, we'll just do it once and cache the result,
544  * which remains valid until the context is unpinned.
545  *
546  * This is what a descriptor looks like, from LSB to MSB::
547  *
548  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
549  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
550  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
551  *      bits 53-54:    mbz, reserved for use by hardware
552  *      bits 55-63:    group ID, currently unused and set to 0
553  *
554  * Starting from Gen11, the upper dword of the descriptor has a new format:
555  *
556  *      bits 32-36:    reserved
557  *      bits 37-47:    SW context ID
558  *      bits 48-53:    engine instance
559  *      bit 54:        mbz, reserved for use by hardware
560  *      bits 55-60:    SW counter
561  *      bits 61-63:    engine class
562  *
563  * engine info, SW context ID and SW counter need to form a unique number
564  * (Context ID) per lrc.
565  */
566 static u32
567 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
568 {
569 	u32 desc;
570 
571 	desc = INTEL_LEGACY_32B_CONTEXT;
572 	if (i915_vm_is_4lvl(ce->vm))
573 		desc = INTEL_LEGACY_64B_CONTEXT;
574 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
575 
576 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
577 	if (IS_GEN(engine->i915, 8))
578 		desc |= GEN8_CTX_L3LLC_COHERENT;
579 
580 	return i915_ggtt_offset(ce->state) | desc;
581 }
582 
583 static inline unsigned int dword_in_page(void *addr)
584 {
585 	return offset_in_page(addr) / sizeof(u32);
586 }
587 
/*
 * Expand a compact, byte-encoded register table into a register state
 * image made of MI_LOAD_REGISTER_IMM packets.
 *
 * @regs:   destination, written one dword at a time
 * @data:   table built with the NOP/LRI/REG/REG16/END macros below
 * @engine: supplies mmio_base (added to every register offset) and the
 *          generation (selects extra command bits)
 * @clear:  when true, skipped dwords are filled with MI_NOOP, every
 *          register value slot is zeroed and the tail of the state is
 *          cleared and closed; when false only the LRI headers and the
 *          register addresses are written
 *
 * Table encoding, as decoded by the loop below:
 *   NOP(x)       one byte with bit 7 set; low 7 bits = dwords to skip
 *   LRI(n, f)    one byte with bit 7 clear; bit 6 = POSTED flag,
 *                low 6 bits = number of registers that follow (n < 64)
 *   REG(x)       one byte holding x >> 2 (x < 0x200, so bit 7 is clear)
 *   REG16(x)     two bytes: the upper bits of x >> 2 with bit 7 set as a
 *                continuation marker, then the low 7 bits of x >> 2
 *   END(size)    a 0 byte terminating the table, followed by one byte
 *                giving the total state size in dwords
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool clear)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END(total_state_size) 0, (total_state_size)
{
	const u32 base = engine->mmio_base;

	/* A 0 byte is the END terminator. */
	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			if (clear)
				memset32(regs, MI_NOOP, count);
			regs += count;
			continue;
		}

		/* LRI header byte: register count in bits 5:0, flags above. */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (INTEL_GEN(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		/* An LRI byte with flags but no registers is malformed. */
		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/*
			 * Accumulate the offset 7 bits at a time, most
			 * significant group first; bit 7 set means another
			 * byte follows.
			 */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* One (address, value) pair per register. */
			regs[0] = base + (offset << 2);
			if (clear)
				regs[1] = 0;
			regs += 2;
		} while (--count);
	}

	if (clear) {
		/* data points at the terminator; the size byte follows it. */
		u8 count = *++data;

		/* Clear past the tail for HW access */
		GEM_BUG_ON(dword_in_page(regs) > count);
		memset32(regs, MI_NOOP, count - dword_in_page(regs));

		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (INTEL_GEN(engine->i915) >= 10)
			*regs |= BIT(0);
	}
}
656 
657 static const u8 gen8_xcs_offsets[] = {
658 	NOP(1),
659 	LRI(11, 0),
660 	REG16(0x244),
661 	REG(0x034),
662 	REG(0x030),
663 	REG(0x038),
664 	REG(0x03c),
665 	REG(0x168),
666 	REG(0x140),
667 	REG(0x110),
668 	REG(0x11c),
669 	REG(0x114),
670 	REG(0x118),
671 
672 	NOP(9),
673 	LRI(9, 0),
674 	REG16(0x3a8),
675 	REG16(0x28c),
676 	REG16(0x288),
677 	REG16(0x284),
678 	REG16(0x280),
679 	REG16(0x27c),
680 	REG16(0x278),
681 	REG16(0x274),
682 	REG16(0x270),
683 
684 	NOP(13),
685 	LRI(2, 0),
686 	REG16(0x200),
687 	REG(0x028),
688 
689 	END(80)
690 };
691 
692 static const u8 gen9_xcs_offsets[] = {
693 	NOP(1),
694 	LRI(14, POSTED),
695 	REG16(0x244),
696 	REG(0x034),
697 	REG(0x030),
698 	REG(0x038),
699 	REG(0x03c),
700 	REG(0x168),
701 	REG(0x140),
702 	REG(0x110),
703 	REG(0x11c),
704 	REG(0x114),
705 	REG(0x118),
706 	REG(0x1c0),
707 	REG(0x1c4),
708 	REG(0x1c8),
709 
710 	NOP(3),
711 	LRI(9, POSTED),
712 	REG16(0x3a8),
713 	REG16(0x28c),
714 	REG16(0x288),
715 	REG16(0x284),
716 	REG16(0x280),
717 	REG16(0x27c),
718 	REG16(0x278),
719 	REG16(0x274),
720 	REG16(0x270),
721 
722 	NOP(13),
723 	LRI(1, POSTED),
724 	REG16(0x200),
725 
726 	NOP(13),
727 	LRI(44, POSTED),
728 	REG(0x028),
729 	REG(0x09c),
730 	REG(0x0c0),
731 	REG(0x178),
732 	REG(0x17c),
733 	REG16(0x358),
734 	REG(0x170),
735 	REG(0x150),
736 	REG(0x154),
737 	REG(0x158),
738 	REG16(0x41c),
739 	REG16(0x600),
740 	REG16(0x604),
741 	REG16(0x608),
742 	REG16(0x60c),
743 	REG16(0x610),
744 	REG16(0x614),
745 	REG16(0x618),
746 	REG16(0x61c),
747 	REG16(0x620),
748 	REG16(0x624),
749 	REG16(0x628),
750 	REG16(0x62c),
751 	REG16(0x630),
752 	REG16(0x634),
753 	REG16(0x638),
754 	REG16(0x63c),
755 	REG16(0x640),
756 	REG16(0x644),
757 	REG16(0x648),
758 	REG16(0x64c),
759 	REG16(0x650),
760 	REG16(0x654),
761 	REG16(0x658),
762 	REG16(0x65c),
763 	REG16(0x660),
764 	REG16(0x664),
765 	REG16(0x668),
766 	REG16(0x66c),
767 	REG16(0x670),
768 	REG16(0x674),
769 	REG16(0x678),
770 	REG16(0x67c),
771 	REG(0x068),
772 
773 	END(176)
774 };
775 
776 static const u8 gen12_xcs_offsets[] = {
777 	NOP(1),
778 	LRI(13, POSTED),
779 	REG16(0x244),
780 	REG(0x034),
781 	REG(0x030),
782 	REG(0x038),
783 	REG(0x03c),
784 	REG(0x168),
785 	REG(0x140),
786 	REG(0x110),
787 	REG(0x1c0),
788 	REG(0x1c4),
789 	REG(0x1c8),
790 	REG(0x180),
791 	REG16(0x2b4),
792 
793 	NOP(5),
794 	LRI(9, POSTED),
795 	REG16(0x3a8),
796 	REG16(0x28c),
797 	REG16(0x288),
798 	REG16(0x284),
799 	REG16(0x280),
800 	REG16(0x27c),
801 	REG16(0x278),
802 	REG16(0x274),
803 	REG16(0x270),
804 
805 	END(80)
806 };
807 
808 static const u8 gen8_rcs_offsets[] = {
809 	NOP(1),
810 	LRI(14, POSTED),
811 	REG16(0x244),
812 	REG(0x034),
813 	REG(0x030),
814 	REG(0x038),
815 	REG(0x03c),
816 	REG(0x168),
817 	REG(0x140),
818 	REG(0x110),
819 	REG(0x11c),
820 	REG(0x114),
821 	REG(0x118),
822 	REG(0x1c0),
823 	REG(0x1c4),
824 	REG(0x1c8),
825 
826 	NOP(3),
827 	LRI(9, POSTED),
828 	REG16(0x3a8),
829 	REG16(0x28c),
830 	REG16(0x288),
831 	REG16(0x284),
832 	REG16(0x280),
833 	REG16(0x27c),
834 	REG16(0x278),
835 	REG16(0x274),
836 	REG16(0x270),
837 
838 	NOP(13),
839 	LRI(1, 0),
840 	REG(0x0c8),
841 
842 	END(80)
843 };
844 
845 static const u8 gen9_rcs_offsets[] = {
846 	NOP(1),
847 	LRI(14, POSTED),
848 	REG16(0x244),
849 	REG(0x34),
850 	REG(0x30),
851 	REG(0x38),
852 	REG(0x3c),
853 	REG(0x168),
854 	REG(0x140),
855 	REG(0x110),
856 	REG(0x11c),
857 	REG(0x114),
858 	REG(0x118),
859 	REG(0x1c0),
860 	REG(0x1c4),
861 	REG(0x1c8),
862 
863 	NOP(3),
864 	LRI(9, POSTED),
865 	REG16(0x3a8),
866 	REG16(0x28c),
867 	REG16(0x288),
868 	REG16(0x284),
869 	REG16(0x280),
870 	REG16(0x27c),
871 	REG16(0x278),
872 	REG16(0x274),
873 	REG16(0x270),
874 
875 	NOP(13),
876 	LRI(1, 0),
877 	REG(0xc8),
878 
879 	NOP(13),
880 	LRI(44, POSTED),
881 	REG(0x28),
882 	REG(0x9c),
883 	REG(0xc0),
884 	REG(0x178),
885 	REG(0x17c),
886 	REG16(0x358),
887 	REG(0x170),
888 	REG(0x150),
889 	REG(0x154),
890 	REG(0x158),
891 	REG16(0x41c),
892 	REG16(0x600),
893 	REG16(0x604),
894 	REG16(0x608),
895 	REG16(0x60c),
896 	REG16(0x610),
897 	REG16(0x614),
898 	REG16(0x618),
899 	REG16(0x61c),
900 	REG16(0x620),
901 	REG16(0x624),
902 	REG16(0x628),
903 	REG16(0x62c),
904 	REG16(0x630),
905 	REG16(0x634),
906 	REG16(0x638),
907 	REG16(0x63c),
908 	REG16(0x640),
909 	REG16(0x644),
910 	REG16(0x648),
911 	REG16(0x64c),
912 	REG16(0x650),
913 	REG16(0x654),
914 	REG16(0x658),
915 	REG16(0x65c),
916 	REG16(0x660),
917 	REG16(0x664),
918 	REG16(0x668),
919 	REG16(0x66c),
920 	REG16(0x670),
921 	REG16(0x674),
922 	REG16(0x678),
923 	REG16(0x67c),
924 	REG(0x68),
925 
926 	END(176)
927 };
928 
929 static const u8 gen11_rcs_offsets[] = {
930 	NOP(1),
931 	LRI(15, POSTED),
932 	REG16(0x244),
933 	REG(0x034),
934 	REG(0x030),
935 	REG(0x038),
936 	REG(0x03c),
937 	REG(0x168),
938 	REG(0x140),
939 	REG(0x110),
940 	REG(0x11c),
941 	REG(0x114),
942 	REG(0x118),
943 	REG(0x1c0),
944 	REG(0x1c4),
945 	REG(0x1c8),
946 	REG(0x180),
947 
948 	NOP(1),
949 	LRI(9, POSTED),
950 	REG16(0x3a8),
951 	REG16(0x28c),
952 	REG16(0x288),
953 	REG16(0x284),
954 	REG16(0x280),
955 	REG16(0x27c),
956 	REG16(0x278),
957 	REG16(0x274),
958 	REG16(0x270),
959 
960 	LRI(1, POSTED),
961 	REG(0x1b0),
962 
963 	NOP(10),
964 	LRI(1, 0),
965 	REG(0x0c8),
966 
967 	END(80)
968 };
969 
970 static const u8 gen12_rcs_offsets[] = {
971 	NOP(1),
972 	LRI(13, POSTED),
973 	REG16(0x244),
974 	REG(0x034),
975 	REG(0x030),
976 	REG(0x038),
977 	REG(0x03c),
978 	REG(0x168),
979 	REG(0x140),
980 	REG(0x110),
981 	REG(0x1c0),
982 	REG(0x1c4),
983 	REG(0x1c8),
984 	REG(0x180),
985 	REG16(0x2b4),
986 
987 	NOP(5),
988 	LRI(9, POSTED),
989 	REG16(0x3a8),
990 	REG16(0x28c),
991 	REG16(0x288),
992 	REG16(0x284),
993 	REG16(0x280),
994 	REG16(0x27c),
995 	REG16(0x278),
996 	REG16(0x274),
997 	REG16(0x270),
998 
999 	LRI(3, POSTED),
1000 	REG(0x1b0),
1001 	REG16(0x5a8),
1002 	REG16(0x5ac),
1003 
1004 	NOP(6),
1005 	LRI(1, 0),
1006 	REG(0x0c8),
1007 	NOP(3 + 9 + 1),
1008 
1009 	LRI(51, POSTED),
1010 	REG16(0x588),
1011 	REG16(0x588),
1012 	REG16(0x588),
1013 	REG16(0x588),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG(0x028),
1017 	REG(0x09c),
1018 	REG(0x0c0),
1019 	REG(0x178),
1020 	REG(0x17c),
1021 	REG16(0x358),
1022 	REG(0x170),
1023 	REG(0x150),
1024 	REG(0x154),
1025 	REG(0x158),
1026 	REG16(0x41c),
1027 	REG16(0x600),
1028 	REG16(0x604),
1029 	REG16(0x608),
1030 	REG16(0x60c),
1031 	REG16(0x610),
1032 	REG16(0x614),
1033 	REG16(0x618),
1034 	REG16(0x61c),
1035 	REG16(0x620),
1036 	REG16(0x624),
1037 	REG16(0x628),
1038 	REG16(0x62c),
1039 	REG16(0x630),
1040 	REG16(0x634),
1041 	REG16(0x638),
1042 	REG16(0x63c),
1043 	REG16(0x640),
1044 	REG16(0x644),
1045 	REG16(0x648),
1046 	REG16(0x64c),
1047 	REG16(0x650),
1048 	REG16(0x654),
1049 	REG16(0x658),
1050 	REG16(0x65c),
1051 	REG16(0x660),
1052 	REG16(0x664),
1053 	REG16(0x668),
1054 	REG16(0x66c),
1055 	REG16(0x670),
1056 	REG16(0x674),
1057 	REG16(0x678),
1058 	REG16(0x67c),
1059 	REG(0x068),
1060 	REG(0x084),
1061 	NOP(1),
1062 
1063 	END(192)
1064 };
1065 
/*
 * Drop the table-encoding helper macros now that the tables are defined.
 * NOTE(review): POSTED is not #undef'd here and so stays defined for the
 * rest of the file -- confirm whether that is intentional.
 */
#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP
1071 
1072 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1073 {
1074 	/*
1075 	 * The gen12+ lists only have the registers we program in the basic
1076 	 * default state. We rely on the context image using relative
1077 	 * addressing to automatic fixup the register state between the
1078 	 * physical engines for virtual engine.
1079 	 */
1080 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1081 		   !intel_engine_has_relative_mmio(engine));
1082 
1083 	if (engine->class == RENDER_CLASS) {
1084 		if (INTEL_GEN(engine->i915) >= 12)
1085 			return gen12_rcs_offsets;
1086 		else if (INTEL_GEN(engine->i915) >= 11)
1087 			return gen11_rcs_offsets;
1088 		else if (INTEL_GEN(engine->i915) >= 9)
1089 			return gen9_rcs_offsets;
1090 		else
1091 			return gen8_rcs_offsets;
1092 	} else {
1093 		if (INTEL_GEN(engine->i915) >= 12)
1094 			return gen12_xcs_offsets;
1095 		else if (INTEL_GEN(engine->i915) >= 9)
1096 			return gen9_xcs_offsets;
1097 		else
1098 			return gen8_xcs_offsets;
1099 	}
1100 }
1101 
/*
 * Walk engine->active.requests from newest to oldest and take every request
 * that has not yet completed back off the engine.
 *
 * Requests native to @engine (rq->execution_mask == engine->mask) are moved
 * back onto the priority list for their priority level. Any other request is
 * handed back to the engine that owns its context via owner->submit_request().
 *
 * Must be called with engine->active.lock held.
 *
 * Returns the last native request that was unwound (as the walk is in
 * reverse, that is the one closest to the head of the list), or NULL if
 * nothing was unwound or the last request visited was passed back to its
 * owning engine.
 */
static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn, *active = NULL;
	/*
	 * pl is looked up lazily: prio starts out as I915_PRIORITY_INVALID
	 * and the GEM_BUG_ON below asserts no request has that priority, so
	 * the first native request is expected to trigger the lookup.
	 */
	struct list_head *uninitialized_var(pl);
	int prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->active.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->active.requests,
					 sched.link) {
		if (i915_request_completed(rq))
			continue; /* XXX */

		__i915_request_unsubmit(rq);

		/*
		 * Push the request back into the queue for later resubmission.
		 * If this request is not native to this physical engine (i.e.
		 * it came from a virtual source), push it back onto the virtual
		 * engine so that it can be moved across onto another physical
		 * engine as load dictates.
		 */
		if (likely(rq->execution_mask == engine->mask)) {
			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
			/* Only repeat the priolist lookup when the priority changes */
			if (rq_prio(rq) != prio) {
				prio = rq_prio(rq);
				pl = i915_sched_lookup_priolist(engine, prio);
			}
			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

			/* Insert at the head of the priolist and mark it as queued */
			list_move(&rq->sched.link, pl);
			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);

			/* Check in case we rollback so far we wrap [size/2] */
			if (intel_ring_direction(rq->ring,
						 intel_ring_wrap(rq->ring,
								 rq->tail),
						 rq->ring->tail) > 0)
				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;

			active = rq;
		} else {
			struct intel_engine_cs *owner = rq->context->engine;

			/*
			 * Decouple the virtual breadcrumb before moving it
			 * back to the virtual engine -- we don't want the
			 * request to complete in the background and try
			 * and cancel the breadcrumb on the virtual engine
			 * (instead of the old engine where it is linked)!
			 */
			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
				     &rq->fence.flags)) {
				spin_lock_nested(&rq->lock,
						 SINGLE_DEPTH_NESTING);
				i915_request_cancel_breadcrumb(rq);
				spin_unlock(&rq->lock);
			}
			/* Repoint the request at its owner before resubmitting there */
			WRITE_ONCE(rq->engine, owner);
			owner->submit_request(rq);
			active = NULL;
		}
	}

	return active;
}
1170 
1171 struct i915_request *
1172 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1173 {
1174 	struct intel_engine_cs *engine =
1175 		container_of(execlists, typeof(*engine), execlists);
1176 
1177 	return __unwind_incomplete_requests(engine);
1178 }
1179 
1180 static inline void
1181 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1182 {
1183 	/*
1184 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1185 	 * The compiler should eliminate this function as dead-code.
1186 	 */
1187 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1188 		return;
1189 
1190 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1191 				   status, rq);
1192 }
1193 
/*
 * Record in engine->stats that one more context is active on @engine.
 *
 * If stats.active is already non-zero it is simply incremented without
 * taking the lock. Only the 0 -> 1 transition takes stats.lock, so that
 * stats.start is written together with the counter update.
 */
static void intel_engine_context_in(struct intel_engine_cs *engine)
{
	unsigned long flags;

	/* Fast path: bump the count unless it is currently zero */
	if (atomic_add_unless(&engine->stats.active, 1, 0))
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);
	/* Recheck under the lock; if still zero, we start a new busy period */
	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
		engine->stats.start = ktime_get();
		atomic_inc(&engine->stats.active);
	}
	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}
1208 
/*
 * Record in engine->stats that one context is no longer active on @engine.
 *
 * While stats.active is greater than 1 it is simply decremented without
 * taking the lock. The final 1 -> 0 transition takes stats.lock and adds
 * the time elapsed since stats.start to stats.total.
 */
static void intel_engine_context_out(struct intel_engine_cs *engine)
{
	unsigned long flags;

	GEM_BUG_ON(!atomic_read(&engine->stats.active));

	/* Fast path: drop the count unless we hold the last reference */
	if (atomic_add_unless(&engine->stats.active, -1, 1))
		return;

	write_seqlock_irqsave(&engine->stats.lock, flags);
	/* Only accumulate the busy time if the count really reaches zero */
	if (atomic_dec_and_test(&engine->stats.active)) {
		engine->stats.total =
			ktime_add(engine->stats.total,
				  ktime_sub(ktime_get(), engine->stats.start));
	}
	write_sequnlock_irqrestore(&engine->stats.lock, flags);
}
1226 
1227 static void
1228 execlists_check_context(const struct intel_context *ce,
1229 			const struct intel_engine_cs *engine)
1230 {
1231 	const struct intel_ring *ring = ce->ring;
1232 	u32 *regs = ce->lrc_reg_state;
1233 	bool valid = true;
1234 	int x;
1235 
1236 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1237 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1238 		       engine->name,
1239 		       regs[CTX_RING_START],
1240 		       i915_ggtt_offset(ring->vma));
1241 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1242 		valid = false;
1243 	}
1244 
1245 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1246 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1247 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1248 		       engine->name,
1249 		       regs[CTX_RING_CTL],
1250 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1251 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1252 		valid = false;
1253 	}
1254 
1255 	x = lrc_ring_mi_mode(engine);
1256 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1257 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1258 		       engine->name, regs[x + 1]);
1259 		regs[x + 1] &= ~STOP_RING;
1260 		regs[x + 1] |= STOP_RING << 16;
1261 		valid = false;
1262 	}
1263 
1264 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1265 }
1266 
1267 static void restore_default_state(struct intel_context *ce,
1268 				  struct intel_engine_cs *engine)
1269 {
1270 	u32 *regs;
1271 
1272 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1273 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1274 
1275 	ce->runtime.last = intel_context_get_runtime(ce);
1276 }
1277 
1278 static void reset_active(struct i915_request *rq,
1279 			 struct intel_engine_cs *engine)
1280 {
1281 	struct intel_context * const ce = rq->context;
1282 	u32 head;
1283 
1284 	/*
1285 	 * The executing context has been cancelled. We want to prevent
1286 	 * further execution along this context and propagate the error on
1287 	 * to anything depending on its results.
1288 	 *
1289 	 * In __i915_request_submit(), we apply the -EIO and remove the
1290 	 * requests' payloads for any banned requests. But first, we must
1291 	 * rewind the context back to the start of the incomplete request so
1292 	 * that we do not jump back into the middle of the batch.
1293 	 *
1294 	 * We preserve the breadcrumbs and semaphores of the incomplete
1295 	 * requests so that inter-timeline dependencies (i.e other timelines)
1296 	 * remain correctly ordered. And we defer to __i915_request_submit()
1297 	 * so that all asynchronous waits are correctly handled.
1298 	 */
1299 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1300 		     rq->fence.context, rq->fence.seqno);
1301 
1302 	/* On resubmission of the active request, payload will be scrubbed */
1303 	if (i915_request_completed(rq))
1304 		head = rq->tail;
1305 	else
1306 		head = active_request(ce->timeline, rq)->head;
1307 	head = intel_ring_wrap(ce->ring, head);
1308 
1309 	/* Scrub the context image to prevent replaying the previous batch */
1310 	restore_default_state(ce, engine);
1311 	__execlists_update_reg_state(ce, engine, head);
1312 
1313 	/* We've switched away, so this should be a no-op, but intent matters */
1314 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1315 }
1316 
1317 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1318 {
1319 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1320 	ce->runtime.num_underflow += dt < 0;
1321 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1322 #endif
1323 }
1324 
1325 static void intel_context_update_runtime(struct intel_context *ce)
1326 {
1327 	u32 old;
1328 	s32 dt;
1329 
1330 	if (intel_context_is_barrier(ce))
1331 		return;
1332 
1333 	old = ce->runtime.last;
1334 	ce->runtime.last = intel_context_get_runtime(ce);
1335 	dt = ce->runtime.last - old;
1336 
1337 	if (unlikely(dt <= 0)) {
1338 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1339 			 old, ce->runtime.last, dt);
1340 		st_update_runtime_underflow(ce, dt);
1341 		return;
1342 	}
1343 
1344 	ewma_runtime_add(&ce->runtime.avg, dt);
1345 	ce->runtime.total += dt;
1346 }
1347 
/*
 * First-time schedule-in of the context of @rq on rq->engine.
 *
 * Takes a reference on the context, resets it if it has been banned,
 * optionally validates its register state (debug builds), assigns the ccid
 * used for this submission, and then takes the GT wakeref, notifies the
 * context status listeners and updates the engine stats.
 *
 * Returns the engine the context is now inflight on.
 */
static inline struct intel_engine_cs *
__execlists_schedule_in(struct i915_request *rq)
{
	struct intel_engine_cs * const engine = rq->engine;
	struct intel_context * const ce = rq->context;

	intel_context_get(ce);

	if (unlikely(intel_context_is_banned(ce)))
		reset_active(rq, engine);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		execlists_check_context(ce, engine);

	if (ce->tag) {
		/* Use a fixed tag for OA and friends */
		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
		ce->lrc.ccid = ce->tag;
	} else {
		/* We don't need a strict matching tag, just different values */
		/*
		 * Claim the lowest set bit of engine->context_tag as our tag.
		 * NOTE(review): ffs() takes an int, so presumably only the low
		 * 32 bits of context_tag are searched here -- confirm.
		 */
		unsigned int tag = ffs(READ_ONCE(engine->context_tag));

		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
		clear_bit(tag - 1, &engine->context_tag);
		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);

		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
	}

	/* Merge in the per-engine bits of the ccid */
	ce->lrc.ccid |= engine->execlists.ccid;

	__intel_gt_pm_get(engine->gt);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(engine);

	return engine;
}
1385 
/*
 * Mark the context of @rq as inflight for a submission into port @idx.
 *
 * ce->inflight holds the engine the context is inflight on. If it is NULL,
 * this is the first submission and the full __execlists_schedule_in() is
 * performed; otherwise the existing value is advanced with ptr_inc() using
 * a cmpxchg loop.
 *
 * Returns @rq with an additional reference taken.
 */
static inline struct i915_request *
execlists_schedule_in(struct i915_request *rq, int idx)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *old;

	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
	trace_i915_request_in(rq, idx);

	old = READ_ONCE(ce->inflight);
	do {
		if (!old) {
			/* Not yet inflight anywhere: do the full schedule-in */
			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
			break;
		}
	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));

	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
	return i915_request_get(rq);
}
1406 
1407 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1408 {
1409 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1410 	struct i915_request *next = READ_ONCE(ve->request);
1411 
1412 	if (next && next->execution_mask & ~rq->execution_mask)
1413 		tasklet_schedule(&ve->base.execlists.tasklet);
1414 }
1415 
/*
 * Final schedule-out of the context of @rq from @engine, undoing what
 * __execlists_schedule_in() set up.
 *
 * @ccid is the ccid the context was submitted with; if it decodes to one of
 * the engine's dynamically assigned tags, that tag is returned to
 * engine->context_tag. The context runtime and engine stats are updated,
 * listeners are notified, and the GT wakeref and context reference are
 * released.
 */
static inline void
__execlists_schedule_out(struct i915_request *rq,
			 struct intel_engine_cs * const engine,
			 unsigned int ccid)
{
	struct intel_context * const ce = rq->context;

	/*
	 * NB process_csb() is not under the engine->active.lock and hence
	 * schedule_out can race with schedule_in meaning that we should
	 * refrain from doing non-trivial work here.
	 */

	/*
	 * If we have just completed this context, the engine may now be
	 * idle and we want to re-enter powersaving.
	 */
	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
	    i915_request_completed(rq))
		intel_engine_add_retire(engine, ce->timeline);

	/* Extract the tag; values below BITS_PER_LONG map to a context_tag bit */
	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
	ccid &= GEN12_MAX_CONTEXT_HW_ID;
	if (ccid < BITS_PER_LONG) {
		GEM_BUG_ON(ccid == 0);
		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
		set_bit(ccid - 1, &engine->context_tag);
	}

	intel_context_update_runtime(ce);
	intel_engine_context_out(engine);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
	intel_gt_pm_put_async(engine->gt);

	/*
	 * If this is part of a virtual engine, its next request may
	 * have been blocked waiting for access to the active context.
	 * We have to kick all the siblings again in case we need to
	 * switch (e.g. the next request is not runnable on this
	 * engine). Hopefully, we will already have submitted the next
	 * request before the tasklet runs and do not need to rebuild
	 * each virtual tree and kick everyone again.
	 */
	if (ce->engine != engine)
		kick_siblings(rq, ce);

	intel_context_put(ce);
}
1464 
/*
 * Drop one inflight count from the context of @rq and release the request
 * reference taken by execlists_schedule_in().
 *
 * When ce->inflight has no more count bits set it is replaced with NULL and
 * the full __execlists_schedule_out() is performed on the engine it held.
 */
static inline void
execlists_schedule_out(struct i915_request *rq)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *cur, *old;
	u32 ccid;

	trace_i915_request_out(rq);

	/* Sample the ccid before ce->inflight is updated */
	ccid = rq->context->lrc.ccid;
	old = READ_ONCE(ce->inflight);
	do
		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
	while (!try_cmpxchg(&ce->inflight, &old, cur));
	/* cur == NULL means this was the last inflight count; old is the engine */
	if (!cur)
		__execlists_schedule_out(rq, old, ccid);

	i915_request_put(rq);
}
1484 
/*
 * Write the new RING_TAIL for @rq into its context image and return the
 * context descriptor to submit for it.
 *
 * The returned descriptor has CTX_DESC_FORCE_RESTORE set if it was already
 * set on the context, or if the new tail does not advance past the previous
 * ring tail. The flag is cleared from ce->lrc.desc before returning.
 */
static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	u64 desc = ce->lrc.desc;
	u32 tail, prev;

	/*
	 * WaIdleLiteRestore:bdw,skl
	 *
	 * We should never submit the context with the same RING_TAIL twice
	 * just in case we submit an empty ring, which confuses the HW.
	 *
	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
	 * the normal request to be able to always advance the RING_TAIL on
	 * subsequent resubmissions (for lite restore). Should that fail us,
	 * and we try and submit the same tail again, force the context
	 * reload.
	 *
	 * If we need to return to a preempted context, we need to skip the
	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
	 * HW has a tendency to ignore us rewinding the TAIL to the end of
	 * an earlier request.
	 */
	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
	prev = rq->ring->tail;
	tail = intel_ring_set_tail(rq->ring, rq->tail);
	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
		desc |= CTX_DESC_FORCE_RESTORE;
	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
	/* Any later resubmission of this request uses the workaround tail */
	rq->tail = rq->wa_tail;

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, the empirical
	 * evidence (esp. on Braswell) suggests that the WC write into memory
	 * may not be visible to the HW prior to the completion of the UC
	 * register write and that we may begin execution from the context
	 * before its image is complete leading to invalid PD chasing.
	 */
	wmb();

	/* The local desc copy keeps the flag for this submission only */
	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
	return desc;
}
1531 
1532 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1533 {
1534 	if (execlists->ctrl_reg) {
1535 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1536 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1537 	} else {
1538 		writel(upper_32_bits(desc), execlists->submit_reg);
1539 		writel(lower_32_bits(desc), execlists->submit_reg);
1540 	}
1541 }
1542 
1543 static __maybe_unused char *
1544 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1545 {
1546 	if (!rq)
1547 		return "";
1548 
1549 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1550 		 prefix,
1551 		 rq->context->lrc.ccid,
1552 		 rq->fence.context, rq->fence.seqno,
1553 		 i915_request_completed(rq) ? "!" :
1554 		 i915_request_started(rq) ? "*" :
1555 		 "",
1556 		 rq_prio(rq));
1557 
1558 	return buf;
1559 }
1560 
1561 static __maybe_unused void
1562 trace_ports(const struct intel_engine_execlists *execlists,
1563 	    const char *msg,
1564 	    struct i915_request * const *ports)
1565 {
1566 	const struct intel_engine_cs *engine =
1567 		container_of(execlists, typeof(*engine), execlists);
1568 	char __maybe_unused p0[40], p1[40];
1569 
1570 	if (!ports[0])
1571 		return;
1572 
1573 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1574 		     dump_port(p0, sizeof(p0), "", ports[0]),
1575 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1576 }
1577 
1578 static inline bool
1579 reset_in_progress(const struct intel_engine_execlists *execlists)
1580 {
1581 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1582 }
1583 
/*
 * Debug check of execlists->pending[] before it is used; @msg tags the
 * trace output.
 *
 * Verifies that the array is non-empty and properly terminated, that no
 * context or ccid appears in two consecutive ports, that a sentinel request
 * is alone in the first port, and that every incomplete request has an
 * active context with its state and ring pinned.
 *
 * Returns true if everything checks out (or a reset is in progress, in
 * which case nothing is checked), false after logging the first problem.
 */
static __maybe_unused bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);
	struct i915_request * const *port, *rq;
	struct intel_context *ce = NULL;
	bool sentinel = false;
	u32 ccid = -1;

	trace_ports(execlists, msg, execlists->pending);

	/* We may be messing around with the lists during reset, lalala */
	if (reset_in_progress(execlists))
		return true;

	if (!execlists->pending[0]) {
		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
			      engine->name);
		return false;
	}

	/* The slot after the last port must stay NULL to terminate the array */
	if (execlists->pending[execlists_num_ports(execlists)]) {
		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
			      engine->name, execlists_num_ports(execlists));
		return false;
	}

	for (port = execlists->pending; (rq = *port); port++) {
		unsigned long flags;
		bool ok = true;

		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
		GEM_BUG_ON(!i915_request_is_active(rq));

		/* ce still holds the context of the previous port at this point */
		if (ce == rq->context) {
			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ce = rq->context;

		/* Likewise, ccid is that of the previous port */
		if (ccid == ce->lrc.ccid) {
			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
				      engine->name,
				      ccid, ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ccid = ce->lrc.ccid;

		/*
		 * Sentinels are supposed to be lonely so they flush the
		 * current execution off the HW. Check that they are the
		 * only request in the pending submission.
		 */
		if (sentinel) {
			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		sentinel = i915_request_has_sentinel(rq);
		if (sentinel && port != execlists->pending) {
			GEM_TRACE_ERR("%s: sentinel context:%llx not in prime position[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		/* Hold tightly onto the lock to prevent concurrent retires! */
		/* If the lock is contended, the remaining checks are skipped */
		if (!spin_trylock_irqsave(&rq->lock, flags))
			continue;

		/* Completed requests are not checked any further */
		if (i915_request_completed(rq))
			goto unlock;

		if (i915_active_is_idle(&ce->active) &&
		    !intel_context_is_barrier(ce)) {
			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->state)) {
			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->ring->vma)) {
			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

unlock:
		spin_unlock_irqrestore(&rq->lock, flags);
		if (!ok)
			return false;
	}

	/* ce is non-NULL here, as pending[0] was checked above, i.e. true */
	return ce;
}
1703 
1704 static void execlists_submit_ports(struct intel_engine_cs *engine)
1705 {
1706 	struct intel_engine_execlists *execlists = &engine->execlists;
1707 	unsigned int n;
1708 
1709 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1710 
1711 	/*
1712 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1713 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1714 	 * not be relinquished until the device is idle (see
1715 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1716 	 * that all ELSP are drained i.e. we have processed the CSB,
1717 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1718 	 */
1719 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1720 
1721 	/*
1722 	 * ELSQ note: the submit queue is not cleared after being submitted
1723 	 * to the HW so we need to make sure we always clean it up. This is
1724 	 * currently ensured by the fact that we always write the same number
1725 	 * of elsq entries, keep this in mind before changing the loop below.
1726 	 */
1727 	for (n = execlists_num_ports(execlists); n--; ) {
1728 		struct i915_request *rq = execlists->pending[n];
1729 
1730 		write_desc(execlists,
1731 			   rq ? execlists_update_context(rq) : 0,
1732 			   n);
1733 	}
1734 
1735 	/* we need to manually load the submit queue */
1736 	if (execlists->ctrl_reg)
1737 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1738 }
1739 
1740 static bool ctx_single_port_submission(const struct intel_context *ce)
1741 {
1742 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1743 		intel_context_force_single_submission(ce));
1744 }
1745 
1746 static bool can_merge_ctx(const struct intel_context *prev,
1747 			  const struct intel_context *next)
1748 {
1749 	if (prev != next)
1750 		return false;
1751 
1752 	if (ctx_single_port_submission(prev))
1753 		return false;
1754 
1755 	return true;
1756 }
1757 
1758 static unsigned long i915_request_flags(const struct i915_request *rq)
1759 {
1760 	return READ_ONCE(rq->fence.flags);
1761 }
1762 
1763 static bool can_merge_rq(const struct i915_request *prev,
1764 			 const struct i915_request *next)
1765 {
1766 	GEM_BUG_ON(prev == next);
1767 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1768 
1769 	/*
1770 	 * We do not submit known completed requests. Therefore if the next
1771 	 * request is already completed, we can pretend to merge it in
1772 	 * with the previous context (and we will skip updating the ELSP
1773 	 * and tracking). Thus hopefully keeping the ELSP full with active
1774 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1775 	 * us.
1776 	 */
1777 	if (i915_request_completed(next))
1778 		return true;
1779 
1780 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1781 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1782 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1783 		return false;
1784 
1785 	if (!can_merge_ctx(prev->context, next->context))
1786 		return false;
1787 
1788 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1789 	return true;
1790 }
1791 
1792 static void virtual_update_register_offsets(u32 *regs,
1793 					    struct intel_engine_cs *engine)
1794 {
1795 	set_offsets(regs, reg_offsets(engine), engine, false);
1796 }
1797 
1798 static bool virtual_matches(const struct virtual_engine *ve,
1799 			    const struct i915_request *rq,
1800 			    const struct intel_engine_cs *engine)
1801 {
1802 	const struct intel_engine_cs *inflight;
1803 
1804 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1805 		return false;
1806 
1807 	/*
1808 	 * We track when the HW has completed saving the context image
1809 	 * (i.e. when we have seen the final CS event switching out of
1810 	 * the context) and must not overwrite the context image before
1811 	 * then. This restricts us to only using the active engine
1812 	 * while the previous virtualized request is inflight (so
1813 	 * we reuse the register offsets). This is a very small
1814 	 * hystersis on the greedy seelction algorithm.
1815 	 */
1816 	inflight = intel_context_inflight(&ve->context);
1817 	if (inflight && inflight != engine)
1818 		return false;
1819 
1820 	return true;
1821 }
1822 
1823 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1824 {
1825 	/*
1826 	 * All the outstanding signals on ve->siblings[0] must have
1827 	 * been completed, just pending the interrupt handler. As those
1828 	 * signals still refer to the old sibling (via rq->engine), we must
1829 	 * transfer those to the old irq_worker to keep our locking
1830 	 * consistent.
1831 	 */
1832 	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1833 }
1834 
/*
 * Iterate @p__ over every entry on @rq__'s sched.waiters_list (linked via
 * wait_link), using the lockless list walker.
 */
#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)
1839 
/*
 * Iterate @p__ over every entry on @rq__'s sched.signalers_list (linked via
 * signal_link), using the RCU list walker.
 */
#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)
1844 
/*
 * Move @rq to the tail of the priority list @pl, followed by every ready
 * request on the same engine and at the same priority that (transitively)
 * waits upon it.
 *
 * Waiters are collected on a local list and then processed in turn by the
 * same loop, so dependency chains are followed iteratively, not recursively.
 */
static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			/* Weak dependencies do not force the waiter to move */
			if (p->flags & I915_DEPENDENCY_WEAK)
				continue;

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
				   i915_request_started(w) &&
				   !i915_request_completed(rq));

			GEM_BUG_ON(i915_request_is_active(w));
			if (!i915_request_is_ready(w))
				continue;

			/* Lower priority waiters are left where they are */
			if (rq_prio(w) < rq_prio(rq))
				continue;

			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
			/* Queue the waiter so that it is deferred by a later pass */
			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}
1892 
/*
 * Unwind the incomplete requests on @engine and, if that yields an active
 * request, defer it (and its waiters) to the back of its priority level.
 */
static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq = __unwind_incomplete_requests(engine);

	if (rq)
		defer_request(rq,
			      i915_sched_lookup_priolist(engine, rq_prio(rq)));
}
1903 
/*
 * Decide whether the active request @rq on @engine should be timesliced.
 *
 * The highest priority among the queue priority hint, the pending request
 * of the virtual engine at @rb (if any, and if it could run here), and the
 * request following @rq on the engine's active list is compared against
 * the effective priority of @rq.
 *
 * Returns true if something of equal or higher priority is waiting; always
 * false if the engine does not support timeslices.
 */
static bool
need_timeslice(const struct intel_engine_cs *engine,
	       const struct i915_request *rq,
	       const struct rb_node *rb)
{
	int hint;

	if (!intel_engine_has_timeslices(engine))
		return false;

	hint = engine->execlists.queue_priority_hint;

	if (rb) {
		const struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		const struct intel_engine_cs *inflight =
			intel_context_inflight(&ve->context);

		/* Only count the virtual request if it is free to run on this engine */
		if (!inflight || inflight == engine) {
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				hint = max(hint, rq_prio(next));
			rcu_read_unlock();
		}
	}

	if (!list_is_last(&rq->sched.link, &engine->active.requests))
		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));

	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
	return hint >= effective_prio(rq);
}
1939 
1940 static bool
1941 timeslice_yield(const struct intel_engine_execlists *el,
1942 		const struct i915_request *rq)
1943 {
1944 	/*
1945 	 * Once bitten, forever smitten!
1946 	 *
1947 	 * If the active context ever busy-waited on a semaphore,
1948 	 * it will be treated as a hog until the end of its timeslice (i.e.
1949 	 * until it is scheduled out and replaced by a new submission,
1950 	 * possibly even its own lite-restore). The HW only sends an interrupt
1951 	 * on the first miss, and we do know if that semaphore has been
1952 	 * signaled, or even if it is now stuck on another semaphore. Play
1953 	 * safe, yield if it might be stuck -- it will be given a fresh
1954 	 * timeslice in the near future.
1955 	 */
1956 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1957 }
1958 
1959 static bool
1960 timeslice_expired(const struct intel_engine_execlists *el,
1961 		  const struct i915_request *rq)
1962 {
1963 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1964 }
1965 
1966 static int
1967 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1968 {
1969 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1970 		return INT_MIN;
1971 
1972 	return rq_prio(list_next_entry(rq, sched.link));
1973 }
1974 
1975 static inline unsigned long
1976 timeslice(const struct intel_engine_cs *engine)
1977 {
1978 	return READ_ONCE(engine->props.timeslice_duration_ms);
1979 }
1980 
1981 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1982 {
1983 	const struct intel_engine_execlists *execlists = &engine->execlists;
1984 	const struct i915_request *rq = *execlists->active;
1985 
1986 	if (!rq || i915_request_completed(rq))
1987 		return 0;
1988 
1989 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1990 		return 0;
1991 
1992 	return timeslice(engine);
1993 }
1994 
1995 static void set_timeslice(struct intel_engine_cs *engine)
1996 {
1997 	unsigned long duration;
1998 
1999 	if (!intel_engine_has_timeslices(engine))
2000 		return;
2001 
2002 	duration = active_timeslice(engine);
2003 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2004 
2005 	set_timer_ms(&engine->execlists.timer, duration);
2006 }
2007 
2008 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2009 {
2010 	struct intel_engine_execlists *execlists = &engine->execlists;
2011 	unsigned long duration;
2012 
2013 	if (!intel_engine_has_timeslices(engine))
2014 		return;
2015 
2016 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2017 	if (prio == INT_MIN)
2018 		return;
2019 
2020 	if (timer_pending(&execlists->timer))
2021 		return;
2022 
2023 	duration = timeslice(engine);
2024 	ENGINE_TRACE(engine,
2025 		     "start timeslicing, prio:%d, interval:%lu",
2026 		     prio, duration);
2027 
2028 	set_timer_ms(&execlists->timer, duration);
2029 }
2030 
/*
 * Count a preemption in execlists->preempt_hang.count. The increment is
 * wrapped in I915_SELFTEST_ONLY(), so presumably it only takes effect in
 * selftest builds -- confirm against the macro definition.
 */
static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}
2035 
2036 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2037 					    const struct i915_request *rq)
2038 {
2039 	if (!rq)
2040 		return 0;
2041 
2042 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2043 	if (unlikely(intel_context_is_banned(rq->context)))
2044 		return 1;
2045 
2046 	return READ_ONCE(engine->props.preempt_timeout_ms);
2047 }
2048 
2049 static void set_preempt_timeout(struct intel_engine_cs *engine,
2050 				const struct i915_request *rq)
2051 {
2052 	if (!intel_engine_has_preempt_reset(engine))
2053 		return;
2054 
2055 	set_timer_ms(&engine->execlists.preempt,
2056 		     active_preempt_timeout(engine, rq));
2057 }
2058 
2059 static inline void clear_ports(struct i915_request **ports, int count)
2060 {
2061 	memset_p((void **)ports, NULL, count);
2062 }
2063 
/*
 * execlists_dequeue() - choose the next requests for the ELSP and submit them.
 *
 * Called from __execlists_submission_tasklet() with engine->active.lock held,
 * inside an RCU read-side section, and only while pending[0] is NULL.
 *
 * In order, it:
 *  1. finds the first virtual-engine request usable on @engine, erasing
 *     rbtree nodes whose ve->request has already become NULL;
 *  2. inspects the head of execlists->active: unwinds the incomplete
 *     requests for preemption, defers the active request when its timeslice
 *     expired, or returns early (after start_timeslice()) when another
 *     request is already queued behind the active one;
 *  3. moves the chosen virtual request, then requests from the priority
 *     queue, into execlists->pending[], keeping consecutive requests that
 *     can_merge_rq() accepts in the same port;
 *  4. refreshes queue_priority_hint and, if anything was submitted, writes
 *     the ports through execlists_submit_ports(). If nothing was submitted,
 *     or pending[] turned out identical to active, the ring is un-paused
 *     instead.
 */
2064 static void execlists_dequeue(struct intel_engine_cs *engine)
2065 {
2066 	struct intel_engine_execlists * const execlists = &engine->execlists;
2067 	struct i915_request **port = execlists->pending; /* next pending[] slot to fill */
2068 	struct i915_request ** const last_port = port + execlists->port_mask; /* final usable slot */
2069 	struct i915_request * const *active;
2070 	struct i915_request *last; /* request currently heading the port being built, or NULL */
2071 	struct rb_node *rb;
2072 	bool submit = false; /* set once __i915_request_submit() accepted a request */
2073 
2074 	/*
2075 	 * Hardware submission is through 2 ports. Conceptually each port
2076 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2077 	 * static for a context, and unique to each, so we only execute
2078 	 * requests belonging to a single context from each ring. RING_HEAD
2079 	 * is maintained by the CS in the context image, it marks the place
2080 	 * where it got up to last time, and through RING_TAIL we tell the CS
2081 	 * where we want to execute up to this time.
2082 	 *
2083 	 * In this list the requests are in order of execution. Consecutive
2084 	 * requests from the same context are adjacent in the ringbuffer. We
2085 	 * can combine these requests into a single RING_TAIL update:
2086 	 *
2087 	 *              RING_HEAD...req1...req2
2088 	 *                                    ^- RING_TAIL
2089 	 * since to execute req2 the CS must first execute req1.
2090 	 *
2091 	 * Our goal then is to point each port to the end of a consecutive
2092 	 * sequence of requests as being the most optimal (fewest wake ups
2093 	 * and context switches) submission.
2094 	 */
2095 
	/*
	 * Locate the first virtual engine whose request matches this engine.
	 * On exit rb points at that node, or is NULL if there is none.
	 */
2096 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2097 		struct virtual_engine *ve =
2098 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2099 		struct i915_request *rq = READ_ONCE(ve->request);
2100 
2101 		if (!rq) { /* lazily cleanup after another engine handled rq */
2102 			rb_erase_cached(rb, &execlists->virtual);
2103 			RB_CLEAR_NODE(rb);
2104 			rb = rb_first_cached(&execlists->virtual);
2105 			continue;
2106 		}
2107 
2108 		if (!virtual_matches(ve, rq, engine)) {
2109 			rb = rb_next(rb);
2110 			continue;
2111 		}
2112 
2113 		break;
2114 	}
2115 
2116 	/*
2117 	 * If the queue is higher priority than the last
2118 	 * request in the currently active context, submit afresh.
2119 	 * We will resubmit again afterwards in case we need to split
2120 	 * the active context to interject the preemption request,
2121 	 * i.e. we will retrigger preemption following the ack in case
2122 	 * of trouble.
2123 	 */
2124 	active = READ_ONCE(execlists->active);
2125 
2126 	/*
2127 	 * In theory we can skip over completed contexts that have not
2128 	 * yet been processed by events (as those events are in flight):
2129 	 *
2130 	 * while ((last = *active) && i915_request_completed(last))
2131 	 *	active++;
2132 	 *
2133 	 * However, the GPU cannot handle this as it will ultimately
2134 	 * find itself trying to jump back into a context it has just
2135 	 * completed and barf.
2136 	 */
2137 
2138 	if ((last = *active)) {
2139 		if (need_preempt(engine, last, rb)) {
			/* Already finished: just rerun the tasklet instead of preempting */
2140 			if (i915_request_completed(last)) {
2141 				tasklet_hi_schedule(&execlists->tasklet);
2142 				return;
2143 			}
2144 
2145 			ENGINE_TRACE(engine,
2146 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2147 				     last->fence.context,
2148 				     last->fence.seqno,
2149 				     last->sched.attr.priority,
2150 				     execlists->queue_priority_hint);
2151 			record_preemption(execlists);
2152 
2153 			/*
2154 			 * Don't let the RING_HEAD advance past the breadcrumb
2155 			 * as we unwind (and until we resubmit) so that we do
2156 			 * not accidentally tell it to go backwards.
2157 			 */
2158 			ring_set_paused(engine, 1);
2159 
2160 			/*
2161 			 * Note that we have not stopped the GPU at this point,
2162 			 * so we are unwinding the incomplete requests as they
2163 			 * remain inflight and so by the time we do complete
2164 			 * the preemption, some of the unwound requests may
2165 			 * complete!
2166 			 */
2167 			__unwind_incomplete_requests(engine);
2168 
2169 			last = NULL;
2170 		} else if (need_timeslice(engine, last, rb) &&
2171 			   timeslice_expired(execlists, last)) {
			/* Already finished: just rerun the tasklet instead of deferring */
2172 			if (i915_request_completed(last)) {
2173 				tasklet_hi_schedule(&execlists->tasklet);
2174 				return;
2175 			}
2176 
2177 			ENGINE_TRACE(engine,
2178 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2179 				     last->fence.context,
2180 				     last->fence.seqno,
2181 				     last->sched.attr.priority,
2182 				     execlists->queue_priority_hint,
2183 				     yesno(timeslice_yield(execlists, last)));
2184 
2185 			ring_set_paused(engine, 1);
2186 			defer_active(engine);
2187 
2188 			/*
2189 			 * Unlike for preemption, if we rewind and continue
2190 			 * executing the same context as previously active,
2191 			 * the order of execution will remain the same and
2192 			 * the tail will only advance. We do not need to
2193 			 * force a full context restore, as a lite-restore
2194 			 * is sufficient to resample the monotonic TAIL.
2195 			 *
2196 			 * If we switch to any other context, similarly we
2197 			 * will not rewind TAIL of current context, and
2198 			 * normal save/restore will preserve state and allow
2199 			 * us to later continue executing the same request.
2200 			 */
2201 			last = NULL;
2202 		} else {
2203 			/*
2204 			 * Otherwise if we already have a request pending
2205 			 * for execution after the current one, we can
2206 			 * just wait until the next CS event before
2207 			 * queuing more. In either case we will force a
2208 			 * lite-restore preemption event, but if we wait
2209 			 * we hopefully coalesce several updates into a single
2210 			 * submission.
2211 			 */
2212 			if (!list_is_last(&last->sched.link,
2213 					  &engine->active.requests)) {
2214 				/*
2215 				 * Even if ELSP[1] is occupied and not worthy
2216 				 * of timeslices, our queue might be.
2217 				 */
2218 				start_timeslice(engine, queue_prio(execlists));
2219 				return;
2220 			}
2221 		}
2222 	}
2223 
	/*
	 * rb, if non-NULL, is the virtual engine node selected by the scan at
	 * the top of the function. ve->request is re-read under the virtual
	 * engine's active.lock before it is claimed for this engine.
	 */
2224 	while (rb) { /* XXX virtual is always taking precedence */
2225 		struct virtual_engine *ve =
2226 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2227 		struct i915_request *rq;
2228 
2229 		spin_lock(&ve->base.active.lock);
2230 
2231 		rq = ve->request;
2232 		if (unlikely(!rq)) { /* lost the race to a sibling */
2233 			spin_unlock(&ve->base.active.lock);
2234 			rb_erase_cached(rb, &execlists->virtual);
2235 			RB_CLEAR_NODE(rb);
2236 			rb = rb_first_cached(&execlists->virtual);
2237 			continue;
2238 		}
2239 
2240 		GEM_BUG_ON(rq != ve->request);
2241 		GEM_BUG_ON(rq->engine != &ve->base);
2242 		GEM_BUG_ON(rq->context != &ve->context);
2243 
		/* Only take the virtual request if it is at least as urgent as the queue */
2244 		if (rq_prio(rq) >= queue_prio(execlists)) {
2245 			if (!virtual_matches(ve, rq, engine)) {
2246 				spin_unlock(&ve->base.active.lock);
2247 				rb = rb_next(rb);
2248 				continue;
2249 			}
2250 
2251 			if (last && !can_merge_rq(last, rq)) {
2252 				spin_unlock(&ve->base.active.lock);
2253 				start_timeslice(engine, rq_prio(rq));
2254 				return; /* leave this for another sibling */
2255 			}
2256 
2257 			ENGINE_TRACE(engine,
2258 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2259 				     rq->fence.context,
2260 				     rq->fence.seqno,
2261 				     i915_request_completed(rq) ? "!" :
2262 				     i915_request_started(rq) ? "*" :
2263 				     "",
2264 				     yesno(engine != ve->siblings[0]));
2265 
			/* Detach the request from the virtual engine and bind it to us */
2266 			WRITE_ONCE(ve->request, NULL);
2267 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2268 				   INT_MIN);
2269 			rb_erase_cached(rb, &execlists->virtual);
2270 			RB_CLEAR_NODE(rb);
2271 
2272 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2273 			WRITE_ONCE(rq->engine, engine);
2274 
2275 			if (engine != ve->siblings[0]) {
2276 				u32 *regs = ve->context.lrc_reg_state;
2277 				unsigned int n;
2278 
2279 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2280 
2281 				if (!intel_engine_has_relative_mmio(engine))
2282 					virtual_update_register_offsets(regs,
2283 									engine);
2284 
2285 				if (!list_empty(&ve->context.signals))
2286 					virtual_xfer_breadcrumbs(ve);
2287 
2288 				/*
2289 				 * Move the bound engine to the top of the list
2290 				 * for future execution. We then kick this
2291 				 * tasklet first before checking others, so that
2292 				 * we preferentially reuse this set of bound
2293 				 * registers.
2294 				 */
2295 				for (n = 1; n < ve->num_siblings; n++) {
2296 					if (ve->siblings[n] == engine) {
2297 						swap(ve->siblings[n],
2298 						     ve->siblings[0]);
2299 						break;
2300 					}
2301 				}
2302 
2303 				GEM_BUG_ON(ve->siblings[0] != engine);
2304 			}
2305 
2306 			if (__i915_request_submit(rq)) {
2307 				submit = true;
2308 				last = rq;
2309 			}
2310 			i915_request_put(rq);
2311 
2312 			/*
2313 			 * Hmm, we have a bunch of virtual engine requests,
2314 			 * but the first one was already completed (thanks
2315 			 * preempt-to-busy!). Keep looking at the veng queue
2316 			 * until we have no more relevant requests (i.e.
2317 			 * the normal submit queue has higher priority).
2318 			 */
2319 			if (!submit) {
2320 				spin_unlock(&ve->base.active.lock);
2321 				rb = rb_first_cached(&execlists->virtual);
2322 				continue;
2323 			}
2324 		}
2325 
2326 		spin_unlock(&ve->base.active.lock);
2327 		break;
2328 	}
2329 
	/*
	 * Drain the priority queue one priolist at a time, in rbtree order.
	 * A priolist is erased and freed only after all of its requests have
	 * been consumed; "goto done" leaves the current priolist in the tree.
	 */
2330 	while ((rb = rb_first_cached(&execlists->queue))) {
2331 		struct i915_priolist *p = to_priolist(rb);
2332 		struct i915_request *rq, *rn;
2333 		int i;
2334 
2335 		priolist_for_each_request_consume(rq, rn, p, i) {
2336 			bool merge = true;
2337 
2338 			/*
2339 			 * Can we combine this request with the current port?
2340 			 * It has to be the same context/ringbuffer and not
2341 			 * have any exceptions (e.g. GVT saying never to
2342 			 * combine contexts).
2343 			 *
2344 			 * If we can combine the requests, we can execute both
2345 			 * by updating the RING_TAIL to point to the end of the
2346 			 * second request, and so we never need to tell the
2347 			 * hardware about the first.
2348 			 */
2349 			if (last && !can_merge_rq(last, rq)) {
2350 				/*
2351 				 * If we are on the second port and cannot
2352 				 * combine this request with the last, then we
2353 				 * are done.
2354 				 */
2355 				if (port == last_port)
2356 					goto done;
2357 
2358 				/*
2359 				 * We must not populate both ELSP[] with the
2360 				 * same LRCA, i.e. we must submit 2 different
2361 				 * contexts if we submit 2 ELSP.
2362 				 */
2363 				if (last->context == rq->context)
2364 					goto done;
2365 
2366 				if (i915_request_has_sentinel(last))
2367 					goto done;
2368 
2369 				/*
2370 				 * If GVT overrides us we only ever submit
2371 				 * port[0], leaving port[1] empty. Note that we
2372 				 * also have to be careful that we don't queue
2373 				 * the same context (even though a different
2374 				 * request) to the second port.
2375 				 */
2376 				if (ctx_single_port_submission(last->context) ||
2377 				    ctx_single_port_submission(rq->context))
2378 					goto done;
2379 
2380 				merge = false;
2381 			}
2382 
2383 			if (__i915_request_submit(rq)) {
				/* Close the current port with last and move on to the next one */
2384 				if (!merge) {
2385 					*port = execlists_schedule_in(last, port - execlists->pending);
2386 					port++;
2387 					last = NULL;
2388 				}
2389 
2390 				GEM_BUG_ON(last &&
2391 					   !can_merge_ctx(last->context,
2392 							  rq->context));
2393 				GEM_BUG_ON(last &&
2394 					   i915_seqno_passed(last->fence.seqno,
2395 							     rq->fence.seqno));
2396 
2397 				submit = true;
2398 				last = rq;
2399 			}
2400 		}
2401 
2402 		rb_erase_cached(&p->node, &execlists->queue);
2403 		i915_priolist_free(p);
2404 	}
2405 
2406 done:
2407 	/*
2408 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2409 	 *
2410 	 * We choose the priority hint such that if we add a request of greater
2411 	 * priority than this, we kick the submission tasklet to decide on
2412 	 * the right order of submitting the requests to hardware. We must
2413 	 * also be prepared to reorder requests as they are in-flight on the
2414 	 * HW. We derive the priority hint then as the first "hole" in
2415 	 * the HW submission ports and if there are no available slots,
2416 	 * the priority of the lowest executing request, i.e. last.
2417 	 *
2418 	 * When we do receive a higher priority request ready to run from the
2419 	 * user, see queue_request(), the priority hint is bumped to that
2420 	 * request triggering preemption on the next dequeue (or subsequent
2421 	 * interrupt for secondary ports).
2422 	 */
2423 	execlists->queue_priority_hint = queue_prio(execlists);
2424 
2425 	if (submit) {
		/* Close the final port with the last request we collected */
2426 		*port = execlists_schedule_in(last, port - execlists->pending);
2427 		execlists->switch_priority_hint =
2428 			switch_prio(engine, *execlists->pending);
2429 
2430 		/*
2431 		 * Skip if we ended up with exactly the same set of requests,
2432 		 * e.g. trying to timeslice a pair of ordered contexts
2433 		 */
2434 		if (!memcmp(active, execlists->pending,
2435 			    (port - execlists->pending + 1) * sizeof(*port))) {
			/* Undo the schedule_in for every port filled, last to first */
2436 			do
2437 				execlists_schedule_out(fetch_and_zero(port));
2438 			while (port-- != execlists->pending);
2439 
2440 			goto skip_submit;
2441 		}
		/* NULL out the unused slots after the last filled port */
2442 		clear_ports(port + 1, last_port - port);
2443 
2444 		WRITE_ONCE(execlists->yield, -1);
2445 		set_preempt_timeout(engine, *active);
2446 		execlists_submit_ports(engine);
2447 	} else {
2448 skip_submit:
		/* Nothing new for the HW: undo any ring_set_paused(engine, 1) above */
2449 		ring_set_paused(engine, 0);
2450 	}
2451 }
2452 
2453 static void
2454 cancel_port_requests(struct intel_engine_execlists * const execlists)
2455 {
2456 	struct i915_request * const *port;
2457 
2458 	for (port = execlists->pending; *port; port++)
2459 		execlists_schedule_out(*port);
2460 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2461 
2462 	/* Mark the end of active before we overwrite *active */
2463 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2464 		execlists_schedule_out(*port);
2465 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2466 
2467 	smp_wmb(); /* complete the seqlock for execlists_active() */
2468 	WRITE_ONCE(execlists->active, execlists->inflight);
2469 }
2470 
2471 static inline void
2472 invalidate_csb_entries(const u32 *first, const u32 *last)
2473 {
2474 	clflush((void *)first);
2475 	clflush((void *)last);
2476 }
2477 
2478 /*
2479  * Starting with Gen12, the status has a new format:
2480  *
2481  *     bit  0:     switched to new queue
2482  *     bit  1:     reserved
2483  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2484  *                 switch detail is set to "wait on semaphore"
2485  *     bits 3-5:   engine class
2486  *     bits 6-11:  engine instance
2487  *     bits 12-14: reserved
2488  *     bits 15-25: sw context id of the lrc the GT switched to
2489  *     bits 26-31: sw counter of the lrc the GT switched to
2490  *     bits 32-35: context switch detail
2491  *                  - 0: ctx complete
2492  *                  - 1: wait on sync flip
2493  *                  - 2: wait on vblank
2494  *                  - 3: wait on scanline
2495  *                  - 4: wait on semaphore
2496  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2497  *                       WAIT_FOR_EVENT)
2498  *     bit  36:    reserved
2499  *     bits 37-43: wait detail (for switch detail 1 to 4)
2500  *     bits 44-46: reserved
2501  *     bits 47-57: sw context id of the lrc the GT switched away from
2502  *     bits 58-63: sw counter of the lrc the GT switched away from
2503  */
2504 static inline bool
2505 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2506 {
2507 	u32 lower_dw = csb[0];
2508 	u32 upper_dw = csb[1];
2509 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2510 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2511 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2512 
2513 	/*
2514 	 * The context switch detail is not guaranteed to be 5 when a preemption
2515 	 * occurs, so we can't just check for that. The check below works for
2516 	 * all the cases we care about, including preemptions of WAIT
2517 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2518 	 * would require some extra handling, but we don't support that.
2519 	 */
2520 	if (!ctx_away_valid || new_queue) {
2521 		GEM_BUG_ON(!ctx_to_valid);
2522 		return true;
2523 	}
2524 
2525 	/*
2526 	 * switch detail = 5 is covered by the case above and we do not expect a
2527 	 * context switch on an unsuccessful wait instruction since we always
2528 	 * use polling mode.
2529 	 */
2530 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2531 	return false;
2532 }
2533 
2534 static inline bool
2535 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2536 {
2537 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2538 }
2539 
/*
 * process_csb() - consume new context-status-buffer events for @engine.
 *
 * Walks the CSB entries from execlists->csb_head up to the write pointer
 * read from *execlists->csb_write. For each entry, the per-generation parser
 * decides between:
 *  - "promote": the pending[] ports become the new inflight[]/active set and
 *    every previously active request is scheduled out; or
 *  - completion: the request at the head of active is scheduled out and
 *    active advances by one port.
 *
 * Afterwards csb_head is updated, the timeslice timer is reprogrammed via
 * set_timeslice() and the first and last CSB entries are clflushed.
 */
2540 static void process_csb(struct intel_engine_cs *engine)
2541 {
2542 	struct intel_engine_execlists * const execlists = &engine->execlists;
2543 	const u32 * const buf = execlists->csb_status;
2544 	const u8 num_entries = execlists->csb_size;
2545 	u8 head, tail;
2546 
2547 	/*
2548 	 * As we modify our execlists state tracking we require exclusive
2549 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2550 	 * and we assume that is only inside the reset paths and so serialised.
2551 	 */
2552 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2553 		   !reset_in_progress(execlists));
2554 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2555 
2556 	/*
2557 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2558 	 * When reading from the csb_write mmio register, we have to be
2559 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2560 	 * the low 4bits. As it happens we know the next 4bits are always
2561 	 * zero and so we can simply mask off the low u8 of the register
2562 	 * and treat it identically to reading from the HWSP (without having
2563 	 * to use explicit shifting and masking, and probably bifurcating
2564 	 * the code to handle the legacy mmio read).
2565 	 */
2566 	head = execlists->csb_head;
2567 	tail = READ_ONCE(*execlists->csb_write);
2568 	if (unlikely(head == tail))
2569 		return;
2570 
2571 	/*
2572 	 * Hopefully paired with a wmb() in HW!
2573 	 *
2574 	 * We must complete the read of the write pointer before any reads
2575 	 * from the CSB, so that we do not see stale values. Without an rmb
2576 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2577 	 * we perform the READ_ONCE(*csb_write).
2578 	 */
2579 	rmb();
2580 
2581 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2582 	do {
2583 		bool promote;
2584 
		/* Step to the next entry, wrapping at the end of the buffer */
2585 		if (++head == num_entries)
2586 			head = 0;
2587 
2588 		/*
2589 		 * We are flying near dragons again.
2590 		 *
2591 		 * We hold a reference to the request in execlist_port[]
2592 		 * but no more than that. We are operating in softirq
2593 		 * context and so cannot hold any mutex or sleep. That
2594 		 * prevents us stopping the requests we are processing
2595 		 * in port[] from being retired simultaneously (the
2596 		 * breadcrumb will be complete before we see the
2597 		 * context-switch). As we only hold the reference to the
2598 		 * request, any pointer chasing underneath the request
2599 		 * is subject to a potential use-after-free. Thus we
2600 		 * store all of the bookkeeping within port[] as
2601 		 * required, and avoid using unguarded pointers beneath
2602 		 * request itself. The same applies to the atomic
2603 		 * status notifier.
2604 		 */
2605 
		/* Each CSB entry is two u32s, hence the 2 * head indexing */
2606 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2607 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2608 
2609 		if (INTEL_GEN(engine->i915) >= 12)
2610 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2611 		else
2612 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2613 		if (promote) {
2614 			struct i915_request * const *old = execlists->active;
2615 
2616 			ring_set_paused(engine, 0);
2617 
2618 			/* Point active to the new ELSP; prevent overwriting */
2619 			WRITE_ONCE(execlists->active, execlists->pending);
2620 			smp_wmb(); /* notify execlists_active() */
2621 
2622 			/* cancel old inflight, prepare for switch */
2623 			trace_ports(execlists, "preempted", old);
2624 			while (*old)
2625 				execlists_schedule_out(*old++);
2626 
2627 			/* switch pending to inflight */
2628 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2629 			memcpy(execlists->inflight,
2630 			       execlists->pending,
2631 			       execlists_num_ports(execlists) *
2632 			       sizeof(*execlists->pending));
2633 			smp_wmb(); /* complete the seqlock */
2634 			WRITE_ONCE(execlists->active, execlists->inflight);
2635 
			/* pending[0] == NULL lets __execlists_submission_tasklet() dequeue again */
2636 			WRITE_ONCE(execlists->pending[0], NULL);
2637 		} else {
2638 			GEM_BUG_ON(!*execlists->active);
2639 
2640 			/* port0 completed, advanced to port1 */
2641 			trace_ports(execlists, "completed", execlists->active);
2642 
2643 			/*
2644 			 * We rely on the hardware being strongly
2645 			 * ordered, that the breadcrumb write is
2646 			 * coherent (visible from the CPU) before the
2647 			 * user interrupt is processed. One might assume
2648 			 * that the breadcrumb write being before the
2649 			 * user interrupt and the CS event for the context
2650 			 * switch would therefore be before the CS event
2651 			 * itself...
2652 			 */
2653 			if (GEM_SHOW_DEBUG() &&
2654 			    !i915_request_completed(*execlists->active)) {
2655 				struct i915_request *rq = *execlists->active;
2656 				const u32 *regs __maybe_unused =
2657 					rq->context->lrc_reg_state;
2658 
2659 				ENGINE_TRACE(engine,
2660 					     "context completed before request!\n");
2661 				ENGINE_TRACE(engine,
2662 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2663 					     ENGINE_READ(engine, RING_START),
2664 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2665 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2666 					     ENGINE_READ(engine, RING_CTL),
2667 					     ENGINE_READ(engine, RING_MI_MODE));
2668 				ENGINE_TRACE(engine,
2669 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2670 					     i915_ggtt_offset(rq->ring->vma),
2671 					     rq->head, rq->tail,
2672 					     rq->fence.context,
2673 					     lower_32_bits(rq->fence.seqno),
2674 					     hwsp_seqno(rq));
2675 				ENGINE_TRACE(engine,
2676 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2677 					     regs[CTX_RING_START],
2678 					     regs[CTX_RING_HEAD],
2679 					     regs[CTX_RING_TAIL]);
2680 			}
2681 
			/* Release the completed head request and advance active by one port */
2682 			execlists_schedule_out(*execlists->active++);
2683 
2684 			GEM_BUG_ON(execlists->active - execlists->inflight >
2685 				   execlists_num_ports(execlists));
2686 		}
2687 	} while (head != tail);
2688 
2689 	execlists->csb_head = head;
2690 	set_timeslice(engine);
2691 
2692 	/*
2693 	 * Gen11 has proven to fail wrt global observation point between
2694 	 * entry and tail update, failing on the ordering and thus
2695 	 * we see an old entry in the context status buffer.
2696 	 *
2697 	 * Forcibly evict out entries for the next gpu csb update,
2698 	 * to increase the odds that we get fresh entries with non
2699 	 * working hardware. The cost for doing so comes out mostly with
2700 	 * the wash as hardware, working or not, will need to do the
2701 	 * invalidation before.
2702 	 */
2703 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2704 }
2705 
2706 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2707 {
2708 	lockdep_assert_held(&engine->active.lock);
2709 	if (!READ_ONCE(engine->execlists.pending[0])) {
2710 		rcu_read_lock(); /* protect peeking at execlists->active */
2711 		execlists_dequeue(engine);
2712 		rcu_read_unlock();
2713 	}
2714 }
2715 
2716 static void __execlists_hold(struct i915_request *rq)
2717 {
2718 	LIST_HEAD(list);
2719 
2720 	do {
2721 		struct i915_dependency *p;
2722 
2723 		if (i915_request_is_active(rq))
2724 			__i915_request_unsubmit(rq);
2725 
2726 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2727 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2728 		i915_request_set_hold(rq);
2729 		RQ_TRACE(rq, "on hold\n");
2730 
2731 		for_each_waiter(p, rq) {
2732 			struct i915_request *w =
2733 				container_of(p->waiter, typeof(*w), sched);
2734 
2735 			/* Leave semaphores spinning on the other engines */
2736 			if (w->engine != rq->engine)
2737 				continue;
2738 
2739 			if (!i915_request_is_ready(w))
2740 				continue;
2741 
2742 			if (i915_request_completed(w))
2743 				continue;
2744 
2745 			if (i915_request_on_hold(w))
2746 				continue;
2747 
2748 			list_move_tail(&w->sched.link, &list);
2749 		}
2750 
2751 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2752 	} while (rq);
2753 }
2754 
2755 static bool execlists_hold(struct intel_engine_cs *engine,
2756 			   struct i915_request *rq)
2757 {
2758 	spin_lock_irq(&engine->active.lock);
2759 
2760 	if (i915_request_completed(rq)) { /* too late! */
2761 		rq = NULL;
2762 		goto unlock;
2763 	}
2764 
2765 	if (rq->engine != engine) { /* preempted virtual engine */
2766 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2767 
2768 		/*
2769 		 * intel_context_inflight() is only protected by virtue
2770 		 * of process_csb() being called only by the tasklet (or
2771 		 * directly from inside reset while the tasklet is suspended).
2772 		 * Assert that neither of those are allowed to run while we
2773 		 * poke at the request queues.
2774 		 */
2775 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2776 
2777 		/*
2778 		 * An unsubmitted request along a virtual engine will
2779 		 * remain on the active (this) engine until we are able
2780 		 * to process the context switch away (and so mark the
2781 		 * context as no longer in flight). That cannot have happened
2782 		 * yet, otherwise we would not be hanging!
2783 		 */
2784 		spin_lock(&ve->base.active.lock);
2785 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2786 		GEM_BUG_ON(ve->request != rq);
2787 		ve->request = NULL;
2788 		spin_unlock(&ve->base.active.lock);
2789 		i915_request_put(rq);
2790 
2791 		rq->engine = engine;
2792 	}
2793 
2794 	/*
2795 	 * Transfer this request onto the hold queue to prevent it
2796 	 * being resumbitted to HW (and potentially completed) before we have
2797 	 * released it. Since we may have already submitted following
2798 	 * requests, we need to remove those as well.
2799 	 */
2800 	GEM_BUG_ON(i915_request_on_hold(rq));
2801 	GEM_BUG_ON(rq->engine != engine);
2802 	__execlists_hold(rq);
2803 	GEM_BUG_ON(list_empty(&engine->active.hold));
2804 
2805 unlock:
2806 	spin_unlock_irq(&engine->active.lock);
2807 	return rq;
2808 }
2809 
2810 static bool hold_request(const struct i915_request *rq)
2811 {
2812 	struct i915_dependency *p;
2813 	bool result = false;
2814 
2815 	/*
2816 	 * If one of our ancestors is on hold, we must also be on hold,
2817 	 * otherwise we will bypass it and execute before it.
2818 	 */
2819 	rcu_read_lock();
2820 	for_each_signaler(p, rq) {
2821 		const struct i915_request *s =
2822 			container_of(p->signaler, typeof(*s), sched);
2823 
2824 		if (s->engine != rq->engine)
2825 			continue;
2826 
2827 		result = i915_request_on_hold(s);
2828 		if (result)
2829 			break;
2830 	}
2831 	rcu_read_unlock();
2832 
2833 	return result;
2834 }
2835 
2836 static void __execlists_unhold(struct i915_request *rq)
2837 {
2838 	LIST_HEAD(list);
2839 
2840 	do {
2841 		struct i915_dependency *p;
2842 
2843 		RQ_TRACE(rq, "hold release\n");
2844 
2845 		GEM_BUG_ON(!i915_request_on_hold(rq));
2846 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2847 
2848 		i915_request_clear_hold(rq);
2849 		list_move_tail(&rq->sched.link,
2850 			       i915_sched_lookup_priolist(rq->engine,
2851 							  rq_prio(rq)));
2852 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2853 
2854 		/* Also release any children on this engine that are ready */
2855 		for_each_waiter(p, rq) {
2856 			struct i915_request *w =
2857 				container_of(p->waiter, typeof(*w), sched);
2858 
2859 			/* Propagate any change in error status */
2860 			if (rq->fence.error)
2861 				i915_request_set_error_once(w, rq->fence.error);
2862 
2863 			if (w->engine != rq->engine)
2864 				continue;
2865 
2866 			if (!i915_request_on_hold(w))
2867 				continue;
2868 
2869 			/* Check that no other parents are also on hold */
2870 			if (hold_request(w))
2871 				continue;
2872 
2873 			list_move_tail(&w->sched.link, &list);
2874 		}
2875 
2876 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2877 	} while (rq);
2878 }
2879 
2880 static void execlists_unhold(struct intel_engine_cs *engine,
2881 			     struct i915_request *rq)
2882 {
2883 	spin_lock_irq(&engine->active.lock);
2884 
2885 	/*
2886 	 * Move this request back to the priority queue, and all of its
2887 	 * children and grandchildren that were suspended along with it.
2888 	 */
2889 	__execlists_unhold(rq);
2890 
2891 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2892 		engine->execlists.queue_priority_hint = rq_prio(rq);
2893 		tasklet_hi_schedule(&engine->execlists.tasklet);
2894 	}
2895 
2896 	spin_unlock_irq(&engine->active.lock);
2897 }
2898 
/*
 * State handed to execlists_capture_work(): allocated by capture_regs() and
 * freed by the worker once the capture is complete.
 */
2899 struct execlists_capture {
2900 	struct work_struct work; /* worker item; container_of() recovers the capture */
2901 	struct i915_request *rq; /* request being captured; unheld and put by the worker */
2902 	struct i915_gpu_coredump *error; /* coredump being filled; stored and put by the worker */
2903 };
2904 
2905 static void execlists_capture_work(struct work_struct *work)
2906 {
2907 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2908 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2909 	struct intel_engine_cs *engine = cap->rq->engine;
2910 	struct intel_gt_coredump *gt = cap->error->gt;
2911 	struct intel_engine_capture_vma *vma;
2912 
2913 	/* Compress all the objects attached to the request, slow! */
2914 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2915 	if (vma) {
2916 		struct i915_vma_compress *compress =
2917 			i915_vma_capture_prepare(gt);
2918 
2919 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2920 		i915_vma_capture_finish(gt, compress);
2921 	}
2922 
2923 	gt->simulated = gt->engine->simulated;
2924 	cap->error->simulated = gt->simulated;
2925 
2926 	/* Publish the error state, and announce it to the world */
2927 	i915_error_state_store(cap->error);
2928 	i915_gpu_coredump_put(cap->error);
2929 
2930 	/* Return this request and all that depend upon it for signaling */
2931 	execlists_unhold(engine, cap->rq);
2932 	i915_request_put(cap->rq);
2933 
2934 	kfree(cap);
2935 }
2936 
2937 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2938 {
2939 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2940 	struct execlists_capture *cap;
2941 
2942 	cap = kmalloc(sizeof(*cap), gfp);
2943 	if (!cap)
2944 		return NULL;
2945 
2946 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2947 	if (!cap->error)
2948 		goto err_cap;
2949 
2950 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2951 	if (!cap->error->gt)
2952 		goto err_gpu;
2953 
2954 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2955 	if (!cap->error->gt->engine)
2956 		goto err_gt;
2957 
2958 	return cap;
2959 
2960 err_gt:
2961 	kfree(cap->error->gt);
2962 err_gpu:
2963 	kfree(cap->error);
2964 err_cap:
2965 	kfree(cap);
2966 	return NULL;
2967 }
2968 
2969 static struct i915_request *
2970 active_context(struct intel_engine_cs *engine, u32 ccid)
2971 {
2972 	const struct intel_engine_execlists * const el = &engine->execlists;
2973 	struct i915_request * const *port, *rq;
2974 
2975 	/*
2976 	 * Use the most recent result from process_csb(), but just in case
2977 	 * we trigger an error (via interrupt) before the first CS event has
2978 	 * been written, peek at the next submission.
2979 	 */
2980 
2981 	for (port = el->active; (rq = *port); port++) {
2982 		if (rq->context->lrc.ccid == ccid) {
2983 			ENGINE_TRACE(engine,
2984 				     "ccid found at active:%zd\n",
2985 				     port - el->active);
2986 			return rq;
2987 		}
2988 	}
2989 
2990 	for (port = el->pending; (rq = *port); port++) {
2991 		if (rq->context->lrc.ccid == ccid) {
2992 			ENGINE_TRACE(engine,
2993 				     "ccid found at pending:%zd\n",
2994 				     port - el->pending);
2995 			return rq;
2996 		}
2997 	}
2998 
2999 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3000 	return NULL;
3001 }
3002 
3003 static u32 active_ccid(struct intel_engine_cs *engine)
3004 {
3005 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3006 }
3007 
3008 static bool execlists_capture(struct intel_engine_cs *engine)
3009 {
3010 	struct execlists_capture *cap;
3011 
3012 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3013 		return true;
3014 
3015 	/*
3016 	 * We need to _quickly_ capture the engine state before we reset.
3017 	 * We are inside an atomic section (softirq) here and we are delaying
3018 	 * the forced preemption event.
3019 	 */
3020 	cap = capture_regs(engine);
3021 	if (!cap)
3022 		return true;
3023 
3024 	spin_lock_irq(&engine->active.lock);
3025 	cap->rq = active_context(engine, active_ccid(engine));
3026 	if (cap->rq) {
3027 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3028 		cap->rq = i915_request_get_rcu(cap->rq);
3029 	}
3030 	spin_unlock_irq(&engine->active.lock);
3031 	if (!cap->rq)
3032 		goto err_free;
3033 
3034 	/*
3035 	 * Remove the request from the execlists queue, and take ownership
3036 	 * of the request. We pass it to our worker who will _slowly_ compress
3037 	 * all the pages the _user_ requested for debugging their batch, after
3038 	 * which we return it to the queue for signaling.
3039 	 *
3040 	 * By removing them from the execlists queue, we also remove the
3041 	 * requests from being processed by __unwind_incomplete_requests()
3042 	 * during the intel_engine_reset(), and so they will *not* be replayed
3043 	 * afterwards.
3044 	 *
3045 	 * Note that because we have not yet reset the engine at this point,
3046 	 * it is possible for the request that we have identified as being
3047 	 * guilty, did in fact complete and we will then hit an arbitration
3048 	 * point allowing the outstanding preemption to succeed. The likelihood
3049 	 * of that is very low (as capturing of the engine registers should be
3050 	 * fast enough to run inside an irq-off atomic section!), so we will
3051 	 * simply hold that request accountable for being non-preemptible
3052 	 * long enough to force the reset.
3053 	 */
3054 	if (!execlists_hold(engine, cap->rq))
3055 		goto err_rq;
3056 
3057 	INIT_WORK(&cap->work, execlists_capture_work);
3058 	schedule_work(&cap->work);
3059 	return true;
3060 
3061 err_rq:
3062 	i915_request_put(cap->rq);
3063 err_free:
3064 	i915_gpu_coredump_put(cap->error);
3065 	kfree(cap);
3066 	return false;
3067 }
3068 
3069 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3070 {
3071 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3072 	unsigned long *lock = &engine->gt->reset.flags;
3073 
3074 	if (!intel_has_reset_engine(engine->gt))
3075 		return;
3076 
3077 	if (test_and_set_bit(bit, lock))
3078 		return;
3079 
3080 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3081 
3082 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3083 	tasklet_disable_nosync(&engine->execlists.tasklet);
3084 
3085 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3086 	if (execlists_capture(engine))
3087 		intel_engine_reset(engine, msg);
3088 	else
3089 		ring_set_paused(engine, 0);
3090 
3091 	tasklet_enable(&engine->execlists.tasklet);
3092 	clear_and_wake_up_bit(bit, lock);
3093 }
3094 
3095 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3096 {
3097 	const struct timer_list *t = &engine->execlists.preempt;
3098 
3099 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3100 		return false;
3101 
3102 	if (!timer_expired(t))
3103 		return false;
3104 
3105 	return READ_ONCE(engine->execlists.pending[0]);
3106 }
3107 
3108 /*
3109  * Check the unread Context Status Buffers and manage the submission of new
3110  * contexts to the ELSP accordingly.
3111  */
3112 static void execlists_submission_tasklet(unsigned long data)
3113 {
3114 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3115 	bool timeout = preempt_timeout(engine);
3116 
3117 	process_csb(engine);
3118 
3119 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3120 		engine->execlists.error_interrupt = 0;
3121 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
3122 			execlists_reset(engine, "CS error");
3123 	}
3124 
3125 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3126 		unsigned long flags;
3127 
3128 		spin_lock_irqsave(&engine->active.lock, flags);
3129 		__execlists_submission_tasklet(engine);
3130 		spin_unlock_irqrestore(&engine->active.lock, flags);
3131 
3132 		/* Recheck after serialising with direct-submission */
3133 		if (unlikely(timeout && preempt_timeout(engine)))
3134 			execlists_reset(engine, "preemption time out");
3135 	}
3136 }
3137 
3138 static void __execlists_kick(struct intel_engine_execlists *execlists)
3139 {
3140 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3141 	tasklet_hi_schedule(&execlists->tasklet);
3142 }
3143 
3144 #define execlists_kick(t, member) \
3145 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3146 
3147 static void execlists_timeslice(struct timer_list *timer)
3148 {
3149 	execlists_kick(timer, timer);
3150 }
3151 
3152 static void execlists_preempt(struct timer_list *timer)
3153 {
3154 	execlists_kick(timer, preempt);
3155 }
3156 
3157 static void queue_request(struct intel_engine_cs *engine,
3158 			  struct i915_request *rq)
3159 {
3160 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3161 	list_add_tail(&rq->sched.link,
3162 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3163 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3164 }
3165 
3166 static void __submit_queue_imm(struct intel_engine_cs *engine)
3167 {
3168 	struct intel_engine_execlists * const execlists = &engine->execlists;
3169 
3170 	if (reset_in_progress(execlists))
3171 		return; /* defer until we restart the engine following reset */
3172 
3173 	/* Hopefully we clear execlists->pending[] to let us through */
3174 	if (READ_ONCE(execlists->pending[0]) &&
3175 	    tasklet_trylock(&execlists->tasklet)) {
3176 		process_csb(engine);
3177 		tasklet_unlock(&execlists->tasklet);
3178 	}
3179 
3180 	__execlists_submission_tasklet(engine);
3181 }
3182 
3183 static void submit_queue(struct intel_engine_cs *engine,
3184 			 const struct i915_request *rq)
3185 {
3186 	struct intel_engine_execlists *execlists = &engine->execlists;
3187 
3188 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3189 		return;
3190 
3191 	execlists->queue_priority_hint = rq_prio(rq);
3192 	__submit_queue_imm(engine);
3193 }
3194 
3195 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3196 			     const struct i915_request *rq)
3197 {
3198 	GEM_BUG_ON(i915_request_on_hold(rq));
3199 	return !list_empty(&engine->active.hold) && hold_request(rq);
3200 }
3201 
3202 static void execlists_submit_request(struct i915_request *request)
3203 {
3204 	struct intel_engine_cs *engine = request->engine;
3205 	unsigned long flags;
3206 
3207 	/* Will be called from irq-context when using foreign fences. */
3208 	spin_lock_irqsave(&engine->active.lock, flags);
3209 
3210 	if (unlikely(ancestor_on_hold(engine, request))) {
3211 		RQ_TRACE(request, "ancestor on hold\n");
3212 		list_add_tail(&request->sched.link, &engine->active.hold);
3213 		i915_request_set_hold(request);
3214 	} else {
3215 		queue_request(engine, request);
3216 
3217 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3218 		GEM_BUG_ON(list_empty(&request->sched.link));
3219 
3220 		submit_queue(engine, request);
3221 	}
3222 
3223 	spin_unlock_irqrestore(&engine->active.lock, flags);
3224 }
3225 
3226 static void __execlists_context_fini(struct intel_context *ce)
3227 {
3228 	intel_ring_put(ce->ring);
3229 	i915_vma_put(ce->state);
3230 }
3231 
3232 static void execlists_context_destroy(struct kref *kref)
3233 {
3234 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3235 
3236 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3237 	GEM_BUG_ON(intel_context_is_pinned(ce));
3238 
3239 	if (ce->state)
3240 		__execlists_context_fini(ce);
3241 
3242 	intel_context_fini(ce);
3243 	intel_context_free(ce);
3244 }
3245 
3246 static void
3247 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3248 {
3249 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3250 		return;
3251 
3252 	vaddr += engine->context_size;
3253 
3254 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3255 }
3256 
3257 static void
3258 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3259 {
3260 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3261 		return;
3262 
3263 	vaddr += engine->context_size;
3264 
3265 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3266 		drm_err_once(&engine->i915->drm,
3267 			     "%s context redzone overwritten!\n",
3268 			     engine->name);
3269 }
3270 
3271 static void execlists_context_unpin(struct intel_context *ce)
3272 {
3273 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3274 		      ce->engine);
3275 
3276 	i915_gem_object_unpin_map(ce->state->obj);
3277 }
3278 
3279 static u32 *
3280 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3281 {
3282 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3283 		MI_SRM_LRM_GLOBAL_GTT |
3284 		MI_LRI_LRM_CS_MMIO;
3285 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3286 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3287 		CTX_TIMESTAMP * sizeof(u32);
3288 	*cs++ = 0;
3289 
3290 	*cs++ = MI_LOAD_REGISTER_REG |
3291 		MI_LRR_SOURCE_CS_MMIO |
3292 		MI_LRI_LRM_CS_MMIO;
3293 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3294 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3295 
3296 	*cs++ = MI_LOAD_REGISTER_REG |
3297 		MI_LRR_SOURCE_CS_MMIO |
3298 		MI_LRI_LRM_CS_MMIO;
3299 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3300 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3301 
3302 	return cs;
3303 }
3304 
3305 static u32 *
3306 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3307 {
3308 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3309 
3310 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3311 		MI_SRM_LRM_GLOBAL_GTT |
3312 		MI_LRI_LRM_CS_MMIO;
3313 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3314 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3315 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3316 	*cs++ = 0;
3317 
3318 	return cs;
3319 }
3320 
3321 static u32 *
3322 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3323 {
3324 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3325 
3326 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3327 		MI_SRM_LRM_GLOBAL_GTT |
3328 		MI_LRI_LRM_CS_MMIO;
3329 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3330 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3331 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3332 	*cs++ = 0;
3333 
3334 	*cs++ = MI_LOAD_REGISTER_REG |
3335 		MI_LRR_SOURCE_CS_MMIO |
3336 		MI_LRI_LRM_CS_MMIO;
3337 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3338 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3339 
3340 	return cs;
3341 }
3342 
3343 static u32 *
3344 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3345 {
3346 	cs = gen12_emit_timestamp_wa(ce, cs);
3347 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3348 	cs = gen12_emit_restore_scratch(ce, cs);
3349 
3350 	return cs;
3351 }
3352 
3353 static u32 *
3354 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3355 {
3356 	cs = gen12_emit_timestamp_wa(ce, cs);
3357 	cs = gen12_emit_restore_scratch(ce, cs);
3358 
3359 	return cs;
3360 }
3361 
3362 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3363 {
3364 	return PAGE_SIZE * ce->wa_bb_page;
3365 }
3366 
3367 static u32 *context_indirect_bb(const struct intel_context *ce)
3368 {
3369 	void *ptr;
3370 
3371 	GEM_BUG_ON(!ce->wa_bb_page);
3372 
3373 	ptr = ce->lrc_reg_state;
3374 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3375 	ptr += context_wa_bb_offset(ce);
3376 
3377 	return ptr;
3378 }
3379 
3380 static void
3381 setup_indirect_ctx_bb(const struct intel_context *ce,
3382 		      const struct intel_engine_cs *engine,
3383 		      u32 *(*emit)(const struct intel_context *, u32 *))
3384 {
3385 	u32 * const start = context_indirect_bb(ce);
3386 	u32 *cs;
3387 
3388 	cs = emit(ce, start);
3389 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3390 	while ((unsigned long)cs % CACHELINE_BYTES)
3391 		*cs++ = MI_NOOP;
3392 
3393 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3394 				    i915_ggtt_offset(ce->state) +
3395 				    context_wa_bb_offset(ce),
3396 				    (cs - start) * sizeof(*cs));
3397 }
3398 
3399 static void
3400 __execlists_update_reg_state(const struct intel_context *ce,
3401 			     const struct intel_engine_cs *engine,
3402 			     u32 head)
3403 {
3404 	struct intel_ring *ring = ce->ring;
3405 	u32 *regs = ce->lrc_reg_state;
3406 
3407 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3408 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3409 
3410 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3411 	regs[CTX_RING_HEAD] = head;
3412 	regs[CTX_RING_TAIL] = ring->tail;
3413 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3414 
3415 	/* RPCS */
3416 	if (engine->class == RENDER_CLASS) {
3417 		regs[CTX_R_PWR_CLK_STATE] =
3418 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3419 
3420 		i915_oa_init_reg_state(ce, engine);
3421 	}
3422 
3423 	if (ce->wa_bb_page) {
3424 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3425 
3426 		fn = gen12_emit_indirect_ctx_xcs;
3427 		if (ce->engine->class == RENDER_CLASS)
3428 			fn = gen12_emit_indirect_ctx_rcs;
3429 
3430 		/* Mutually exclusive wrt to global indirect bb */
3431 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3432 		setup_indirect_ctx_bb(ce, engine, fn);
3433 	}
3434 }
3435 
3436 static int
3437 __execlists_context_pin(struct intel_context *ce,
3438 			struct intel_engine_cs *engine)
3439 {
3440 	void *vaddr;
3441 
3442 	GEM_BUG_ON(!ce->state);
3443 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3444 
3445 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3446 					i915_coherent_map_type(engine->i915) |
3447 					I915_MAP_OVERRIDE);
3448 	if (IS_ERR(vaddr))
3449 		return PTR_ERR(vaddr);
3450 
3451 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3452 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3453 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3454 
3455 	return 0;
3456 }
3457 
3458 static int execlists_context_pin(struct intel_context *ce)
3459 {
3460 	return __execlists_context_pin(ce, ce->engine);
3461 }
3462 
3463 static int execlists_context_alloc(struct intel_context *ce)
3464 {
3465 	return __execlists_context_alloc(ce, ce->engine);
3466 }
3467 
3468 static void execlists_context_reset(struct intel_context *ce)
3469 {
3470 	CE_TRACE(ce, "reset\n");
3471 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3472 
3473 	intel_ring_reset(ce->ring, ce->ring->emit);
3474 
3475 	/* Scrub away the garbage */
3476 	execlists_init_reg_state(ce->lrc_reg_state,
3477 				 ce, ce->engine, ce->ring, true);
3478 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3479 
3480 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3481 }
3482 
3483 static const struct intel_context_ops execlists_context_ops = {
3484 	.alloc = execlists_context_alloc,
3485 
3486 	.pin = execlists_context_pin,
3487 	.unpin = execlists_context_unpin,
3488 
3489 	.enter = intel_context_enter_engine,
3490 	.exit = intel_context_exit_engine,
3491 
3492 	.reset = execlists_context_reset,
3493 	.destroy = execlists_context_destroy,
3494 };
3495 
3496 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3497 {
3498 	u32 *cs;
3499 
3500 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3501 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3502 		return 0;
3503 
3504 	cs = intel_ring_begin(rq, 6);
3505 	if (IS_ERR(cs))
3506 		return PTR_ERR(cs);
3507 
3508 	/*
3509 	 * Check if we have been preempted before we even get started.
3510 	 *
3511 	 * After this point i915_request_started() reports true, even if
3512 	 * we get preempted and so are no longer running.
3513 	 */
3514 	*cs++ = MI_ARB_CHECK;
3515 	*cs++ = MI_NOOP;
3516 
3517 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3518 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3519 	*cs++ = 0;
3520 	*cs++ = rq->fence.seqno - 1;
3521 
3522 	intel_ring_advance(rq, cs);
3523 
3524 	/* Record the updated position of the request's payload */
3525 	rq->infix = intel_ring_offset(rq, cs);
3526 
3527 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3528 
3529 	return 0;
3530 }
3531 
3532 static int emit_pdps(struct i915_request *rq)
3533 {
3534 	const struct intel_engine_cs * const engine = rq->engine;
3535 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3536 	int err, i;
3537 	u32 *cs;
3538 
3539 	GEM_BUG_ON(intel_vgpu_active(rq->i915));
3540 
3541 	/*
3542 	 * Beware ye of the dragons, this sequence is magic!
3543 	 *
3544 	 * Small changes to this sequence can cause anything from
3545 	 * GPU hangs to forcewake errors and machine lockups!
3546 	 */
3547 
3548 	/* Flush any residual operations from the context load */
3549 	err = engine->emit_flush(rq, EMIT_FLUSH);
3550 	if (err)
3551 		return err;
3552 
3553 	/* Magic required to prevent forcewake errors! */
3554 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3555 	if (err)
3556 		return err;
3557 
3558 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3559 	if (IS_ERR(cs))
3560 		return PTR_ERR(cs);
3561 
3562 	/* Ensure the LRI have landed before we invalidate & continue */
3563 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3564 	for (i = GEN8_3LVL_PDPES; i--; ) {
3565 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3566 		u32 base = engine->mmio_base;
3567 
3568 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3569 		*cs++ = upper_32_bits(pd_daddr);
3570 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3571 		*cs++ = lower_32_bits(pd_daddr);
3572 	}
3573 	*cs++ = MI_NOOP;
3574 
3575 	intel_ring_advance(rq, cs);
3576 
3577 	return 0;
3578 }
3579 
3580 static int execlists_request_alloc(struct i915_request *request)
3581 {
3582 	int ret;
3583 
3584 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3585 
3586 	/*
3587 	 * Flush enough space to reduce the likelihood of waiting after
3588 	 * we start building the request - in which case we will just
3589 	 * have to repeat work.
3590 	 */
3591 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3592 
3593 	/*
3594 	 * Note that after this point, we have committed to using
3595 	 * this request as it is being used to both track the
3596 	 * state of engine initialisation and liveness of the
3597 	 * golden renderstate above. Think twice before you try
3598 	 * to cancel/unwind this request now.
3599 	 */
3600 
3601 	if (!i915_vm_is_4lvl(request->context->vm)) {
3602 		ret = emit_pdps(request);
3603 		if (ret)
3604 			return ret;
3605 	}
3606 
3607 	/* Unconditionally invalidate GPU caches and TLBs. */
3608 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3609 	if (ret)
3610 		return ret;
3611 
3612 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3613 	return 0;
3614 }
3615 
3616 /*
3617  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3618  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3619  * but there is a slight complication as this is applied in WA batch where the
3620  * values are only initialized once so we cannot take register value at the
3621  * beginning and reuse it further; hence we save its value to memory, upload a
3622  * constant value with bit21 set and then we restore it back with the saved value.
3623  * To simplify the WA, a constant value is formed by using the default value
3624  * of this register. This shouldn't be a problem because we are only modifying
3625  * it for a short period and this batch in non-premptible. We can ofcourse
3626  * use additional instructions that read the actual value of the register
3627  * at that time and set our bit of interest but it makes the WA complicated.
3628  *
3629  * This WA is also required for Gen9 so extracting as a function avoids
3630  * code duplication.
3631  */
3632 static u32 *
3633 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3634 {
3635 	/* NB no one else is allowed to scribble over scratch + 256! */
3636 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3637 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3638 	*batch++ = intel_gt_scratch_offset(engine->gt,
3639 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3640 	*batch++ = 0;
3641 
3642 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3643 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3644 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3645 
3646 	batch = gen8_emit_pipe_control(batch,
3647 				       PIPE_CONTROL_CS_STALL |
3648 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3649 				       0);
3650 
3651 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3652 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3653 	*batch++ = intel_gt_scratch_offset(engine->gt,
3654 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3655 	*batch++ = 0;
3656 
3657 	return batch;
3658 }
3659 
3660 /*
3661  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3662  * initialized at the beginning and shared across all contexts but this field
3663  * helps us to have multiple batches at different offsets and select them based
3664  * on a criteria. At the moment this batch always start at the beginning of the page
3665  * and at this point we don't have multiple wa_ctx batch buffers.
3666  *
3667  * The number of WA applied are not known at the beginning; we use this field
3668  * to return the no of DWORDS written.
3669  *
3670  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3671  * so it adds NOOPs as padding to make it cacheline aligned.
3672  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3673  * makes a complete batch buffer.
3674  */
3675 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3676 {
3677 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3678 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3679 
3680 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3681 	if (IS_BROADWELL(engine->i915))
3682 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3683 
3684 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3685 	/* Actual scratch location is at 128 bytes offset */
3686 	batch = gen8_emit_pipe_control(batch,
3687 				       PIPE_CONTROL_FLUSH_L3 |
3688 				       PIPE_CONTROL_STORE_DATA_INDEX |
3689 				       PIPE_CONTROL_CS_STALL |
3690 				       PIPE_CONTROL_QW_WRITE,
3691 				       LRC_PPHWSP_SCRATCH_ADDR);
3692 
3693 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3694 
3695 	/* Pad to end of cacheline */
3696 	while ((unsigned long)batch % CACHELINE_BYTES)
3697 		*batch++ = MI_NOOP;
3698 
3699 	/*
3700 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3701 	 * execution depends on the length specified in terms of cache lines
3702 	 * in the register CTX_RCS_INDIRECT_CTX
3703 	 */
3704 
3705 	return batch;
3706 }
3707 
3708 struct lri {
3709 	i915_reg_t reg;
3710 	u32 value;
3711 };
3712 
3713 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3714 {
3715 	GEM_BUG_ON(!count || count > 63);
3716 
3717 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3718 	do {
3719 		*batch++ = i915_mmio_reg_offset(lri->reg);
3720 		*batch++ = lri->value;
3721 	} while (lri++, --count);
3722 	*batch++ = MI_NOOP;
3723 
3724 	return batch;
3725 }
3726 
3727 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3728 {
3729 	static const struct lri lri[] = {
3730 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3731 		{
3732 			COMMON_SLICE_CHICKEN2,
3733 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3734 				       0),
3735 		},
3736 
3737 		/* BSpec: 11391 */
3738 		{
3739 			FF_SLICE_CHICKEN,
3740 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3741 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3742 		},
3743 
3744 		/* BSpec: 11299 */
3745 		{
3746 			_3D_CHICKEN3,
3747 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3748 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3749 		}
3750 	};
3751 
3752 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3753 
3754 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3755 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3756 
3757 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3758 	batch = gen8_emit_pipe_control(batch,
3759 				       PIPE_CONTROL_FLUSH_L3 |
3760 				       PIPE_CONTROL_STORE_DATA_INDEX |
3761 				       PIPE_CONTROL_CS_STALL |
3762 				       PIPE_CONTROL_QW_WRITE,
3763 				       LRC_PPHWSP_SCRATCH_ADDR);
3764 
3765 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3766 
3767 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3768 	if (HAS_POOLED_EU(engine->i915)) {
3769 		/*
3770 		 * EU pool configuration is setup along with golden context
3771 		 * during context initialization. This value depends on
3772 		 * device type (2x6 or 3x6) and needs to be updated based
3773 		 * on which subslice is disabled especially for 2x6
3774 		 * devices, however it is safe to load default
3775 		 * configuration of 3x6 device instead of masking off
3776 		 * corresponding bits because HW ignores bits of a disabled
3777 		 * subslice and drops down to appropriate config. Please
3778 		 * see render_state_setup() in i915_gem_render_state.c for
3779 		 * possible configurations, to avoid duplication they are
3780 		 * not shown here again.
3781 		 */
3782 		*batch++ = GEN9_MEDIA_POOL_STATE;
3783 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3784 		*batch++ = 0x00777000;
3785 		*batch++ = 0;
3786 		*batch++ = 0;
3787 		*batch++ = 0;
3788 	}
3789 
3790 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3791 
3792 	/* Pad to end of cacheline */
3793 	while ((unsigned long)batch % CACHELINE_BYTES)
3794 		*batch++ = MI_NOOP;
3795 
3796 	return batch;
3797 }
3798 
3799 static u32 *
3800 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3801 {
3802 	int i;
3803 
3804 	/*
3805 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3806 	 *
3807 	 * Ensure the engine is idle prior to programming a
3808 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3809 	 */
3810 	batch = gen8_emit_pipe_control(batch,
3811 				       PIPE_CONTROL_CS_STALL,
3812 				       0);
3813 	/*
3814 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3815 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3816 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3817 	 * confusing. Since gen8_emit_pipe_control() already advances the
3818 	 * batch by 6 dwords, we advance the other 10 here, completing a
3819 	 * cacheline. It's not clear if the workaround requires this padding
3820 	 * before other commands, or if it's just the regular padding we would
3821 	 * already have for the workaround bb, so leave it here for now.
3822 	 */
3823 	for (i = 0; i < 10; i++)
3824 		*batch++ = MI_NOOP;
3825 
3826 	/* Pad to end of cacheline */
3827 	while ((unsigned long)batch % CACHELINE_BYTES)
3828 		*batch++ = MI_NOOP;
3829 
3830 	return batch;
3831 }
3832 
3833 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3834 
3835 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3836 {
3837 	struct drm_i915_gem_object *obj;
3838 	struct i915_vma *vma;
3839 	int err;
3840 
3841 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3842 	if (IS_ERR(obj))
3843 		return PTR_ERR(obj);
3844 
3845 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3846 	if (IS_ERR(vma)) {
3847 		err = PTR_ERR(vma);
3848 		goto err;
3849 	}
3850 
3851 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3852 	if (err)
3853 		goto err;
3854 
3855 	engine->wa_ctx.vma = vma;
3856 	return 0;
3857 
3858 err:
3859 	i915_gem_object_put(obj);
3860 	return err;
3861 }
3862 
3863 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3864 {
3865 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3866 }
3867 
3868 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3869 
3870 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3871 {
3872 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3873 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3874 					    &wa_ctx->per_ctx };
3875 	wa_bb_func_t wa_bb_fn[2];
3876 	struct page *page;
3877 	void *batch, *batch_ptr;
3878 	unsigned int i;
3879 	int ret;
3880 
3881 	if (engine->class != RENDER_CLASS)
3882 		return 0;
3883 
3884 	switch (INTEL_GEN(engine->i915)) {
3885 	case 12:
3886 	case 11:
3887 		return 0;
3888 	case 10:
3889 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3890 		wa_bb_fn[1] = NULL;
3891 		break;
3892 	case 9:
3893 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3894 		wa_bb_fn[1] = NULL;
3895 		break;
3896 	case 8:
3897 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3898 		wa_bb_fn[1] = NULL;
3899 		break;
3900 	default:
3901 		MISSING_CASE(INTEL_GEN(engine->i915));
3902 		return 0;
3903 	}
3904 
3905 	ret = lrc_setup_wa_ctx(engine);
3906 	if (ret) {
3907 		drm_dbg(&engine->i915->drm,
3908 			"Failed to setup context WA page: %d\n", ret);
3909 		return ret;
3910 	}
3911 
3912 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3913 	batch = batch_ptr = kmap_atomic(page);
3914 
3915 	/*
3916 	 * Emit the two workaround batch buffers, recording the offset from the
3917 	 * start of the workaround batch buffer object for each and their
3918 	 * respective sizes.
3919 	 */
3920 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3921 		wa_bb[i]->offset = batch_ptr - batch;
3922 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3923 						  CACHELINE_BYTES))) {
3924 			ret = -EINVAL;
3925 			break;
3926 		}
3927 		if (wa_bb_fn[i])
3928 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3929 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3930 	}
3931 
3932 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3933 
3934 	kunmap_atomic(batch);
3935 	if (ret)
3936 		lrc_destroy_wa_ctx(engine);
3937 
3938 	return ret;
3939 }
3940 
3941 static void reset_csb_pointers(struct intel_engine_cs *engine)
3942 {
3943 	struct intel_engine_execlists * const execlists = &engine->execlists;
3944 	const unsigned int reset_value = execlists->csb_size - 1;
3945 
3946 	ring_set_paused(engine, 0);
3947 
3948 	/*
3949 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3950 	 * Bludgeon them with a mmio update to be sure.
3951 	 */
3952 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3953 		     0xffff << 16 | reset_value << 8 | reset_value);
3954 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3955 
3956 	/*
3957 	 * After a reset, the HW starts writing into CSB entry [0]. We
3958 	 * therefore have to set our HEAD pointer back one entry so that
3959 	 * the *first* entry we check is entry 0. To complicate this further,
3960 	 * as we don't wait for the first interrupt after reset, we have to
3961 	 * fake the HW write to point back to the last entry so that our
3962 	 * inline comparison of our cached head position against the last HW
3963 	 * write works even before the first interrupt.
3964 	 */
3965 	execlists->csb_head = reset_value;
3966 	WRITE_ONCE(*execlists->csb_write, reset_value);
3967 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3968 
3969 	invalidate_csb_entries(&execlists->csb_status[0],
3970 			       &execlists->csb_status[reset_value]);
3971 
3972 	/* Once more for luck and our trusty paranoia */
3973 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3974 		     0xffff << 16 | reset_value << 8 | reset_value);
3975 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3976 
3977 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
3978 }
3979 
3980 static void execlists_sanitize(struct intel_engine_cs *engine)
3981 {
3982 	/*
3983 	 * Poison residual state on resume, in case the suspend didn't!
3984 	 *
3985 	 * We have to assume that across suspend/resume (or other loss
3986 	 * of control) that the contents of our pinned buffers has been
3987 	 * lost, replaced by garbage. Since this doesn't always happen,
3988 	 * let's poison such state so that we more quickly spot when
3989 	 * we falsely assume it has been preserved.
3990 	 */
3991 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3992 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
3993 
3994 	reset_csb_pointers(engine);
3995 
3996 	/*
3997 	 * The kernel_context HWSP is stored in the status_page. As above,
3998 	 * that may be lost on resume/initialisation, and so we need to
3999 	 * reset the value in the HWSP.
4000 	 */
4001 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4002 
4003 	/* And scrub the dirty cachelines for the HWSP */
4004 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4005 }
4006 
4007 static void enable_error_interrupt(struct intel_engine_cs *engine)
4008 {
4009 	u32 status;
4010 
4011 	engine->execlists.error_interrupt = 0;
4012 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4013 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4014 
4015 	status = ENGINE_READ(engine, RING_ESR);
4016 	if (unlikely(status)) {
4017 		drm_err(&engine->i915->drm,
4018 			"engine '%s' resumed still in error: %08x\n",
4019 			engine->name, status);
4020 		__intel_gt_reset(engine->gt, engine->mask);
4021 	}
4022 
4023 	/*
4024 	 * On current gen8+, we have 2 signals to play with
4025 	 *
4026 	 * - I915_ERROR_INSTUCTION (bit 0)
4027 	 *
4028 	 *    Generate an error if the command parser encounters an invalid
4029 	 *    instruction
4030 	 *
4031 	 *    This is a fatal error.
4032 	 *
4033 	 * - CP_PRIV (bit 2)
4034 	 *
4035 	 *    Generate an error on privilege violation (where the CP replaces
4036 	 *    the instruction with a no-op). This also fires for writes into
4037 	 *    read-only scratch pages.
4038 	 *
4039 	 *    This is a non-fatal error, parsing continues.
4040 	 *
4041 	 * * there are a few others defined for odd HW that we do not use
4042 	 *
4043 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4044 	 * error (as the HW is validating and suppressing the mistakes), we
4045 	 * only unmask the instruction error bit.
4046 	 */
4047 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4048 }
4049 
4050 static void enable_execlists(struct intel_engine_cs *engine)
4051 {
4052 	u32 mode;
4053 
4054 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4055 
4056 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4057 
4058 	if (INTEL_GEN(engine->i915) >= 11)
4059 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4060 	else
4061 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4062 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4063 
4064 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4065 
4066 	ENGINE_WRITE_FW(engine,
4067 			RING_HWS_PGA,
4068 			i915_ggtt_offset(engine->status_page.vma));
4069 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4070 
4071 	enable_error_interrupt(engine);
4072 
4073 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4074 }
4075 
4076 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4077 {
4078 	bool unexpected = false;
4079 
4080 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4081 		drm_dbg(&engine->i915->drm,
4082 			"STOP_RING still set in RING_MI_MODE\n");
4083 		unexpected = true;
4084 	}
4085 
4086 	return unexpected;
4087 }
4088 
4089 static int execlists_resume(struct intel_engine_cs *engine)
4090 {
4091 	intel_mocs_init_engine(engine);
4092 
4093 	intel_engine_reset_breadcrumbs(engine);
4094 
4095 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4096 		struct drm_printer p = drm_debug_printer(__func__);
4097 
4098 		intel_engine_dump(engine, &p, NULL);
4099 	}
4100 
4101 	enable_execlists(engine);
4102 
4103 	return 0;
4104 }
4105 
4106 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4107 {
4108 	struct intel_engine_execlists * const execlists = &engine->execlists;
4109 	unsigned long flags;
4110 
4111 	ENGINE_TRACE(engine, "depth<-%d\n",
4112 		     atomic_read(&execlists->tasklet.count));
4113 
4114 	/*
4115 	 * Prevent request submission to the hardware until we have
4116 	 * completed the reset in i915_gem_reset_finish(). If a request
4117 	 * is completed by one engine, it may then queue a request
4118 	 * to a second via its execlists->tasklet *just* as we are
4119 	 * calling engine->resume() and also writing the ELSP.
4120 	 * Turning off the execlists->tasklet until the reset is over
4121 	 * prevents the race.
4122 	 */
4123 	__tasklet_disable_sync_once(&execlists->tasklet);
4124 	GEM_BUG_ON(!reset_in_progress(execlists));
4125 
4126 	/* And flush any current direct submission. */
4127 	spin_lock_irqsave(&engine->active.lock, flags);
4128 	spin_unlock_irqrestore(&engine->active.lock, flags);
4129 
4130 	/*
4131 	 * We stop engines, otherwise we might get failed reset and a
4132 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
4133 	 * from system hang if batchbuffer is progressing when
4134 	 * the reset is issued, regardless of READY_TO_RESET ack.
4135 	 * Thus assume it is best to stop engines on all gens
4136 	 * where we have a gpu reset.
4137 	 *
4138 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4139 	 *
4140 	 * FIXME: Wa for more modern gens needs to be validated
4141 	 */
4142 	ring_set_paused(engine, 1);
4143 	intel_engine_stop_cs(engine);
4144 
4145 	engine->execlists.reset_ccid = active_ccid(engine);
4146 }
4147 
4148 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4149 {
4150 	int x;
4151 
4152 	x = lrc_ring_mi_mode(engine);
4153 	if (x != -1) {
4154 		regs[x + 1] &= ~STOP_RING;
4155 		regs[x + 1] |= STOP_RING << 16;
4156 	}
4157 }
4158 
4159 static void __execlists_reset_reg_state(const struct intel_context *ce,
4160 					const struct intel_engine_cs *engine)
4161 {
4162 	u32 *regs = ce->lrc_reg_state;
4163 
4164 	__reset_stop_ring(regs, engine);
4165 }
4166 
4167 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4168 {
4169 	struct intel_engine_execlists * const execlists = &engine->execlists;
4170 	struct intel_context *ce;
4171 	struct i915_request *rq;
4172 	u32 head;
4173 
4174 	mb(); /* paranoia: read the CSB pointers from after the reset */
4175 	clflush(execlists->csb_write);
4176 	mb();
4177 
4178 	process_csb(engine); /* drain preemption events */
4179 
4180 	/* Following the reset, we need to reload the CSB read/write pointers */
4181 	reset_csb_pointers(engine);
4182 
4183 	/*
4184 	 * Save the currently executing context, even if we completed
4185 	 * its request, it was still running at the time of the
4186 	 * reset and will have been clobbered.
4187 	 */
4188 	rq = active_context(engine, engine->execlists.reset_ccid);
4189 	if (!rq)
4190 		goto unwind;
4191 
4192 	ce = rq->context;
4193 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4194 
4195 	if (i915_request_completed(rq)) {
4196 		/* Idle context; tidy up the ring so we can restart afresh */
4197 		head = intel_ring_wrap(ce->ring, rq->tail);
4198 		goto out_replay;
4199 	}
4200 
4201 	/* We still have requests in-flight; the engine should be active */
4202 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4203 
4204 	/* Context has requests still in-flight; it should not be idle! */
4205 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4206 
4207 	rq = active_request(ce->timeline, rq);
4208 	head = intel_ring_wrap(ce->ring, rq->head);
4209 	GEM_BUG_ON(head == ce->ring->tail);
4210 
4211 	/*
4212 	 * If this request hasn't started yet, e.g. it is waiting on a
4213 	 * semaphore, we need to avoid skipping the request or else we
4214 	 * break the signaling chain. However, if the context is corrupt
4215 	 * the request will not restart and we will be stuck with a wedged
4216 	 * device. It is quite often the case that if we issue a reset
4217 	 * while the GPU is loading the context image, that the context
4218 	 * image becomes corrupt.
4219 	 *
4220 	 * Otherwise, if we have not started yet, the request should replay
4221 	 * perfectly and we do not need to flag the result as being erroneous.
4222 	 */
4223 	if (!i915_request_started(rq))
4224 		goto out_replay;
4225 
4226 	/*
4227 	 * If the request was innocent, we leave the request in the ELSP
4228 	 * and will try to replay it on restarting. The context image may
4229 	 * have been corrupted by the reset, in which case we may have
4230 	 * to service a new GPU hang, but more likely we can continue on
4231 	 * without impact.
4232 	 *
4233 	 * If the request was guilty, we presume the context is corrupt
4234 	 * and have to at least restore the RING register in the context
4235 	 * image back to the expected values to skip over the guilty request.
4236 	 */
4237 	__i915_request_reset(rq, stalled);
4238 
4239 	/*
4240 	 * We want a simple context + ring to execute the breadcrumb update.
4241 	 * We cannot rely on the context being intact across the GPU hang,
4242 	 * so clear it and rebuild just what we need for the breadcrumb.
4243 	 * All pending requests for this context will be zapped, and any
4244 	 * future request will be after userspace has had the opportunity
4245 	 * to recreate its own state.
4246 	 */
4247 out_replay:
4248 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4249 		     head, ce->ring->tail);
4250 	__execlists_reset_reg_state(ce, engine);
4251 	__execlists_update_reg_state(ce, engine, head);
4252 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4253 
4254 unwind:
4255 	/* Push back any incomplete requests for replay after the reset. */
4256 	cancel_port_requests(execlists);
4257 	__unwind_incomplete_requests(engine);
4258 }
4259 
4260 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4261 {
4262 	unsigned long flags;
4263 
4264 	ENGINE_TRACE(engine, "\n");
4265 
4266 	spin_lock_irqsave(&engine->active.lock, flags);
4267 
4268 	__execlists_reset(engine, stalled);
4269 
4270 	spin_unlock_irqrestore(&engine->active.lock, flags);
4271 }
4272 
4273 static void nop_submission_tasklet(unsigned long data)
4274 {
4275 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4276 
4277 	/* The driver is wedged; don't process any more events. */
4278 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4279 }
4280 
4281 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4282 {
4283 	struct intel_engine_execlists * const execlists = &engine->execlists;
4284 	struct i915_request *rq, *rn;
4285 	struct rb_node *rb;
4286 	unsigned long flags;
4287 
4288 	ENGINE_TRACE(engine, "\n");
4289 
4290 	/*
4291 	 * Before we call engine->cancel_requests(), we should have exclusive
4292 	 * access to the submission state. This is arranged for us by the
4293 	 * caller disabling the interrupt generation, the tasklet and other
4294 	 * threads that may then access the same state, giving us a free hand
4295 	 * to reset state. However, we still need to let lockdep be aware that
4296 	 * we know this state may be accessed in hardirq context, so we
4297 	 * disable the irq around this manipulation and we want to keep
4298 	 * the spinlock focused on its duties and not accidentally conflate
4299 	 * coverage to the submission's irq state. (Similarly, although we
4300 	 * shouldn't need to disable irq around the manipulation of the
4301 	 * submission's irq state, we also wish to remind ourselves that
4302 	 * it is irq state.)
4303 	 */
4304 	spin_lock_irqsave(&engine->active.lock, flags);
4305 
4306 	__execlists_reset(engine, true);
4307 
4308 	/* Mark all executing requests as skipped. */
4309 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4310 		mark_eio(rq);
4311 
4312 	/* Flush the queued requests to the timeline list (for retiring). */
4313 	while ((rb = rb_first_cached(&execlists->queue))) {
4314 		struct i915_priolist *p = to_priolist(rb);
4315 		int i;
4316 
4317 		priolist_for_each_request_consume(rq, rn, p, i) {
4318 			mark_eio(rq);
4319 			__i915_request_submit(rq);
4320 		}
4321 
4322 		rb_erase_cached(&p->node, &execlists->queue);
4323 		i915_priolist_free(p);
4324 	}
4325 
4326 	/* On-hold requests will be flushed to timeline upon their release */
4327 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4328 		mark_eio(rq);
4329 
4330 	/* Cancel all attached virtual engines */
4331 	while ((rb = rb_first_cached(&execlists->virtual))) {
4332 		struct virtual_engine *ve =
4333 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4334 
4335 		rb_erase_cached(rb, &execlists->virtual);
4336 		RB_CLEAR_NODE(rb);
4337 
4338 		spin_lock(&ve->base.active.lock);
4339 		rq = fetch_and_zero(&ve->request);
4340 		if (rq) {
4341 			mark_eio(rq);
4342 
4343 			rq->engine = engine;
4344 			__i915_request_submit(rq);
4345 			i915_request_put(rq);
4346 
4347 			ve->base.execlists.queue_priority_hint = INT_MIN;
4348 		}
4349 		spin_unlock(&ve->base.active.lock);
4350 	}
4351 
4352 	/* Remaining _unready_ requests will be nop'ed when submitted */
4353 
4354 	execlists->queue_priority_hint = INT_MIN;
4355 	execlists->queue = RB_ROOT_CACHED;
4356 
4357 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4358 	execlists->tasklet.func = nop_submission_tasklet;
4359 
4360 	spin_unlock_irqrestore(&engine->active.lock, flags);
4361 }
4362 
4363 static void execlists_reset_finish(struct intel_engine_cs *engine)
4364 {
4365 	struct intel_engine_execlists * const execlists = &engine->execlists;
4366 
4367 	/*
4368 	 * After a GPU reset, we may have requests to replay. Do so now while
4369 	 * we still have the forcewake to be sure that the GPU is not allowed
4370 	 * to sleep before we restart and reload a context.
4371 	 */
4372 	GEM_BUG_ON(!reset_in_progress(execlists));
4373 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4374 		execlists->tasklet.func(execlists->tasklet.data);
4375 
4376 	if (__tasklet_enable(&execlists->tasklet))
4377 		/* And kick in case we missed a new request submission. */
4378 		tasklet_hi_schedule(&execlists->tasklet);
4379 	ENGINE_TRACE(engine, "depth->%d\n",
4380 		     atomic_read(&execlists->tasklet.count));
4381 }
4382 
4383 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4384 				    u64 offset, u32 len,
4385 				    const unsigned int flags)
4386 {
4387 	u32 *cs;
4388 
4389 	cs = intel_ring_begin(rq, 4);
4390 	if (IS_ERR(cs))
4391 		return PTR_ERR(cs);
4392 
4393 	/*
4394 	 * WaDisableCtxRestoreArbitration:bdw,chv
4395 	 *
4396 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4397 	 * particular all the gen that do not need the w/a at all!), if we
4398 	 * took care to make sure that on every switch into this context
4399 	 * (both ordinary and for preemption) that arbitrartion was enabled
4400 	 * we would be fine.  However, for gen8 there is another w/a that
4401 	 * requires us to not preempt inside GPGPU execution, so we keep
4402 	 * arbitration disabled for gen8 batches. Arbitration will be
4403 	 * re-enabled before we close the request
4404 	 * (engine->emit_fini_breadcrumb).
4405 	 */
4406 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4407 
4408 	/* FIXME(BDW+): Address space and security selectors. */
4409 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4410 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4411 	*cs++ = lower_32_bits(offset);
4412 	*cs++ = upper_32_bits(offset);
4413 
4414 	intel_ring_advance(rq, cs);
4415 
4416 	return 0;
4417 }
4418 
4419 static int gen8_emit_bb_start(struct i915_request *rq,
4420 			      u64 offset, u32 len,
4421 			      const unsigned int flags)
4422 {
4423 	u32 *cs;
4424 
4425 	cs = intel_ring_begin(rq, 6);
4426 	if (IS_ERR(cs))
4427 		return PTR_ERR(cs);
4428 
4429 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4430 
4431 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4432 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4433 	*cs++ = lower_32_bits(offset);
4434 	*cs++ = upper_32_bits(offset);
4435 
4436 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4437 	*cs++ = MI_NOOP;
4438 
4439 	intel_ring_advance(rq, cs);
4440 
4441 	return 0;
4442 }
4443 
4444 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4445 {
4446 	ENGINE_WRITE(engine, RING_IMR,
4447 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4448 	ENGINE_POSTING_READ(engine, RING_IMR);
4449 }
4450 
4451 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4452 {
4453 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4454 }
4455 
4456 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4457 {
4458 	u32 cmd, *cs;
4459 
4460 	cs = intel_ring_begin(request, 4);
4461 	if (IS_ERR(cs))
4462 		return PTR_ERR(cs);
4463 
4464 	cmd = MI_FLUSH_DW + 1;
4465 
4466 	/* We always require a command barrier so that subsequent
4467 	 * commands, such as breadcrumb interrupts, are strictly ordered
4468 	 * wrt the contents of the write cache being flushed to memory
4469 	 * (and thus being coherent from the CPU).
4470 	 */
4471 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4472 
4473 	if (mode & EMIT_INVALIDATE) {
4474 		cmd |= MI_INVALIDATE_TLB;
4475 		if (request->engine->class == VIDEO_DECODE_CLASS)
4476 			cmd |= MI_INVALIDATE_BSD;
4477 	}
4478 
4479 	*cs++ = cmd;
4480 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4481 	*cs++ = 0; /* upper addr */
4482 	*cs++ = 0; /* value */
4483 	intel_ring_advance(request, cs);
4484 
4485 	return 0;
4486 }
4487 
4488 static int gen8_emit_flush_render(struct i915_request *request,
4489 				  u32 mode)
4490 {
4491 	bool vf_flush_wa = false, dc_flush_wa = false;
4492 	u32 *cs, flags = 0;
4493 	int len;
4494 
4495 	flags |= PIPE_CONTROL_CS_STALL;
4496 
4497 	if (mode & EMIT_FLUSH) {
4498 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4499 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4500 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4501 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4502 	}
4503 
4504 	if (mode & EMIT_INVALIDATE) {
4505 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4506 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4507 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4508 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4509 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4510 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4511 		flags |= PIPE_CONTROL_QW_WRITE;
4512 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4513 
4514 		/*
4515 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4516 		 * pipe control.
4517 		 */
4518 		if (IS_GEN(request->i915, 9))
4519 			vf_flush_wa = true;
4520 
4521 		/* WaForGAMHang:kbl */
4522 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4523 			dc_flush_wa = true;
4524 	}
4525 
4526 	len = 6;
4527 
4528 	if (vf_flush_wa)
4529 		len += 6;
4530 
4531 	if (dc_flush_wa)
4532 		len += 12;
4533 
4534 	cs = intel_ring_begin(request, len);
4535 	if (IS_ERR(cs))
4536 		return PTR_ERR(cs);
4537 
4538 	if (vf_flush_wa)
4539 		cs = gen8_emit_pipe_control(cs, 0, 0);
4540 
4541 	if (dc_flush_wa)
4542 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4543 					    0);
4544 
4545 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4546 
4547 	if (dc_flush_wa)
4548 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4549 
4550 	intel_ring_advance(request, cs);
4551 
4552 	return 0;
4553 }
4554 
4555 static int gen11_emit_flush_render(struct i915_request *request,
4556 				   u32 mode)
4557 {
4558 	if (mode & EMIT_FLUSH) {
4559 		u32 *cs;
4560 		u32 flags = 0;
4561 
4562 		flags |= PIPE_CONTROL_CS_STALL;
4563 
4564 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4565 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4566 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4567 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4568 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4569 		flags |= PIPE_CONTROL_QW_WRITE;
4570 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4571 
4572 		cs = intel_ring_begin(request, 6);
4573 		if (IS_ERR(cs))
4574 			return PTR_ERR(cs);
4575 
4576 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4577 		intel_ring_advance(request, cs);
4578 	}
4579 
4580 	if (mode & EMIT_INVALIDATE) {
4581 		u32 *cs;
4582 		u32 flags = 0;
4583 
4584 		flags |= PIPE_CONTROL_CS_STALL;
4585 
4586 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4587 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4588 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4589 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4590 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4591 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4592 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4593 		flags |= PIPE_CONTROL_QW_WRITE;
4594 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4595 
4596 		cs = intel_ring_begin(request, 6);
4597 		if (IS_ERR(cs))
4598 			return PTR_ERR(cs);
4599 
4600 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4601 		intel_ring_advance(request, cs);
4602 	}
4603 
4604 	return 0;
4605 }
4606 
4607 static u32 preparser_disable(bool state)
4608 {
4609 	return MI_ARB_CHECK | 1 << 8 | state;
4610 }
4611 
4612 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4613 {
4614 	static const i915_reg_t vd[] = {
4615 		GEN12_VD0_AUX_NV,
4616 		GEN12_VD1_AUX_NV,
4617 		GEN12_VD2_AUX_NV,
4618 		GEN12_VD3_AUX_NV,
4619 	};
4620 
4621 	static const i915_reg_t ve[] = {
4622 		GEN12_VE0_AUX_NV,
4623 		GEN12_VE1_AUX_NV,
4624 	};
4625 
4626 	if (engine->class == VIDEO_DECODE_CLASS)
4627 		return vd[engine->instance];
4628 
4629 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4630 		return ve[engine->instance];
4631 
4632 	GEM_BUG_ON("unknown aux_inv_reg\n");
4633 
4634 	return INVALID_MMIO_REG;
4635 }
4636 
4637 static u32 *
4638 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4639 {
4640 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4641 	*cs++ = i915_mmio_reg_offset(inv_reg);
4642 	*cs++ = AUX_INV;
4643 	*cs++ = MI_NOOP;
4644 
4645 	return cs;
4646 }
4647 
4648 static int gen12_emit_flush_render(struct i915_request *request,
4649 				   u32 mode)
4650 {
4651 	if (mode & EMIT_FLUSH) {
4652 		u32 flags = 0;
4653 		u32 *cs;
4654 
4655 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4656 		flags |= PIPE_CONTROL_FLUSH_L3;
4657 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4658 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4659 		/* Wa_1409600907:tgl */
4660 		flags |= PIPE_CONTROL_DEPTH_STALL;
4661 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4662 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4663 
4664 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4665 		flags |= PIPE_CONTROL_QW_WRITE;
4666 
4667 		flags |= PIPE_CONTROL_CS_STALL;
4668 
4669 		cs = intel_ring_begin(request, 6);
4670 		if (IS_ERR(cs))
4671 			return PTR_ERR(cs);
4672 
4673 		cs = gen12_emit_pipe_control(cs,
4674 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4675 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4676 		intel_ring_advance(request, cs);
4677 	}
4678 
4679 	if (mode & EMIT_INVALIDATE) {
4680 		u32 flags = 0;
4681 		u32 *cs;
4682 
4683 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4684 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4685 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4686 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4687 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4688 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4689 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4690 
4691 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4692 		flags |= PIPE_CONTROL_QW_WRITE;
4693 
4694 		flags |= PIPE_CONTROL_CS_STALL;
4695 
4696 		cs = intel_ring_begin(request, 8 + 4);
4697 		if (IS_ERR(cs))
4698 			return PTR_ERR(cs);
4699 
4700 		/*
4701 		 * Prevent the pre-parser from skipping past the TLB
4702 		 * invalidate and loading a stale page for the batch
4703 		 * buffer / request payload.
4704 		 */
4705 		*cs++ = preparser_disable(true);
4706 
4707 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4708 
4709 		/* hsdes: 1809175790 */
4710 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4711 
4712 		*cs++ = preparser_disable(false);
4713 		intel_ring_advance(request, cs);
4714 	}
4715 
4716 	return 0;
4717 }
4718 
4719 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4720 {
4721 	intel_engine_mask_t aux_inv = 0;
4722 	u32 cmd, *cs;
4723 
4724 	if (mode & EMIT_INVALIDATE)
4725 		aux_inv = request->engine->mask & ~BIT(BCS0);
4726 
4727 	cs = intel_ring_begin(request,
4728 			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4729 	if (IS_ERR(cs))
4730 		return PTR_ERR(cs);
4731 
4732 	cmd = MI_FLUSH_DW + 1;
4733 
4734 	/* We always require a command barrier so that subsequent
4735 	 * commands, such as breadcrumb interrupts, are strictly ordered
4736 	 * wrt the contents of the write cache being flushed to memory
4737 	 * (and thus being coherent from the CPU).
4738 	 */
4739 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4740 
4741 	if (mode & EMIT_INVALIDATE) {
4742 		cmd |= MI_INVALIDATE_TLB;
4743 		if (request->engine->class == VIDEO_DECODE_CLASS)
4744 			cmd |= MI_INVALIDATE_BSD;
4745 	}
4746 
4747 	*cs++ = cmd;
4748 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4749 	*cs++ = 0; /* upper addr */
4750 	*cs++ = 0; /* value */
4751 
4752 	if (aux_inv) { /* hsdes: 1809175790 */
4753 		struct intel_engine_cs *engine;
4754 		unsigned int tmp;
4755 
4756 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4757 		for_each_engine_masked(engine, request->engine->gt,
4758 				       aux_inv, tmp) {
4759 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4760 			*cs++ = AUX_INV;
4761 		}
4762 		*cs++ = MI_NOOP;
4763 	}
4764 	intel_ring_advance(request, cs);
4765 
4766 	return 0;
4767 }
4768 
4769 static void assert_request_valid(struct i915_request *rq)
4770 {
4771 	struct intel_ring *ring __maybe_unused = rq->ring;
4772 
4773 	/* Can we unwind this request without appearing to go forwards? */
4774 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4775 }
4776 
4777 /*
4778  * Reserve space for 2 NOOPs at the end of each request to be
4779  * used as a workaround for not being allowed to do lite
4780  * restore with HEAD==TAIL (WaIdleLiteRestore).
4781  */
4782 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4783 {
4784 	/* Ensure there's always at least one preemption point per-request. */
4785 	*cs++ = MI_ARB_CHECK;
4786 	*cs++ = MI_NOOP;
4787 	request->wa_tail = intel_ring_offset(request, cs);
4788 
4789 	/* Check that entire request is less than half the ring */
4790 	assert_request_valid(request);
4791 
4792 	return cs;
4793 }
4794 
4795 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4796 {
4797 	*cs++ = MI_SEMAPHORE_WAIT |
4798 		MI_SEMAPHORE_GLOBAL_GTT |
4799 		MI_SEMAPHORE_POLL |
4800 		MI_SEMAPHORE_SAD_EQ_SDD;
4801 	*cs++ = 0;
4802 	*cs++ = intel_hws_preempt_address(request->engine);
4803 	*cs++ = 0;
4804 
4805 	return cs;
4806 }
4807 
4808 static __always_inline u32*
4809 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4810 {
4811 	*cs++ = MI_USER_INTERRUPT;
4812 
4813 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4814 	if (intel_engine_has_semaphores(request->engine))
4815 		cs = emit_preempt_busywait(request, cs);
4816 
4817 	request->tail = intel_ring_offset(request, cs);
4818 	assert_ring_tail_valid(request->ring, request->tail);
4819 
4820 	return gen8_emit_wa_tail(request, cs);
4821 }
4822 
4823 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4824 {
4825 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4826 
4827 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4828 }
4829 
4830 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4831 {
4832 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4833 }
4834 
4835 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4836 {
4837 	cs = gen8_emit_pipe_control(cs,
4838 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4839 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4840 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4841 				    0);
4842 
4843 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4844 	cs = gen8_emit_ggtt_write_rcs(cs,
4845 				      request->fence.seqno,
4846 				      i915_request_active_timeline(request)->hwsp_offset,
4847 				      PIPE_CONTROL_FLUSH_ENABLE |
4848 				      PIPE_CONTROL_CS_STALL);
4849 
4850 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4851 }
4852 
4853 static u32 *
4854 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4855 {
4856 	cs = gen8_emit_ggtt_write_rcs(cs,
4857 				      request->fence.seqno,
4858 				      i915_request_active_timeline(request)->hwsp_offset,
4859 				      PIPE_CONTROL_CS_STALL |
4860 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4861 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4862 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4863 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4864 				      PIPE_CONTROL_FLUSH_ENABLE);
4865 
4866 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4867 }
4868 
4869 /*
4870  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4871  * flush and will continue pre-fetching the instructions after it before the
4872  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4873  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4874  * of the next request before the memory has been flushed, we're guaranteed that
4875  * we won't access the batch itself too early.
4876  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4877  * so, if the current request is modifying an instruction in the next request on
4878  * the same intel_context, we might pre-fetch and then execute the pre-update
4879  * instruction. To avoid this, the users of self-modifying code should either
4880  * disable the parser around the code emitting the memory writes, via a new flag
4881  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4882  * the in-kernel use-cases we've opted to use a separate context, see
4883  * reloc_gpu() as an example.
4884  * All the above applies only to the instructions themselves. Non-inline data
4885  * used by the instructions is not pre-fetched.
4886  */
4887 
4888 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4889 {
4890 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4891 		MI_SEMAPHORE_GLOBAL_GTT |
4892 		MI_SEMAPHORE_POLL |
4893 		MI_SEMAPHORE_SAD_EQ_SDD;
4894 	*cs++ = 0;
4895 	*cs++ = intel_hws_preempt_address(request->engine);
4896 	*cs++ = 0;
4897 	*cs++ = 0;
4898 	*cs++ = MI_NOOP;
4899 
4900 	return cs;
4901 }
4902 
4903 static __always_inline u32*
4904 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4905 {
4906 	*cs++ = MI_USER_INTERRUPT;
4907 
4908 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4909 	if (intel_engine_has_semaphores(request->engine))
4910 		cs = gen12_emit_preempt_busywait(request, cs);
4911 
4912 	request->tail = intel_ring_offset(request, cs);
4913 	assert_ring_tail_valid(request->ring, request->tail);
4914 
4915 	return gen8_emit_wa_tail(request, cs);
4916 }
4917 
4918 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4919 {
4920 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4921 }
4922 
4923 static u32 *
4924 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4925 {
4926 	cs = gen12_emit_ggtt_write_rcs(cs,
4927 				       request->fence.seqno,
4928 				       i915_request_active_timeline(request)->hwsp_offset,
4929 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4930 				       PIPE_CONTROL_CS_STALL |
4931 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4932 				       PIPE_CONTROL_FLUSH_L3 |
4933 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4934 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4935 				       /* Wa_1409600907:tgl */
4936 				       PIPE_CONTROL_DEPTH_STALL |
4937 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4938 				       PIPE_CONTROL_FLUSH_ENABLE);
4939 
4940 	return gen12_emit_fini_breadcrumb_tail(request, cs);
4941 }
4942 
4943 static void execlists_park(struct intel_engine_cs *engine)
4944 {
4945 	cancel_timer(&engine->execlists.timer);
4946 	cancel_timer(&engine->execlists.preempt);
4947 }
4948 
4949 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4950 {
4951 	engine->submit_request = execlists_submit_request;
4952 	engine->schedule = i915_schedule;
4953 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4954 
4955 	engine->reset.prepare = execlists_reset_prepare;
4956 	engine->reset.rewind = execlists_reset_rewind;
4957 	engine->reset.cancel = execlists_reset_cancel;
4958 	engine->reset.finish = execlists_reset_finish;
4959 
4960 	engine->park = execlists_park;
4961 	engine->unpark = NULL;
4962 
4963 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4964 	if (!intel_vgpu_active(engine->i915)) {
4965 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4966 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4967 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4968 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4969 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4970 		}
4971 	}
4972 
4973 	if (INTEL_GEN(engine->i915) >= 12)
4974 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4975 
4976 	if (intel_engine_has_preemption(engine))
4977 		engine->emit_bb_start = gen8_emit_bb_start;
4978 	else
4979 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4980 }
4981 
4982 static void execlists_shutdown(struct intel_engine_cs *engine)
4983 {
4984 	/* Synchronise with residual timers and any softirq they raise */
4985 	del_timer_sync(&engine->execlists.timer);
4986 	del_timer_sync(&engine->execlists.preempt);
4987 	tasklet_kill(&engine->execlists.tasklet);
4988 }
4989 
4990 static void execlists_release(struct intel_engine_cs *engine)
4991 {
4992 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
4993 
4994 	execlists_shutdown(engine);
4995 
4996 	intel_engine_cleanup_common(engine);
4997 	lrc_destroy_wa_ctx(engine);
4998 }
4999 
5000 static void
5001 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5002 {
5003 	/* Default vfuncs which can be overriden by each engine. */
5004 
5005 	engine->resume = execlists_resume;
5006 
5007 	engine->cops = &execlists_context_ops;
5008 	engine->request_alloc = execlists_request_alloc;
5009 
5010 	engine->emit_flush = gen8_emit_flush;
5011 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5012 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5013 	if (INTEL_GEN(engine->i915) >= 12) {
5014 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5015 		engine->emit_flush = gen12_emit_flush;
5016 	}
5017 	engine->set_default_submission = intel_execlists_set_default_submission;
5018 
5019 	if (INTEL_GEN(engine->i915) < 11) {
5020 		engine->irq_enable = gen8_logical_ring_enable_irq;
5021 		engine->irq_disable = gen8_logical_ring_disable_irq;
5022 	} else {
5023 		/*
5024 		 * TODO: On Gen11 interrupt masks need to be clear
5025 		 * to allow C6 entry. Keep interrupts enabled at
5026 		 * and take the hit of generating extra interrupts
5027 		 * until a more refined solution exists.
5028 		 */
5029 	}
5030 }
5031 
5032 static inline void
5033 logical_ring_default_irqs(struct intel_engine_cs *engine)
5034 {
5035 	unsigned int shift = 0;
5036 
5037 	if (INTEL_GEN(engine->i915) < 11) {
5038 		const u8 irq_shifts[] = {
5039 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5040 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5041 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5042 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5043 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5044 		};
5045 
5046 		shift = irq_shifts[engine->id];
5047 	}
5048 
5049 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5050 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5051 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5052 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5053 }
5054 
5055 static void rcs_submission_override(struct intel_engine_cs *engine)
5056 {
5057 	switch (INTEL_GEN(engine->i915)) {
5058 	case 12:
5059 		engine->emit_flush = gen12_emit_flush_render;
5060 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5061 		break;
5062 	case 11:
5063 		engine->emit_flush = gen11_emit_flush_render;
5064 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5065 		break;
5066 	default:
5067 		engine->emit_flush = gen8_emit_flush_render;
5068 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5069 		break;
5070 	}
5071 }
5072 
5073 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5074 {
5075 	struct intel_engine_execlists * const execlists = &engine->execlists;
5076 	struct drm_i915_private *i915 = engine->i915;
5077 	struct intel_uncore *uncore = engine->uncore;
5078 	u32 base = engine->mmio_base;
5079 
5080 	tasklet_init(&engine->execlists.tasklet,
5081 		     execlists_submission_tasklet, (unsigned long)engine);
5082 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5083 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5084 
5085 	logical_ring_default_vfuncs(engine);
5086 	logical_ring_default_irqs(engine);
5087 
5088 	if (engine->class == RENDER_CLASS)
5089 		rcs_submission_override(engine);
5090 
5091 	if (intel_init_workaround_bb(engine))
5092 		/*
5093 		 * We continue even if we fail to initialize WA batch
5094 		 * because we only expect rare glitches but nothing
5095 		 * critical to prevent us from using GPU
5096 		 */
5097 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5098 
5099 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5100 		execlists->submit_reg = uncore->regs +
5101 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5102 		execlists->ctrl_reg = uncore->regs +
5103 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5104 	} else {
5105 		execlists->submit_reg = uncore->regs +
5106 			i915_mmio_reg_offset(RING_ELSP(base));
5107 	}
5108 
5109 	execlists->csb_status =
5110 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5111 
5112 	execlists->csb_write =
5113 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5114 
5115 	if (INTEL_GEN(i915) < 11)
5116 		execlists->csb_size = GEN8_CSB_ENTRIES;
5117 	else
5118 		execlists->csb_size = GEN11_CSB_ENTRIES;
5119 
5120 	if (INTEL_GEN(engine->i915) >= 11) {
5121 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5122 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5123 	}
5124 
5125 	/* Finally, take ownership and responsibility for cleanup! */
5126 	engine->sanitize = execlists_sanitize;
5127 	engine->release = execlists_release;
5128 
5129 	return 0;
5130 }
5131 
5132 static void init_common_reg_state(u32 * const regs,
5133 				  const struct intel_engine_cs *engine,
5134 				  const struct intel_ring *ring,
5135 				  bool inhibit)
5136 {
5137 	u32 ctl;
5138 
5139 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5140 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5141 	if (inhibit)
5142 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5143 	if (INTEL_GEN(engine->i915) < 11)
5144 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5145 					   CTX_CTRL_RS_CTX_ENABLE);
5146 	regs[CTX_CONTEXT_CONTROL] = ctl;
5147 
5148 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5149 	regs[CTX_TIMESTAMP] = 0;
5150 }
5151 
5152 static void init_wa_bb_reg_state(u32 * const regs,
5153 				 const struct intel_engine_cs *engine)
5154 {
5155 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5156 
5157 	if (wa_ctx->per_ctx.size) {
5158 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5159 
5160 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5161 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5162 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5163 	}
5164 
5165 	if (wa_ctx->indirect_ctx.size) {
5166 		lrc_ring_setup_indirect_ctx(regs, engine,
5167 					    i915_ggtt_offset(wa_ctx->vma) +
5168 					    wa_ctx->indirect_ctx.offset,
5169 					    wa_ctx->indirect_ctx.size);
5170 	}
5171 }
5172 
5173 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5174 {
5175 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5176 		/* 64b PPGTT (48bit canonical)
5177 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
5178 		 * other PDP Descriptors are ignored.
5179 		 */
5180 		ASSIGN_CTX_PML4(ppgtt, regs);
5181 	} else {
5182 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5183 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5184 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5185 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5186 	}
5187 }
5188 
5189 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5190 {
5191 	if (i915_is_ggtt(vm))
5192 		return i915_vm_to_ggtt(vm)->alias;
5193 	else
5194 		return i915_vm_to_ppgtt(vm);
5195 }
5196 
5197 static void execlists_init_reg_state(u32 *regs,
5198 				     const struct intel_context *ce,
5199 				     const struct intel_engine_cs *engine,
5200 				     const struct intel_ring *ring,
5201 				     bool inhibit)
5202 {
5203 	/*
5204 	 * A context is actually a big batch buffer with several
5205 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5206 	 * values we are setting here are only for the first context restore:
5207 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5208 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5209 	 * we are not initializing here).
5210 	 *
5211 	 * Must keep consistent with virtual_update_register_offsets().
5212 	 */
5213 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5214 
5215 	init_common_reg_state(regs, engine, ring, inhibit);
5216 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5217 
5218 	init_wa_bb_reg_state(regs, engine);
5219 
5220 	__reset_stop_ring(regs, engine);
5221 }
5222 
5223 static int
5224 populate_lr_context(struct intel_context *ce,
5225 		    struct drm_i915_gem_object *ctx_obj,
5226 		    struct intel_engine_cs *engine,
5227 		    struct intel_ring *ring)
5228 {
5229 	bool inhibit = true;
5230 	void *vaddr;
5231 
5232 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5233 	if (IS_ERR(vaddr)) {
5234 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5235 		return PTR_ERR(vaddr);
5236 	}
5237 
5238 	set_redzone(vaddr, engine);
5239 
5240 	if (engine->default_state) {
5241 		shmem_read(engine->default_state, 0,
5242 			   vaddr, engine->context_size);
5243 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5244 		inhibit = false;
5245 	}
5246 
5247 	/* Clear the ppHWSP (inc. per-context counters) */
5248 	memset(vaddr, 0, PAGE_SIZE);
5249 
5250 	/*
5251 	 * The second page of the context object contains some registers which
5252 	 * must be set up prior to the first execution.
5253 	 */
5254 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5255 				 ce, engine, ring, inhibit);
5256 
5257 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5258 	i915_gem_object_unpin_map(ctx_obj);
5259 	return 0;
5260 }
5261 
5262 static int __execlists_context_alloc(struct intel_context *ce,
5263 				     struct intel_engine_cs *engine)
5264 {
5265 	struct drm_i915_gem_object *ctx_obj;
5266 	struct intel_ring *ring;
5267 	struct i915_vma *vma;
5268 	u32 context_size;
5269 	int ret;
5270 
5271 	GEM_BUG_ON(ce->state);
5272 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5273 
5274 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5275 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5276 
5277 	if (INTEL_GEN(engine->i915) == 12) {
5278 		ce->wa_bb_page = context_size / PAGE_SIZE;
5279 		context_size += PAGE_SIZE;
5280 	}
5281 
5282 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5283 	if (IS_ERR(ctx_obj))
5284 		return PTR_ERR(ctx_obj);
5285 
5286 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5287 	if (IS_ERR(vma)) {
5288 		ret = PTR_ERR(vma);
5289 		goto error_deref_obj;
5290 	}
5291 
5292 	if (!ce->timeline) {
5293 		struct intel_timeline *tl;
5294 		struct i915_vma *hwsp;
5295 
5296 		/*
5297 		 * Use the static global HWSP for the kernel context, and
5298 		 * a dynamically allocated cacheline for everyone else.
5299 		 */
5300 		hwsp = NULL;
5301 		if (unlikely(intel_context_is_barrier(ce)))
5302 			hwsp = engine->status_page.vma;
5303 
5304 		tl = intel_timeline_create(engine->gt, hwsp);
5305 		if (IS_ERR(tl)) {
5306 			ret = PTR_ERR(tl);
5307 			goto error_deref_obj;
5308 		}
5309 
5310 		ce->timeline = tl;
5311 	}
5312 
5313 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5314 	if (IS_ERR(ring)) {
5315 		ret = PTR_ERR(ring);
5316 		goto error_deref_obj;
5317 	}
5318 
5319 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5320 	if (ret) {
5321 		drm_dbg(&engine->i915->drm,
5322 			"Failed to populate LRC: %d\n", ret);
5323 		goto error_ring_free;
5324 	}
5325 
5326 	ce->ring = ring;
5327 	ce->state = vma;
5328 
5329 	return 0;
5330 
5331 error_ring_free:
5332 	intel_ring_put(ring);
5333 error_deref_obj:
5334 	i915_gem_object_put(ctx_obj);
5335 	return ret;
5336 }
5337 
5338 static struct list_head *virtual_queue(struct virtual_engine *ve)
5339 {
5340 	return &ve->base.execlists.default_priolist.requests[0];
5341 }
5342 
5343 static void virtual_context_destroy(struct kref *kref)
5344 {
5345 	struct virtual_engine *ve =
5346 		container_of(kref, typeof(*ve), context.ref);
5347 	unsigned int n;
5348 
5349 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5350 	GEM_BUG_ON(ve->request);
5351 	GEM_BUG_ON(ve->context.inflight);
5352 
5353 	for (n = 0; n < ve->num_siblings; n++) {
5354 		struct intel_engine_cs *sibling = ve->siblings[n];
5355 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5356 		unsigned long flags;
5357 
5358 		if (RB_EMPTY_NODE(node))
5359 			continue;
5360 
5361 		spin_lock_irqsave(&sibling->active.lock, flags);
5362 
5363 		/* Detachment is lazily performed in the execlists tasklet */
5364 		if (!RB_EMPTY_NODE(node))
5365 			rb_erase_cached(node, &sibling->execlists.virtual);
5366 
5367 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5368 	}
5369 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5370 
5371 	if (ve->context.state)
5372 		__execlists_context_fini(&ve->context);
5373 	intel_context_fini(&ve->context);
5374 
5375 	intel_engine_free_request_pool(&ve->base);
5376 
5377 	kfree(ve->bonds);
5378 	kfree(ve);
5379 }
5380 
5381 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5382 {
5383 	int swp;
5384 
5385 	/*
5386 	 * Pick a random sibling on starting to help spread the load around.
5387 	 *
5388 	 * New contexts are typically created with exactly the same order
5389 	 * of siblings, and often started in batches. Due to the way we iterate
5390 	 * the array of sibling when submitting requests, sibling[0] is
5391 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5392 	 * randomised across the system, we also help spread the load by the
5393 	 * first engine we inspect being different each time.
5394 	 *
5395 	 * NB This does not force us to execute on this engine, it will just
5396 	 * typically be the first we inspect for submission.
5397 	 */
5398 	swp = prandom_u32_max(ve->num_siblings);
5399 	if (!swp)
5400 		return;
5401 
5402 	swap(ve->siblings[swp], ve->siblings[0]);
5403 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
5404 		virtual_update_register_offsets(ve->context.lrc_reg_state,
5405 						ve->siblings[0]);
5406 }
5407 
5408 static int virtual_context_alloc(struct intel_context *ce)
5409 {
5410 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5411 
5412 	return __execlists_context_alloc(ce, ve->siblings[0]);
5413 }
5414 
5415 static int virtual_context_pin(struct intel_context *ce)
5416 {
5417 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5418 	int err;
5419 
5420 	/* Note: we must use a real engine class for setting up reg state */
5421 	err = __execlists_context_pin(ce, ve->siblings[0]);
5422 	if (err)
5423 		return err;
5424 
5425 	virtual_engine_initial_hint(ve);
5426 	return 0;
5427 }
5428 
5429 static void virtual_context_enter(struct intel_context *ce)
5430 {
5431 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5432 	unsigned int n;
5433 
5434 	for (n = 0; n < ve->num_siblings; n++)
5435 		intel_engine_pm_get(ve->siblings[n]);
5436 
5437 	intel_timeline_enter(ce->timeline);
5438 }
5439 
5440 static void virtual_context_exit(struct intel_context *ce)
5441 {
5442 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5443 	unsigned int n;
5444 
5445 	intel_timeline_exit(ce->timeline);
5446 
5447 	for (n = 0; n < ve->num_siblings; n++)
5448 		intel_engine_pm_put(ve->siblings[n]);
5449 }
5450 
5451 static const struct intel_context_ops virtual_context_ops = {
5452 	.alloc = virtual_context_alloc,
5453 
5454 	.pin = virtual_context_pin,
5455 	.unpin = execlists_context_unpin,
5456 
5457 	.enter = virtual_context_enter,
5458 	.exit = virtual_context_exit,
5459 
5460 	.destroy = virtual_context_destroy,
5461 };
5462 
5463 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5464 {
5465 	struct i915_request *rq;
5466 	intel_engine_mask_t mask;
5467 
5468 	rq = READ_ONCE(ve->request);
5469 	if (!rq)
5470 		return 0;
5471 
5472 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5473 	mask = rq->execution_mask;
5474 	if (unlikely(!mask)) {
5475 		/* Invalid selection, submit to a random engine in error */
5476 		i915_request_set_error_once(rq, -ENODEV);
5477 		mask = ve->siblings[0]->mask;
5478 	}
5479 
5480 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5481 		     rq->fence.context, rq->fence.seqno,
5482 		     mask, ve->base.execlists.queue_priority_hint);
5483 
5484 	return mask;
5485 }
5486 
5487 static void virtual_submission_tasklet(unsigned long data)
5488 {
5489 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5490 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5491 	intel_engine_mask_t mask;
5492 	unsigned int n;
5493 
5494 	rcu_read_lock();
5495 	mask = virtual_submission_mask(ve);
5496 	rcu_read_unlock();
5497 	if (unlikely(!mask))
5498 		return;
5499 
5500 	local_irq_disable();
5501 	for (n = 0; n < ve->num_siblings; n++) {
5502 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5503 		struct ve_node * const node = &ve->nodes[sibling->id];
5504 		struct rb_node **parent, *rb;
5505 		bool first;
5506 
5507 		if (!READ_ONCE(ve->request))
5508 			break; /* already handled by a sibling's tasklet */
5509 
5510 		if (unlikely(!(mask & sibling->mask))) {
5511 			if (!RB_EMPTY_NODE(&node->rb)) {
5512 				spin_lock(&sibling->active.lock);
5513 				rb_erase_cached(&node->rb,
5514 						&sibling->execlists.virtual);
5515 				RB_CLEAR_NODE(&node->rb);
5516 				spin_unlock(&sibling->active.lock);
5517 			}
5518 			continue;
5519 		}
5520 
5521 		spin_lock(&sibling->active.lock);
5522 
5523 		if (!RB_EMPTY_NODE(&node->rb)) {
5524 			/*
5525 			 * Cheat and avoid rebalancing the tree if we can
5526 			 * reuse this node in situ.
5527 			 */
5528 			first = rb_first_cached(&sibling->execlists.virtual) ==
5529 				&node->rb;
5530 			if (prio == node->prio || (prio > node->prio && first))
5531 				goto submit_engine;
5532 
5533 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5534 		}
5535 
5536 		rb = NULL;
5537 		first = true;
5538 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5539 		while (*parent) {
5540 			struct ve_node *other;
5541 
5542 			rb = *parent;
5543 			other = rb_entry(rb, typeof(*other), rb);
5544 			if (prio > other->prio) {
5545 				parent = &rb->rb_left;
5546 			} else {
5547 				parent = &rb->rb_right;
5548 				first = false;
5549 			}
5550 		}
5551 
5552 		rb_link_node(&node->rb, rb, parent);
5553 		rb_insert_color_cached(&node->rb,
5554 				       &sibling->execlists.virtual,
5555 				       first);
5556 
5557 submit_engine:
5558 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5559 		node->prio = prio;
5560 		if (first && prio > sibling->execlists.queue_priority_hint)
5561 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5562 
5563 		spin_unlock(&sibling->active.lock);
5564 	}
5565 	local_irq_enable();
5566 }
5567 
5568 static void virtual_submit_request(struct i915_request *rq)
5569 {
5570 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5571 	struct i915_request *old;
5572 	unsigned long flags;
5573 
5574 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5575 		     rq->fence.context,
5576 		     rq->fence.seqno);
5577 
5578 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5579 
5580 	spin_lock_irqsave(&ve->base.active.lock, flags);
5581 
5582 	old = ve->request;
5583 	if (old) { /* background completion event from preempt-to-busy */
5584 		GEM_BUG_ON(!i915_request_completed(old));
5585 		__i915_request_submit(old);
5586 		i915_request_put(old);
5587 	}
5588 
5589 	if (i915_request_completed(rq)) {
5590 		__i915_request_submit(rq);
5591 
5592 		ve->base.execlists.queue_priority_hint = INT_MIN;
5593 		ve->request = NULL;
5594 	} else {
5595 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5596 		ve->request = i915_request_get(rq);
5597 
5598 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5599 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5600 
5601 		tasklet_schedule(&ve->base.execlists.tasklet);
5602 	}
5603 
5604 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5605 }
5606 
5607 static struct ve_bond *
5608 virtual_find_bond(struct virtual_engine *ve,
5609 		  const struct intel_engine_cs *master)
5610 {
5611 	int i;
5612 
5613 	for (i = 0; i < ve->num_bonds; i++) {
5614 		if (ve->bonds[i].master == master)
5615 			return &ve->bonds[i];
5616 	}
5617 
5618 	return NULL;
5619 }
5620 
5621 static void
5622 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5623 {
5624 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5625 	intel_engine_mask_t allowed, exec;
5626 	struct ve_bond *bond;
5627 
5628 	allowed = ~to_request(signal)->engine->mask;
5629 
5630 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5631 	if (bond)
5632 		allowed &= bond->sibling_mask;
5633 
5634 	/* Restrict the bonded request to run on only the available engines */
5635 	exec = READ_ONCE(rq->execution_mask);
5636 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5637 		;
5638 
5639 	/* Prevent the master from being re-run on the bonded engines */
5640 	to_request(signal)->execution_mask &= ~allowed;
5641 }
5642 
5643 struct intel_context *
5644 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5645 			       unsigned int count)
5646 {
5647 	struct virtual_engine *ve;
5648 	unsigned int n;
5649 	int err;
5650 
5651 	if (count == 0)
5652 		return ERR_PTR(-EINVAL);
5653 
5654 	if (count == 1)
5655 		return intel_context_create(siblings[0]);
5656 
5657 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5658 	if (!ve)
5659 		return ERR_PTR(-ENOMEM);
5660 
5661 	ve->base.i915 = siblings[0]->i915;
5662 	ve->base.gt = siblings[0]->gt;
5663 	ve->base.uncore = siblings[0]->uncore;
5664 	ve->base.id = -1;
5665 
5666 	ve->base.class = OTHER_CLASS;
5667 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5668 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5669 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5670 
5671 	/*
5672 	 * The decision on whether to submit a request using semaphores
5673 	 * depends on the saturated state of the engine. We only compute
5674 	 * this during HW submission of the request, and we need for this
5675 	 * state to be globally applied to all requests being submitted
5676 	 * to this engine. Virtual engines encompass more than one physical
5677 	 * engine and so we cannot accurately tell in advance if one of those
5678 	 * engines is already saturated and so cannot afford to use a semaphore
5679 	 * and be pessimized in priority for doing so -- if we are the only
5680 	 * context using semaphores after all other clients have stopped, we
5681 	 * will be starved on the saturated system. Such a global switch for
5682 	 * semaphores is less than ideal, but alas is the current compromise.
5683 	 */
5684 	ve->base.saturated = ALL_ENGINES;
5685 
5686 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5687 
5688 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5689 	intel_engine_init_breadcrumbs(&ve->base);
5690 	intel_engine_init_execlists(&ve->base);
5691 
5692 	ve->base.cops = &virtual_context_ops;
5693 	ve->base.request_alloc = execlists_request_alloc;
5694 
5695 	ve->base.schedule = i915_schedule;
5696 	ve->base.submit_request = virtual_submit_request;
5697 	ve->base.bond_execute = virtual_bond_execute;
5698 
5699 	INIT_LIST_HEAD(virtual_queue(ve));
5700 	ve->base.execlists.queue_priority_hint = INT_MIN;
5701 	tasklet_init(&ve->base.execlists.tasklet,
5702 		     virtual_submission_tasklet,
5703 		     (unsigned long)ve);
5704 
5705 	intel_context_init(&ve->context, &ve->base);
5706 
5707 	for (n = 0; n < count; n++) {
5708 		struct intel_engine_cs *sibling = siblings[n];
5709 
5710 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5711 		if (sibling->mask & ve->base.mask) {
5712 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5713 				  sibling->name);
5714 			err = -EINVAL;
5715 			goto err_put;
5716 		}
5717 
5718 		/*
5719 		 * The virtual engine implementation is tightly coupled to
5720 		 * the execlists backend -- we push out request directly
5721 		 * into a tree inside each physical engine. We could support
5722 		 * layering if we handle cloning of the requests and
5723 		 * submitting a copy into each backend.
5724 		 */
5725 		if (sibling->execlists.tasklet.func !=
5726 		    execlists_submission_tasklet) {
5727 			err = -ENODEV;
5728 			goto err_put;
5729 		}
5730 
5731 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5732 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5733 
5734 		ve->siblings[ve->num_siblings++] = sibling;
5735 		ve->base.mask |= sibling->mask;
5736 
5737 		/*
5738 		 * All physical engines must be compatible for their emission
5739 		 * functions (as we build the instructions during request
5740 		 * construction and do not alter them before submission
5741 		 * on the physical engine). We use the engine class as a guide
5742 		 * here, although that could be refined.
5743 		 */
5744 		if (ve->base.class != OTHER_CLASS) {
5745 			if (ve->base.class != sibling->class) {
5746 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5747 					  sibling->class, ve->base.class);
5748 				err = -EINVAL;
5749 				goto err_put;
5750 			}
5751 			continue;
5752 		}
5753 
5754 		ve->base.class = sibling->class;
5755 		ve->base.uabi_class = sibling->uabi_class;
5756 		snprintf(ve->base.name, sizeof(ve->base.name),
5757 			 "v%dx%d", ve->base.class, count);
5758 		ve->base.context_size = sibling->context_size;
5759 
5760 		ve->base.emit_bb_start = sibling->emit_bb_start;
5761 		ve->base.emit_flush = sibling->emit_flush;
5762 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5763 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5764 		ve->base.emit_fini_breadcrumb_dw =
5765 			sibling->emit_fini_breadcrumb_dw;
5766 
5767 		ve->base.flags = sibling->flags;
5768 	}
5769 
5770 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5771 
5772 	return &ve->context;
5773 
5774 err_put:
5775 	intel_context_put(&ve->context);
5776 	return ERR_PTR(err);
5777 }
5778 
5779 struct intel_context *
5780 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5781 {
5782 	struct virtual_engine *se = to_virtual_engine(src);
5783 	struct intel_context *dst;
5784 
5785 	dst = intel_execlists_create_virtual(se->siblings,
5786 					     se->num_siblings);
5787 	if (IS_ERR(dst))
5788 		return dst;
5789 
5790 	if (se->num_bonds) {
5791 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5792 
5793 		de->bonds = kmemdup(se->bonds,
5794 				    sizeof(*se->bonds) * se->num_bonds,
5795 				    GFP_KERNEL);
5796 		if (!de->bonds) {
5797 			intel_context_put(dst);
5798 			return ERR_PTR(-ENOMEM);
5799 		}
5800 
5801 		de->num_bonds = se->num_bonds;
5802 	}
5803 
5804 	return dst;
5805 }
5806 
5807 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5808 				     const struct intel_engine_cs *master,
5809 				     const struct intel_engine_cs *sibling)
5810 {
5811 	struct virtual_engine *ve = to_virtual_engine(engine);
5812 	struct ve_bond *bond;
5813 	int n;
5814 
5815 	/* Sanity check the sibling is part of the virtual engine */
5816 	for (n = 0; n < ve->num_siblings; n++)
5817 		if (sibling == ve->siblings[n])
5818 			break;
5819 	if (n == ve->num_siblings)
5820 		return -EINVAL;
5821 
5822 	bond = virtual_find_bond(ve, master);
5823 	if (bond) {
5824 		bond->sibling_mask |= sibling->mask;
5825 		return 0;
5826 	}
5827 
5828 	bond = krealloc(ve->bonds,
5829 			sizeof(*bond) * (ve->num_bonds + 1),
5830 			GFP_KERNEL);
5831 	if (!bond)
5832 		return -ENOMEM;
5833 
5834 	bond[ve->num_bonds].master = master;
5835 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5836 
5837 	ve->bonds = bond;
5838 	ve->num_bonds++;
5839 
5840 	return 0;
5841 }
5842 
5843 struct intel_engine_cs *
5844 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5845 				 unsigned int sibling)
5846 {
5847 	struct virtual_engine *ve = to_virtual_engine(engine);
5848 
5849 	if (sibling >= ve->num_siblings)
5850 		return NULL;
5851 
5852 	return ve->siblings[sibling];
5853 }
5854 
5855 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5856 				   struct drm_printer *m,
5857 				   void (*show_request)(struct drm_printer *m,
5858 							struct i915_request *rq,
5859 							const char *prefix),
5860 				   unsigned int max)
5861 {
5862 	const struct intel_engine_execlists *execlists = &engine->execlists;
5863 	struct i915_request *rq, *last;
5864 	unsigned long flags;
5865 	unsigned int count;
5866 	struct rb_node *rb;
5867 
5868 	spin_lock_irqsave(&engine->active.lock, flags);
5869 
5870 	last = NULL;
5871 	count = 0;
5872 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5873 		if (count++ < max - 1)
5874 			show_request(m, rq, "\t\tE ");
5875 		else
5876 			last = rq;
5877 	}
5878 	if (last) {
5879 		if (count > max) {
5880 			drm_printf(m,
5881 				   "\t\t...skipping %d executing requests...\n",
5882 				   count - max);
5883 		}
5884 		show_request(m, last, "\t\tE ");
5885 	}
5886 
5887 	if (execlists->switch_priority_hint != INT_MIN)
5888 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5889 			   READ_ONCE(execlists->switch_priority_hint));
5890 	if (execlists->queue_priority_hint != INT_MIN)
5891 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5892 			   READ_ONCE(execlists->queue_priority_hint));
5893 
5894 	last = NULL;
5895 	count = 0;
5896 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5897 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5898 		int i;
5899 
5900 		priolist_for_each_request(rq, p, i) {
5901 			if (count++ < max - 1)
5902 				show_request(m, rq, "\t\tQ ");
5903 			else
5904 				last = rq;
5905 		}
5906 	}
5907 	if (last) {
5908 		if (count > max) {
5909 			drm_printf(m,
5910 				   "\t\t...skipping %d queued requests...\n",
5911 				   count - max);
5912 		}
5913 		show_request(m, last, "\t\tQ ");
5914 	}
5915 
5916 	last = NULL;
5917 	count = 0;
5918 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5919 		struct virtual_engine *ve =
5920 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5921 		struct i915_request *rq = READ_ONCE(ve->request);
5922 
5923 		if (rq) {
5924 			if (count++ < max - 1)
5925 				show_request(m, rq, "\t\tV ");
5926 			else
5927 				last = rq;
5928 		}
5929 	}
5930 	if (last) {
5931 		if (count > max) {
5932 			drm_printf(m,
5933 				   "\t\t...skipping %d virtual requests...\n",
5934 				   count - max);
5935 		}
5936 		show_request(m, last, "\t\tV ");
5937 	}
5938 
5939 	spin_unlock_irqrestore(&engine->active.lock, flags);
5940 }
5941 
5942 void intel_lr_context_reset(struct intel_engine_cs *engine,
5943 			    struct intel_context *ce,
5944 			    u32 head,
5945 			    bool scrub)
5946 {
5947 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5948 
5949 	/*
5950 	 * We want a simple context + ring to execute the breadcrumb update.
5951 	 * We cannot rely on the context being intact across the GPU hang,
5952 	 * so clear it and rebuild just what we need for the breadcrumb.
5953 	 * All pending requests for this context will be zapped, and any
5954 	 * future request will be after userspace has had the opportunity
5955 	 * to recreate its own state.
5956 	 */
5957 	if (scrub)
5958 		restore_default_state(ce, engine);
5959 
5960 	/* Rerun the request; its payload has been neutered (if guilty). */
5961 	__execlists_update_reg_state(ce, engine, head);
5962 }
5963 
5964 bool
5965 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5966 {
5967 	return engine->set_default_submission ==
5968 	       intel_execlists_set_default_submission;
5969 }
5970 
5971 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5972 #include "selftest_lrc.c"
5973 #endif
5974