xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision f97769fd)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before), we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
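/*
 * A rough, illustrative sketch of the pairing rule described above. The
 * helpers pop/same_context/elsp_write are placeholders, not driver code;
 * the real submission path is execlists_submit_ports() and friends below.
 *
 *	port[0] = pop(queue);
 *	while (!empty(queue) && same_context(port[0], head(queue)))
 *		port[0] = pop(queue);              (coalesce: keep latest tail)
 *	port[1] = empty(queue) ? NULL : pop(queue);
 *	elsp_write(port[1]);                       (ports are written highest first;
 *	elsp_write(port[0]);                        the last write kicks the hardware)
 */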
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 
181 struct virtual_engine {
182 	struct intel_engine_cs base;
183 	struct intel_context context;
184 
185 	/*
186 	 * We allow only a single request through the virtual engine at a time
187 	 * (each request in the timeline waits for the completion fence of
188 	 * the previous before being submitted). By restricting ourselves to
189 	 * only submitting a single request, each request is placed onto a
190 	 * physical engine to maximise load spreading (by virtue of the late greedy
191 	 * scheduling -- each real engine takes the next available request
192 	 * upon idling).
193 	 */
194 	struct i915_request *request;
195 
196 	/*
197 	 * We keep a rbtree of available virtual engines inside each physical
198 	 * engine, sorted by priority. Here we preallocate the nodes we need
199 	 * for the virtual engine, indexed by physical_engine->id.
200 	 */
201 	struct ve_node {
202 		struct rb_node rb;
203 		int prio;
204 	} nodes[I915_NUM_ENGINES];
205 
206 	/*
207 	 * Keep track of bonded pairs -- restrictions upon our selection
208 	 * of physical engines any particular request may be submitted to.
209 	 * If we receive a submit-fence from a master engine, we will only
210 	 * use one of sibling_mask physical engines.
211 	 */
212 	struct ve_bond {
213 		const struct intel_engine_cs *master;
214 		intel_engine_mask_t sibling_mask;
215 	} *bonds;
216 	unsigned int num_bonds;
217 
218 	/* And finally, which physical engines this virtual engine maps onto. */
219 	unsigned int num_siblings;
220 	struct intel_engine_cs *siblings[];
221 };
222 
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 	return container_of(engine, struct virtual_engine, base);
227 }
228 
229 static int __execlists_context_alloc(struct intel_context *ce,
230 				     struct intel_engine_cs *engine);
231 
232 static void execlists_init_reg_state(u32 *reg_state,
233 				     const struct intel_context *ce,
234 				     const struct intel_engine_cs *engine,
235 				     const struct intel_ring *ring,
236 				     bool close);
237 static void
238 __execlists_update_reg_state(const struct intel_context *ce,
239 			     const struct intel_engine_cs *engine,
240 			     u32 head);
241 
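/*
 * The lrc_ring_*() lookups below return the dword index of a register's
 * slot within the context image register state: the register offset lives
 * at regs[x] and its value at regs[x + 1] (see execlists_check_context()
 * and lrc_ring_setup_indirect_ctx() for users), or -1 if the register is
 * not present in the image for this engine/gen.
 */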
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
243 {
244 	if (INTEL_GEN(engine->i915) >= 12)
245 		return 0x60;
246 	else if (INTEL_GEN(engine->i915) >= 9)
247 		return 0x54;
248 	else if (engine->class == RENDER_CLASS)
249 		return 0x58;
250 	else
251 		return -1;
252 }
253 
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
255 {
256 	if (INTEL_GEN(engine->i915) >= 12)
257 		return 0x74;
258 	else if (INTEL_GEN(engine->i915) >= 9)
259 		return 0x68;
260 	else if (engine->class == RENDER_CLASS)
261 		return 0xd8;
262 	else
263 		return -1;
264 }
265 
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
267 {
268 	if (INTEL_GEN(engine->i915) >= 12)
269 		return 0x12;
270 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 		return 0x18;
272 	else
273 		return -1;
274 }
275 
276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
277 {
278 	int x;
279 
280 	x = lrc_ring_wa_bb_per_ctx(engine);
281 	if (x < 0)
282 		return x;
283 
284 	return x + 2;
285 }
286 
287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
288 {
289 	int x;
290 
291 	x = lrc_ring_indirect_ptr(engine);
292 	if (x < 0)
293 		return x;
294 
295 	return x + 2;
296 }
297 
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
299 {
300 	if (engine->class != RENDER_CLASS)
301 		return -1;
302 
303 	if (INTEL_GEN(engine->i915) >= 12)
304 		return 0xb6;
305 	else if (INTEL_GEN(engine->i915) >= 11)
306 		return 0xaa;
307 	else
308 		return -1;
309 }
310 
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
313 {
314 	switch (INTEL_GEN(engine->i915)) {
315 	default:
316 		MISSING_CASE(INTEL_GEN(engine->i915));
317 		fallthrough;
318 	case 12:
319 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 	case 11:
321 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 	case 10:
323 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 	case 9:
325 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 	case 8:
327 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 	}
329 }
330 
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 			    const struct intel_engine_cs *engine,
334 			    u32 ctx_bb_ggtt_addr,
335 			    u32 size)
336 {
337 	GEM_BUG_ON(!size);
338 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 	regs[lrc_ring_indirect_ptr(engine) + 1] =
341 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
342 
343 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 	regs[lrc_ring_indirect_offset(engine) + 1] =
345 		lrc_ring_indirect_offset_default(engine) << 6;
346 }
347 
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
349 {
350 	/*
351 	 * We can use either ppHWSP[16] which is recorded before the context
352 	 * switch (and so excludes the cost of context switches) or use the
353 	 * value from the context image itself, which is saved/restored earlier
354 	 * and so includes the cost of the save.
355 	 */
356 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
357 }
358 
359 static void mark_eio(struct i915_request *rq)
360 {
361 	if (i915_request_completed(rq))
362 		return;
363 
364 	GEM_BUG_ON(i915_request_signaled(rq));
365 
366 	i915_request_set_error_once(rq, -EIO);
367 	i915_request_mark_complete(rq);
368 }
369 
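/*
 * Walk back along the timeline from @rq and return the oldest request
 * that has not yet completed; this is where execution should resume
 * after a reset (see reset_active(), which rewinds the ring to its head).
 */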
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
372 {
373 	struct i915_request *active = rq;
374 
375 	rcu_read_lock();
376 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 		if (i915_request_completed(rq))
378 			break;
379 
380 		active = rq;
381 	}
382 	rcu_read_unlock();
383 
384 	return active;
385 }
386 
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
388 {
389 	return (i915_ggtt_offset(engine->status_page.vma) +
390 		I915_GEM_HWS_PREEMPT_ADDR);
391 }
392 
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
395 {
396 	/*
397 	 * We inspect HWS_PREEMPT with a semaphore inside
398 	 * engine->emit_fini_breadcrumb. If the dword is true,
399 	 * the ring is paused as the semaphore will busywait
400 	 * until the dword is false.
401 	 */
402 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 	if (state)
404 		wmb();
405 }
406 
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
408 {
409 	return rb_entry(rb, struct i915_priolist, node);
410 }
411 
412 static inline int rq_prio(const struct i915_request *rq)
413 {
414 	return READ_ONCE(rq->sched.attr.priority);
415 }
416 
417 static int effective_prio(const struct i915_request *rq)
418 {
419 	int prio = rq_prio(rq);
420 
421 	/*
422 	 * If this request is special and must not be interrupted at any
423 	 * cost, so be it. Note we are only checking the most recent request
424 	 * in the context and so may be masking an earlier vip request. It
425 	 * is hoped that under the conditions where nopreempt is used, this
426 	 * will not matter (i.e. all requests to that context will be
427 	 * nopreempt for as long as desired).
428 	 */
429 	if (i915_request_has_nopreempt(rq))
430 		prio = I915_PRIORITY_UNPREEMPTABLE;
431 
432 	return prio;
433 }
434 
435 static int queue_prio(const struct intel_engine_execlists *execlists)
436 {
437 	struct i915_priolist *p;
438 	struct rb_node *rb;
439 
440 	rb = rb_first_cached(&execlists->queue);
441 	if (!rb)
442 		return INT_MIN;
443 
444 	/*
445 	 * As the priolist[] is inverted, with the highest priority in [0],
446 	 * we have to flip the index value to recover the priority.
447 	 */
448 	p = to_priolist(rb);
449 	if (!I915_USER_PRIORITY_SHIFT)
450 		return p->priority;
451 
452 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
453 }
454 
455 static inline bool need_preempt(const struct intel_engine_cs *engine,
456 				const struct i915_request *rq,
457 				struct rb_node *rb)
458 {
459 	int last_prio;
460 
461 	if (!intel_engine_has_semaphores(engine))
462 		return false;
463 
464 	/*
465 	 * Check if the current priority hint merits a preemption attempt.
466 	 *
467 	 * We record the highest value priority we saw during rescheduling
468 	 * prior to this dequeue, therefore we know that if it is strictly
469 	 * less than the current tail of ELSP[0], we do not need to force
470 	 * a preempt-to-idle cycle.
471 	 *
472 	 * However, the priority hint is a mere hint that we may need to
473 	 * preempt. If that hint is stale or we may be trying to preempt
474 	 * ourselves, ignore the request.
475 	 *
476 	 * More naturally we would write
477 	 *      prio >= max(0, last);
478 	 * except that we wish to prevent triggering preemption at the same
479 	 * priority level: the task that is running should remain running
480 	 * to preserve FIFO ordering of dependencies.
481 	 */
482 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
483 	if (engine->execlists.queue_priority_hint <= last_prio)
484 		return false;
485 
486 	/*
487 	 * Check against the first request in ELSP[1], it will, thanks to the
488 	 * power of PI, be the highest priority of that context.
489 	 */
490 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
491 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
492 		return true;
493 
494 	if (rb) {
495 		struct virtual_engine *ve =
496 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
497 		bool preempt = false;
498 
499 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
500 			struct i915_request *next;
501 
502 			rcu_read_lock();
503 			next = READ_ONCE(ve->request);
504 			if (next)
505 				preempt = rq_prio(next) > last_prio;
506 			rcu_read_unlock();
507 		}
508 
509 		if (preempt)
510 			return preempt;
511 	}
512 
513 	/*
514 	 * If the inflight context did not trigger the preemption, then maybe
515 	 * it was the set of queued requests? Pick the highest priority in
516 	 * the queue (the first active priolist) and see if it deserves to be
517 	 * running instead of ELSP[0].
518 	 *
519 	 * The highest priority request in the queue cannot be either
520 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
521 	 * context, its priority would not exceed ELSP[0] aka last_prio.
522 	 */
523 	return queue_prio(&engine->execlists) > last_prio;
524 }
525 
526 __maybe_unused static inline bool
527 assert_priority_queue(const struct i915_request *prev,
528 		      const struct i915_request *next)
529 {
530 	/*
531 	 * Without preemption, the prev may refer to the still active element
532 	 * which we refuse to let go.
533 	 *
534 	 * Even with preemption, there are times when we think it is better not
535 	 * to preempt and leave an ostensibly lower priority request in flight.
536 	 */
537 	if (i915_request_is_active(prev))
538 		return true;
539 
540 	return rq_prio(prev) >= rq_prio(next);
541 }
542 
543 /*
544  * The context descriptor encodes various attributes of a context,
545  * including its GTT address and some flags. Because it's fairly
546  * expensive to calculate, we'll just do it once and cache the result,
547  * which remains valid until the context is unpinned.
548  *
549  * This is what a descriptor looks like, from LSB to MSB::
550  *
551  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
552  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
553  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
554  *      bits 53-54:    mbz, reserved for use by hardware
555  *      bits 55-63:    group ID, currently unused and set to 0
556  *
557  * Starting from Gen11, the upper dword of the descriptor has a new format:
558  *
559  *      bits 32-36:    reserved
560  *      bits 37-47:    SW context ID
561  *      bits 48-53:    engine instance
562  *      bit 54:        mbz, reserved for use by hardware
563  *      bits 55-60:    SW counter
564  *      bits 61-63:    engine class
565  *
566  * engine info, SW context ID and SW counter need to form a unique number
567  * (Context ID) per lrc.
568  */
569 static u32
570 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
571 {
572 	u32 desc;
573 
574 	desc = INTEL_LEGACY_32B_CONTEXT;
575 	if (i915_vm_is_4lvl(ce->vm))
576 		desc = INTEL_LEGACY_64B_CONTEXT;
577 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
578 
579 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
580 	if (IS_GEN(engine->i915, 8))
581 		desc |= GEN8_CTX_L3LLC_COHERENT;
582 
583 	return i915_ggtt_offset(ce->state) | desc;
584 }
585 
586 static inline unsigned int dword_in_page(void *addr)
587 {
588 	return offset_in_page(addr) / sizeof(u32);
589 }
590 
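/*
 * set_offsets() consumes the byte-encoded tables below (gen8_xcs_offsets
 * and friends) to build the register part of the default context image:
 *
 *   - a byte with BIT(7) set skips (NOPs out) that many dwords;
 *   - otherwise the low 6 bits give an MI_LOAD_REGISTER_IMM count and the
 *     top 2 bits carry flags (POSTED selects MI_LRI_FORCE_POSTED);
 *   - each register offset follows as a 7-bits-per-byte sequence, with
 *     BIT(7) marking a continuation byte (REG emits one byte, REG16 two);
 *   - a zero byte terminates the table, and END() records the total state
 *     size in dwords so that the remainder can be cleared.
 */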
591 static void set_offsets(u32 *regs,
592 			const u8 *data,
593 			const struct intel_engine_cs *engine,
594 			bool clear)
595 #define NOP(x) (BIT(7) | (x))
596 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
597 #define POSTED BIT(0)
598 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
599 #define REG16(x) \
600 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
601 	(((x) >> 2) & 0x7f)
602 #define END(total_state_size) 0, (total_state_size)
603 {
604 	const u32 base = engine->mmio_base;
605 
606 	while (*data) {
607 		u8 count, flags;
608 
609 		if (*data & BIT(7)) { /* skip */
610 			count = *data++ & ~BIT(7);
611 			if (clear)
612 				memset32(regs, MI_NOOP, count);
613 			regs += count;
614 			continue;
615 		}
616 
617 		count = *data & 0x3f;
618 		flags = *data >> 6;
619 		data++;
620 
621 		*regs = MI_LOAD_REGISTER_IMM(count);
622 		if (flags & POSTED)
623 			*regs |= MI_LRI_FORCE_POSTED;
624 		if (INTEL_GEN(engine->i915) >= 11)
625 			*regs |= MI_LRI_LRM_CS_MMIO;
626 		regs++;
627 
628 		GEM_BUG_ON(!count);
629 		do {
630 			u32 offset = 0;
631 			u8 v;
632 
633 			do {
634 				v = *data++;
635 				offset <<= 7;
636 				offset |= v & ~BIT(7);
637 			} while (v & BIT(7));
638 
639 			regs[0] = base + (offset << 2);
640 			if (clear)
641 				regs[1] = 0;
642 			regs += 2;
643 		} while (--count);
644 	}
645 
646 	if (clear) {
647 		u8 count = *++data;
648 
649 		/* Clear past the tail for HW access */
650 		GEM_BUG_ON(dword_in_page(regs) > count);
651 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
652 
653 		/* Close the batch; used mainly by live_lrc_layout() */
654 		*regs = MI_BATCH_BUFFER_END;
655 		if (INTEL_GEN(engine->i915) >= 10)
656 			*regs |= BIT(0);
657 	}
658 }
659 
660 static const u8 gen8_xcs_offsets[] = {
661 	NOP(1),
662 	LRI(11, 0),
663 	REG16(0x244),
664 	REG(0x034),
665 	REG(0x030),
666 	REG(0x038),
667 	REG(0x03c),
668 	REG(0x168),
669 	REG(0x140),
670 	REG(0x110),
671 	REG(0x11c),
672 	REG(0x114),
673 	REG(0x118),
674 
675 	NOP(9),
676 	LRI(9, 0),
677 	REG16(0x3a8),
678 	REG16(0x28c),
679 	REG16(0x288),
680 	REG16(0x284),
681 	REG16(0x280),
682 	REG16(0x27c),
683 	REG16(0x278),
684 	REG16(0x274),
685 	REG16(0x270),
686 
687 	NOP(13),
688 	LRI(2, 0),
689 	REG16(0x200),
690 	REG(0x028),
691 
692 	END(80)
693 };
694 
695 static const u8 gen9_xcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, POSTED),
727 	REG16(0x200),
728 
729 	NOP(13),
730 	LRI(44, POSTED),
731 	REG(0x028),
732 	REG(0x09c),
733 	REG(0x0c0),
734 	REG(0x178),
735 	REG(0x17c),
736 	REG16(0x358),
737 	REG(0x170),
738 	REG(0x150),
739 	REG(0x154),
740 	REG(0x158),
741 	REG16(0x41c),
742 	REG16(0x600),
743 	REG16(0x604),
744 	REG16(0x608),
745 	REG16(0x60c),
746 	REG16(0x610),
747 	REG16(0x614),
748 	REG16(0x618),
749 	REG16(0x61c),
750 	REG16(0x620),
751 	REG16(0x624),
752 	REG16(0x628),
753 	REG16(0x62c),
754 	REG16(0x630),
755 	REG16(0x634),
756 	REG16(0x638),
757 	REG16(0x63c),
758 	REG16(0x640),
759 	REG16(0x644),
760 	REG16(0x648),
761 	REG16(0x64c),
762 	REG16(0x650),
763 	REG16(0x654),
764 	REG16(0x658),
765 	REG16(0x65c),
766 	REG16(0x660),
767 	REG16(0x664),
768 	REG16(0x668),
769 	REG16(0x66c),
770 	REG16(0x670),
771 	REG16(0x674),
772 	REG16(0x678),
773 	REG16(0x67c),
774 	REG(0x068),
775 
776 	END(176)
777 };
778 
779 static const u8 gen12_xcs_offsets[] = {
780 	NOP(1),
781 	LRI(13, POSTED),
782 	REG16(0x244),
783 	REG(0x034),
784 	REG(0x030),
785 	REG(0x038),
786 	REG(0x03c),
787 	REG(0x168),
788 	REG(0x140),
789 	REG(0x110),
790 	REG(0x1c0),
791 	REG(0x1c4),
792 	REG(0x1c8),
793 	REG(0x180),
794 	REG16(0x2b4),
795 
796 	NOP(5),
797 	LRI(9, POSTED),
798 	REG16(0x3a8),
799 	REG16(0x28c),
800 	REG16(0x288),
801 	REG16(0x284),
802 	REG16(0x280),
803 	REG16(0x27c),
804 	REG16(0x278),
805 	REG16(0x274),
806 	REG16(0x270),
807 
808 	END(80)
809 };
810 
811 static const u8 gen8_rcs_offsets[] = {
812 	NOP(1),
813 	LRI(14, POSTED),
814 	REG16(0x244),
815 	REG(0x034),
816 	REG(0x030),
817 	REG(0x038),
818 	REG(0x03c),
819 	REG(0x168),
820 	REG(0x140),
821 	REG(0x110),
822 	REG(0x11c),
823 	REG(0x114),
824 	REG(0x118),
825 	REG(0x1c0),
826 	REG(0x1c4),
827 	REG(0x1c8),
828 
829 	NOP(3),
830 	LRI(9, POSTED),
831 	REG16(0x3a8),
832 	REG16(0x28c),
833 	REG16(0x288),
834 	REG16(0x284),
835 	REG16(0x280),
836 	REG16(0x27c),
837 	REG16(0x278),
838 	REG16(0x274),
839 	REG16(0x270),
840 
841 	NOP(13),
842 	LRI(1, 0),
843 	REG(0x0c8),
844 
845 	END(80)
846 };
847 
848 static const u8 gen9_rcs_offsets[] = {
849 	NOP(1),
850 	LRI(14, POSTED),
851 	REG16(0x244),
852 	REG(0x34),
853 	REG(0x30),
854 	REG(0x38),
855 	REG(0x3c),
856 	REG(0x168),
857 	REG(0x140),
858 	REG(0x110),
859 	REG(0x11c),
860 	REG(0x114),
861 	REG(0x118),
862 	REG(0x1c0),
863 	REG(0x1c4),
864 	REG(0x1c8),
865 
866 	NOP(3),
867 	LRI(9, POSTED),
868 	REG16(0x3a8),
869 	REG16(0x28c),
870 	REG16(0x288),
871 	REG16(0x284),
872 	REG16(0x280),
873 	REG16(0x27c),
874 	REG16(0x278),
875 	REG16(0x274),
876 	REG16(0x270),
877 
878 	NOP(13),
879 	LRI(1, 0),
880 	REG(0xc8),
881 
882 	NOP(13),
883 	LRI(44, POSTED),
884 	REG(0x28),
885 	REG(0x9c),
886 	REG(0xc0),
887 	REG(0x178),
888 	REG(0x17c),
889 	REG16(0x358),
890 	REG(0x170),
891 	REG(0x150),
892 	REG(0x154),
893 	REG(0x158),
894 	REG16(0x41c),
895 	REG16(0x600),
896 	REG16(0x604),
897 	REG16(0x608),
898 	REG16(0x60c),
899 	REG16(0x610),
900 	REG16(0x614),
901 	REG16(0x618),
902 	REG16(0x61c),
903 	REG16(0x620),
904 	REG16(0x624),
905 	REG16(0x628),
906 	REG16(0x62c),
907 	REG16(0x630),
908 	REG16(0x634),
909 	REG16(0x638),
910 	REG16(0x63c),
911 	REG16(0x640),
912 	REG16(0x644),
913 	REG16(0x648),
914 	REG16(0x64c),
915 	REG16(0x650),
916 	REG16(0x654),
917 	REG16(0x658),
918 	REG16(0x65c),
919 	REG16(0x660),
920 	REG16(0x664),
921 	REG16(0x668),
922 	REG16(0x66c),
923 	REG16(0x670),
924 	REG16(0x674),
925 	REG16(0x678),
926 	REG16(0x67c),
927 	REG(0x68),
928 
929 	END(176)
930 };
931 
932 static const u8 gen11_rcs_offsets[] = {
933 	NOP(1),
934 	LRI(15, POSTED),
935 	REG16(0x244),
936 	REG(0x034),
937 	REG(0x030),
938 	REG(0x038),
939 	REG(0x03c),
940 	REG(0x168),
941 	REG(0x140),
942 	REG(0x110),
943 	REG(0x11c),
944 	REG(0x114),
945 	REG(0x118),
946 	REG(0x1c0),
947 	REG(0x1c4),
948 	REG(0x1c8),
949 	REG(0x180),
950 
951 	NOP(1),
952 	LRI(9, POSTED),
953 	REG16(0x3a8),
954 	REG16(0x28c),
955 	REG16(0x288),
956 	REG16(0x284),
957 	REG16(0x280),
958 	REG16(0x27c),
959 	REG16(0x278),
960 	REG16(0x274),
961 	REG16(0x270),
962 
963 	LRI(1, POSTED),
964 	REG(0x1b0),
965 
966 	NOP(10),
967 	LRI(1, 0),
968 	REG(0x0c8),
969 
970 	END(80)
971 };
972 
973 static const u8 gen12_rcs_offsets[] = {
974 	NOP(1),
975 	LRI(13, POSTED),
976 	REG16(0x244),
977 	REG(0x034),
978 	REG(0x030),
979 	REG(0x038),
980 	REG(0x03c),
981 	REG(0x168),
982 	REG(0x140),
983 	REG(0x110),
984 	REG(0x1c0),
985 	REG(0x1c4),
986 	REG(0x1c8),
987 	REG(0x180),
988 	REG16(0x2b4),
989 
990 	NOP(5),
991 	LRI(9, POSTED),
992 	REG16(0x3a8),
993 	REG16(0x28c),
994 	REG16(0x288),
995 	REG16(0x284),
996 	REG16(0x280),
997 	REG16(0x27c),
998 	REG16(0x278),
999 	REG16(0x274),
1000 	REG16(0x270),
1001 
1002 	LRI(3, POSTED),
1003 	REG(0x1b0),
1004 	REG16(0x5a8),
1005 	REG16(0x5ac),
1006 
1007 	NOP(6),
1008 	LRI(1, 0),
1009 	REG(0x0c8),
1010 	NOP(3 + 9 + 1),
1011 
1012 	LRI(51, POSTED),
1013 	REG16(0x588),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG(0x028),
1020 	REG(0x09c),
1021 	REG(0x0c0),
1022 	REG(0x178),
1023 	REG(0x17c),
1024 	REG16(0x358),
1025 	REG(0x170),
1026 	REG(0x150),
1027 	REG(0x154),
1028 	REG(0x158),
1029 	REG16(0x41c),
1030 	REG16(0x600),
1031 	REG16(0x604),
1032 	REG16(0x608),
1033 	REG16(0x60c),
1034 	REG16(0x610),
1035 	REG16(0x614),
1036 	REG16(0x618),
1037 	REG16(0x61c),
1038 	REG16(0x620),
1039 	REG16(0x624),
1040 	REG16(0x628),
1041 	REG16(0x62c),
1042 	REG16(0x630),
1043 	REG16(0x634),
1044 	REG16(0x638),
1045 	REG16(0x63c),
1046 	REG16(0x640),
1047 	REG16(0x644),
1048 	REG16(0x648),
1049 	REG16(0x64c),
1050 	REG16(0x650),
1051 	REG16(0x654),
1052 	REG16(0x658),
1053 	REG16(0x65c),
1054 	REG16(0x660),
1055 	REG16(0x664),
1056 	REG16(0x668),
1057 	REG16(0x66c),
1058 	REG16(0x670),
1059 	REG16(0x674),
1060 	REG16(0x678),
1061 	REG16(0x67c),
1062 	REG(0x068),
1063 	REG(0x084),
1064 	NOP(1),
1065 
1066 	END(192)
1067 };
1068 
1069 #undef END
1070 #undef REG16
1071 #undef REG
1072 #undef LRI
1073 #undef NOP
1074 
1075 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1076 {
1077 	/*
1078 	 * The gen12+ lists only have the registers we program in the basic
1079 	 * default state. We rely on the context image using relative
1080 	 * addressing to automatically fix up the register state between the
1081 	 * physical engines for virtual engines.
1082 	 */
1083 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1084 		   !intel_engine_has_relative_mmio(engine));
1085 
1086 	if (engine->class == RENDER_CLASS) {
1087 		if (INTEL_GEN(engine->i915) >= 12)
1088 			return gen12_rcs_offsets;
1089 		else if (INTEL_GEN(engine->i915) >= 11)
1090 			return gen11_rcs_offsets;
1091 		else if (INTEL_GEN(engine->i915) >= 9)
1092 			return gen9_rcs_offsets;
1093 		else
1094 			return gen8_rcs_offsets;
1095 	} else {
1096 		if (INTEL_GEN(engine->i915) >= 12)
1097 			return gen12_xcs_offsets;
1098 		else if (INTEL_GEN(engine->i915) >= 9)
1099 			return gen9_xcs_offsets;
1100 		else
1101 			return gen8_xcs_offsets;
1102 	}
1103 }
1104 
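/*
 * Called with engine->active.lock held (e.g. on preemption or reset):
 * take every incomplete request off the hardware and either put it back
 * on the priority queue for resubmission or, if it came from a virtual
 * engine, hand it back so it may migrate to another sibling. Returns the
 * oldest incomplete request left on this engine, if any.
 */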
1105 static struct i915_request *
1106 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1107 {
1108 	struct i915_request *rq, *rn, *active = NULL;
1109 	struct list_head *pl;
1110 	int prio = I915_PRIORITY_INVALID;
1111 
1112 	lockdep_assert_held(&engine->active.lock);
1113 
1114 	list_for_each_entry_safe_reverse(rq, rn,
1115 					 &engine->active.requests,
1116 					 sched.link) {
1117 		if (i915_request_completed(rq))
1118 			continue; /* XXX */
1119 
1120 		__i915_request_unsubmit(rq);
1121 
1122 		/*
1123 		 * Push the request back into the queue for later resubmission.
1124 		 * If this request is not native to this physical engine (i.e.
1125 		 * it came from a virtual source), push it back onto the virtual
1126 		 * engine so that it can be moved across onto another physical
1127 		 * engine as load dictates.
1128 		 */
1129 		if (likely(rq->execution_mask == engine->mask)) {
1130 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1131 			if (rq_prio(rq) != prio) {
1132 				prio = rq_prio(rq);
1133 				pl = i915_sched_lookup_priolist(engine, prio);
1134 			}
1135 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1136 
1137 			list_move(&rq->sched.link, pl);
1138 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1139 
1140 			/* Check in case we roll back so far that we wrap [size/2] */
1141 			if (intel_ring_direction(rq->ring,
1142 						 intel_ring_wrap(rq->ring,
1143 								 rq->tail),
1144 						 rq->ring->tail) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			/*
1152 			 * Decouple the virtual breadcrumb before moving it
1153 			 * back to the virtual engine -- we don't want the
1154 			 * request to complete in the background and try
1155 			 * and cancel the breadcrumb on the virtual engine
1156 			 * (instead of the old engine where it is linked)!
1157 			 */
1158 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1159 				     &rq->fence.flags)) {
1160 				spin_lock_nested(&rq->lock,
1161 						 SINGLE_DEPTH_NESTING);
1162 				i915_request_cancel_breadcrumb(rq);
1163 				spin_unlock(&rq->lock);
1164 			}
1165 			WRITE_ONCE(rq->engine, owner);
1166 			owner->submit_request(rq);
1167 			active = NULL;
1168 		}
1169 	}
1170 
1171 	return active;
1172 }
1173 
1174 struct i915_request *
1175 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1176 {
1177 	struct intel_engine_cs *engine =
1178 		container_of(execlists, typeof(*engine), execlists);
1179 
1180 	return __unwind_incomplete_requests(engine);
1181 }
1182 
1183 static inline void
1184 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1185 {
1186 	/*
1187 	 * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1188 	 * the compiler should eliminate this function as dead code.
1189 	 */
1190 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1191 		return;
1192 
1193 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1194 				   status, rq);
1195 }
1196 
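/*
 * Engine busyness accounting: stats.active counts the contexts currently
 * scheduled in on the hardware. The first schedule-in records a start
 * timestamp and the last schedule-out folds the elapsed time into
 * stats.total, both under the stats.lock seqlock so readers can sample a
 * consistent snapshot.
 */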
1197 static void intel_engine_context_in(struct intel_engine_cs *engine)
1198 {
1199 	unsigned long flags;
1200 
1201 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1202 		return;
1203 
1204 	write_seqlock_irqsave(&engine->stats.lock, flags);
1205 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1206 		engine->stats.start = ktime_get();
1207 		atomic_inc(&engine->stats.active);
1208 	}
1209 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1210 }
1211 
1212 static void intel_engine_context_out(struct intel_engine_cs *engine)
1213 {
1214 	unsigned long flags;
1215 
1216 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1217 
1218 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1219 		return;
1220 
1221 	write_seqlock_irqsave(&engine->stats.lock, flags);
1222 	if (atomic_dec_and_test(&engine->stats.active)) {
1223 		engine->stats.total =
1224 			ktime_add(engine->stats.total,
1225 				  ktime_sub(ktime_get(), engine->stats.start));
1226 	}
1227 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1228 }
1229 
1230 static void
1231 execlists_check_context(const struct intel_context *ce,
1232 			const struct intel_engine_cs *engine)
1233 {
1234 	const struct intel_ring *ring = ce->ring;
1235 	u32 *regs = ce->lrc_reg_state;
1236 	bool valid = true;
1237 	int x;
1238 
1239 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1240 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1241 		       engine->name,
1242 		       regs[CTX_RING_START],
1243 		       i915_ggtt_offset(ring->vma));
1244 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1245 		valid = false;
1246 	}
1247 
1248 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1249 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1250 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1251 		       engine->name,
1252 		       regs[CTX_RING_CTL],
1253 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1254 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1255 		valid = false;
1256 	}
1257 
1258 	x = lrc_ring_mi_mode(engine);
1259 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1260 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1261 		       engine->name, regs[x + 1]);
1262 		regs[x + 1] &= ~STOP_RING;
1263 		regs[x + 1] |= STOP_RING << 16;
1264 		valid = false;
1265 	}
1266 
1267 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1268 }
1269 
1270 static void restore_default_state(struct intel_context *ce,
1271 				  struct intel_engine_cs *engine)
1272 {
1273 	u32 *regs;
1274 
1275 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1276 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1277 
1278 	ce->runtime.last = intel_context_get_runtime(ce);
1279 }
1280 
1281 static void reset_active(struct i915_request *rq,
1282 			 struct intel_engine_cs *engine)
1283 {
1284 	struct intel_context * const ce = rq->context;
1285 	u32 head;
1286 
1287 	/*
1288 	 * The executing context has been cancelled. We want to prevent
1289 	 * further execution along this context and propagate the error on
1290 	 * to anything depending on its results.
1291 	 *
1292 	 * In __i915_request_submit(), we apply the -EIO and remove the
1293 	 * requests' payloads for any banned requests. But first, we must
1294 	 * rewind the context back to the start of the incomplete request so
1295 	 * that we do not jump back into the middle of the batch.
1296 	 *
1297 	 * We preserve the breadcrumbs and semaphores of the incomplete
1298 	 * requests so that inter-timeline dependencies (i.e other timelines)
1299 	 * remain correctly ordered. And we defer to __i915_request_submit()
1300 	 * so that all asynchronous waits are correctly handled.
1301 	 */
1302 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1303 		     rq->fence.context, rq->fence.seqno);
1304 
1305 	/* On resubmission of the active request, payload will be scrubbed */
1306 	if (i915_request_completed(rq))
1307 		head = rq->tail;
1308 	else
1309 		head = active_request(ce->timeline, rq)->head;
1310 	head = intel_ring_wrap(ce->ring, head);
1311 
1312 	/* Scrub the context image to prevent replaying the previous batch */
1313 	restore_default_state(ce, engine);
1314 	__execlists_update_reg_state(ce, engine, head);
1315 
1316 	/* We've switched away, so this should be a no-op, but intent matters */
1317 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1318 }
1319 
1320 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1321 {
1322 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1323 	ce->runtime.num_underflow += dt < 0;
1324 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1325 #endif
1326 }
1327 
1328 static void intel_context_update_runtime(struct intel_context *ce)
1329 {
1330 	u32 old;
1331 	s32 dt;
1332 
1333 	if (intel_context_is_barrier(ce))
1334 		return;
1335 
1336 	old = ce->runtime.last;
1337 	ce->runtime.last = intel_context_get_runtime(ce);
1338 	dt = ce->runtime.last - old;
1339 
1340 	if (unlikely(dt <= 0)) {
1341 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1342 			 old, ce->runtime.last, dt);
1343 		st_update_runtime_underflow(ce, dt);
1344 		return;
1345 	}
1346 
1347 	ewma_runtime_add(&ce->runtime.avg, dt);
1348 	ce->runtime.total += dt;
1349 }
1350 
1351 static inline struct intel_engine_cs *
1352 __execlists_schedule_in(struct i915_request *rq)
1353 {
1354 	struct intel_engine_cs * const engine = rq->engine;
1355 	struct intel_context * const ce = rq->context;
1356 
1357 	intel_context_get(ce);
1358 
1359 	if (unlikely(intel_context_is_banned(ce)))
1360 		reset_active(rq, engine);
1361 
1362 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1363 		execlists_check_context(ce, engine);
1364 
1365 	if (ce->tag) {
1366 		/* Use a fixed tag for OA and friends */
1367 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1368 		ce->lrc.ccid = ce->tag;
1369 	} else {
1370 		/* We don't need a strict matching tag, just different values */
1371 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1372 
1373 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1374 		clear_bit(tag - 1, &engine->context_tag);
1375 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1376 
1377 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1378 	}
1379 
1380 	ce->lrc.ccid |= engine->execlists.ccid;
1381 
1382 	__intel_gt_pm_get(engine->gt);
1383 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1384 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1385 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1386 	intel_engine_context_in(engine);
1387 
1388 	return engine;
1389 }
1390 
1391 static inline struct i915_request *
1392 execlists_schedule_in(struct i915_request *rq, int idx)
1393 {
1394 	struct intel_context * const ce = rq->context;
1395 	struct intel_engine_cs *old;
1396 
1397 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1398 	trace_i915_request_in(rq, idx);
1399 
1400 	old = READ_ONCE(ce->inflight);
1401 	do {
1402 		if (!old) {
1403 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1404 			break;
1405 		}
1406 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1407 
1408 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1409 	return i915_request_get(rq);
1410 }
1411 
1412 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1413 {
1414 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1415 	struct i915_request *next = READ_ONCE(ve->request);
1416 
1417 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1418 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1419 }
1420 
1421 static inline void
1422 __execlists_schedule_out(struct i915_request *rq,
1423 			 struct intel_engine_cs * const engine,
1424 			 unsigned int ccid)
1425 {
1426 	struct intel_context * const ce = rq->context;
1427 
1428 	/*
1429 	 * NB process_csb() is not under the engine->active.lock and hence
1430 	 * schedule_out can race with schedule_in meaning that we should
1431 	 * refrain from doing non-trivial work here.
1432 	 */
1433 
1434 	/*
1435 	 * If we have just completed this context, the engine may now be
1436 	 * idle and we want to re-enter powersaving.
1437 	 */
1438 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1439 	    i915_request_completed(rq))
1440 		intel_engine_add_retire(engine, ce->timeline);
1441 
1442 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1443 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1444 	if (ccid < BITS_PER_LONG) {
1445 		GEM_BUG_ON(ccid == 0);
1446 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1447 		set_bit(ccid - 1, &engine->context_tag);
1448 	}
1449 
1450 	intel_context_update_runtime(ce);
1451 	intel_engine_context_out(engine);
1452 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1453 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1454 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1455 	intel_gt_pm_put_async(engine->gt);
1456 
1457 	/*
1458 	 * If this is part of a virtual engine, its next request may
1459 	 * have been blocked waiting for access to the active context.
1460 	 * We have to kick all the siblings again in case we need to
1461 	 * switch (e.g. the next request is not runnable on this
1462 	 * engine). Hopefully, we will already have submitted the next
1463 	 * request before the tasklet runs and do not need to rebuild
1464 	 * each virtual tree and kick everyone again.
1465 	 */
1466 	if (ce->engine != engine)
1467 		kick_siblings(rq, ce);
1468 
1469 	intel_context_put(ce);
1470 }
1471 
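/*
 * ce->inflight packs the engine pointer with a small counter in its low
 * bits, tracking how many times the context has been scheduled in but not
 * yet out; only the final schedule-out (when the counter returns to zero
 * and cur becomes NULL) performs the real __execlists_schedule_out().
 */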
1472 static inline void
1473 execlists_schedule_out(struct i915_request *rq)
1474 {
1475 	struct intel_context * const ce = rq->context;
1476 	struct intel_engine_cs *cur, *old;
1477 	u32 ccid;
1478 
1479 	trace_i915_request_out(rq);
1480 
1481 	ccid = rq->context->lrc.ccid;
1482 	old = READ_ONCE(ce->inflight);
1483 	do
1484 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1485 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1486 	if (!cur)
1487 		__execlists_schedule_out(rq, old, ccid);
1488 
1489 	i915_request_put(rq);
1490 }
1491 
1492 static u64 execlists_update_context(struct i915_request *rq)
1493 {
1494 	struct intel_context *ce = rq->context;
1495 	u64 desc = ce->lrc.desc;
1496 	u32 tail, prev;
1497 
1498 	/*
1499 	 * WaIdleLiteRestore:bdw,skl
1500 	 *
1501 	 * We should never submit the context with the same RING_TAIL twice
1502 	 * just in case we submit an empty ring, which confuses the HW.
1503 	 *
1504 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1505 	 * the normal request to be able to always advance the RING_TAIL on
1506 	 * subsequent resubmissions (for lite restore). Should that fail us,
1507 	 * and we try and submit the same tail again, force the context
1508 	 * reload.
1509 	 *
1510 	 * If we need to return to a preempted context, we need to skip the
1511 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1512 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1513 	 * an earlier request.
1514 	 */
1515 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1516 	prev = rq->ring->tail;
1517 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1518 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1519 		desc |= CTX_DESC_FORCE_RESTORE;
1520 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1521 	rq->tail = rq->wa_tail;
1522 
1523 	/*
1524 	 * Make sure the context image is complete before we submit it to HW.
1525 	 *
1526 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1527 	 * an uncached write such as our mmio register access, but the empirical
1528 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1529 	 * may not be visible to the HW prior to the completion of the UC
1530 	 * register write and that we may begin execution from the context
1531 	 * before its image is complete leading to invalid PD chasing.
1532 	 */
1533 	wmb();
1534 
1535 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1536 	return desc;
1537 }
1538 
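/*
 * Write one context descriptor to the submission port: on platforms with
 * a submit queue (execlists->ctrl_reg set) the two halves go into the
 * ELSQ slot for @port, otherwise the descriptor is written upper dword
 * first into the single ELSP register.
 */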
1539 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1540 {
1541 	if (execlists->ctrl_reg) {
1542 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1543 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1544 	} else {
1545 		writel(upper_32_bits(desc), execlists->submit_reg);
1546 		writel(lower_32_bits(desc), execlists->submit_reg);
1547 	}
1548 }
1549 
1550 static __maybe_unused char *
1551 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1552 {
1553 	if (!rq)
1554 		return "";
1555 
1556 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1557 		 prefix,
1558 		 rq->context->lrc.ccid,
1559 		 rq->fence.context, rq->fence.seqno,
1560 		 i915_request_completed(rq) ? "!" :
1561 		 i915_request_started(rq) ? "*" :
1562 		 "",
1563 		 rq_prio(rq));
1564 
1565 	return buf;
1566 }
1567 
1568 static __maybe_unused void
1569 trace_ports(const struct intel_engine_execlists *execlists,
1570 	    const char *msg,
1571 	    struct i915_request * const *ports)
1572 {
1573 	const struct intel_engine_cs *engine =
1574 		container_of(execlists, typeof(*engine), execlists);
1575 	char __maybe_unused p0[40], p1[40];
1576 
1577 	if (!ports[0])
1578 		return;
1579 
1580 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1581 		     dump_port(p0, sizeof(p0), "", ports[0]),
1582 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1583 }
1584 
1585 static inline bool
1586 reset_in_progress(const struct intel_engine_execlists *execlists)
1587 {
1588 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1589 }
1590 
1591 static __maybe_unused bool
1592 assert_pending_valid(const struct intel_engine_execlists *execlists,
1593 		     const char *msg)
1594 {
1595 	struct intel_engine_cs *engine =
1596 		container_of(execlists, typeof(*engine), execlists);
1597 	struct i915_request * const *port, *rq;
1598 	struct intel_context *ce = NULL;
1599 	bool sentinel = false;
1600 	u32 ccid = -1;
1601 
1602 	trace_ports(execlists, msg, execlists->pending);
1603 
1604 	/* We may be messing around with the lists during reset, lalala */
1605 	if (reset_in_progress(execlists))
1606 		return true;
1607 
1608 	if (!execlists->pending[0]) {
1609 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1610 			      engine->name);
1611 		return false;
1612 	}
1613 
1614 	if (execlists->pending[execlists_num_ports(execlists)]) {
1615 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1616 			      engine->name, execlists_num_ports(execlists));
1617 		return false;
1618 	}
1619 
1620 	for (port = execlists->pending; (rq = *port); port++) {
1621 		unsigned long flags;
1622 		bool ok = true;
1623 
1624 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1625 		GEM_BUG_ON(!i915_request_is_active(rq));
1626 
1627 		if (ce == rq->context) {
1628 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1629 				      engine->name,
1630 				      ce->timeline->fence_context,
1631 				      port - execlists->pending);
1632 			return false;
1633 		}
1634 		ce = rq->context;
1635 
1636 		if (ccid == ce->lrc.ccid) {
1637 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1638 				      engine->name,
1639 				      ccid, ce->timeline->fence_context,
1640 				      port - execlists->pending);
1641 			return false;
1642 		}
1643 		ccid = ce->lrc.ccid;
1644 
1645 		/*
1646 		 * Sentinels are supposed to be the last request so they flush
1647 		 * the current execution off the HW. Check that they are the only
1648 		 * request in the pending submission.
1649 		 */
1650 		if (sentinel) {
1651 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1652 				      engine->name,
1653 				      ce->timeline->fence_context,
1654 				      port - execlists->pending);
1655 			return false;
1656 		}
1657 		sentinel = i915_request_has_sentinel(rq);
1658 
1659 		/* Hold tightly onto the lock to prevent concurrent retires! */
1660 		if (!spin_trylock_irqsave(&rq->lock, flags))
1661 			continue;
1662 
1663 		if (i915_request_completed(rq))
1664 			goto unlock;
1665 
1666 		if (i915_active_is_idle(&ce->active) &&
1667 		    !intel_context_is_barrier(ce)) {
1668 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1669 				      engine->name,
1670 				      ce->timeline->fence_context,
1671 				      port - execlists->pending);
1672 			ok = false;
1673 			goto unlock;
1674 		}
1675 
1676 		if (!i915_vma_is_pinned(ce->state)) {
1677 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1678 				      engine->name,
1679 				      ce->timeline->fence_context,
1680 				      port - execlists->pending);
1681 			ok = false;
1682 			goto unlock;
1683 		}
1684 
1685 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1686 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1687 				      engine->name,
1688 				      ce->timeline->fence_context,
1689 				      port - execlists->pending);
1690 			ok = false;
1691 			goto unlock;
1692 		}
1693 
1694 unlock:
1695 		spin_unlock_irqrestore(&rq->lock, flags);
1696 		if (!ok)
1697 			return false;
1698 	}
1699 
1700 	return ce;
1701 }
1702 
1703 static void execlists_submit_ports(struct intel_engine_cs *engine)
1704 {
1705 	struct intel_engine_execlists *execlists = &engine->execlists;
1706 	unsigned int n;
1707 
1708 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1709 
1710 	/*
1711 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1712 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1713 	 * not be relinquished until the device is idle (see
1714 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1715 	 * that all ELSP are drained i.e. we have processed the CSB,
1716 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1717 	 */
1718 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1719 
1720 	/*
1721 	 * ELSQ note: the submit queue is not cleared after being submitted
1722 	 * to the HW so we need to make sure we always clean it up. This is
1723 	 * currently ensured by the fact that we always write the same number
1724 	 * of elsq entries, keep this in mind before changing the loop below.
1725 	 */
1726 	for (n = execlists_num_ports(execlists); n--; ) {
1727 		struct i915_request *rq = execlists->pending[n];
1728 
1729 		write_desc(execlists,
1730 			   rq ? execlists_update_context(rq) : 0,
1731 			   n);
1732 	}
1733 
1734 	/* we need to manually load the submit queue */
1735 	if (execlists->ctrl_reg)
1736 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1737 }
1738 
1739 static bool ctx_single_port_submission(const struct intel_context *ce)
1740 {
1741 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1742 		intel_context_force_single_submission(ce));
1743 }
1744 
1745 static bool can_merge_ctx(const struct intel_context *prev,
1746 			  const struct intel_context *next)
1747 {
1748 	if (prev != next)
1749 		return false;
1750 
1751 	if (ctx_single_port_submission(prev))
1752 		return false;
1753 
1754 	return true;
1755 }
1756 
1757 static unsigned long i915_request_flags(const struct i915_request *rq)
1758 {
1759 	return READ_ONCE(rq->fence.flags);
1760 }
1761 
1762 static bool can_merge_rq(const struct i915_request *prev,
1763 			 const struct i915_request *next)
1764 {
1765 	GEM_BUG_ON(prev == next);
1766 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1767 
1768 	/*
1769 	 * We do not submit known completed requests. Therefore if the next
1770 	 * request is already completed, we can pretend to merge it in
1771 	 * with the previous context (and we will skip updating the ELSP
1772 	 * and tracking). Thus hopefully keeping the ELSP full with active
1773 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1774 	 * us.
1775 	 */
1776 	if (i915_request_completed(next))
1777 		return true;
1778 
1779 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1780 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1781 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1782 		return false;
1783 
1784 	if (!can_merge_ctx(prev->context, next->context))
1785 		return false;
1786 
1787 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1788 	return true;
1789 }
1790 
1791 static void virtual_update_register_offsets(u32 *regs,
1792 					    struct intel_engine_cs *engine)
1793 {
1794 	set_offsets(regs, reg_offsets(engine), engine, false);
1795 }
1796 
1797 static bool virtual_matches(const struct virtual_engine *ve,
1798 			    const struct i915_request *rq,
1799 			    const struct intel_engine_cs *engine)
1800 {
1801 	const struct intel_engine_cs *inflight;
1802 
1803 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1804 		return false;
1805 
1806 	/*
1807 	 * We track when the HW has completed saving the context image
1808 	 * (i.e. when we have seen the final CS event switching out of
1809 	 * the context) and must not overwrite the context image before
1810 	 * then. This restricts us to only using the active engine
1811 	 * while the previous virtualized request is inflight (so
1812 	 * we reuse the register offsets). This is a very small
1813 	 * hysteresis on the greedy selection algorithm.
1814 	 */
1815 	inflight = intel_context_inflight(&ve->context);
1816 	if (inflight && inflight != engine)
1817 		return false;
1818 
1819 	return true;
1820 }
1821 
1822 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1823 {
1824 	/*
1825 	 * All the outstanding signals on ve->siblings[0] must have
1826 	 * been completed, just pending the interrupt handler. As those
1827 	 * signals still refer to the old sibling (via rq->engine), we must
1828 	 * transfer those to the old irq_worker to keep our locking
1829 	 * consistent.
1830 	 */
1831 	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1832 }
1833 
1834 #define for_each_waiter(p__, rq__) \
1835 	list_for_each_entry_lockless(p__, \
1836 				     &(rq__)->sched.waiters_list, \
1837 				     wait_link)
1838 
1839 #define for_each_signaler(p__, rq__) \
1840 	list_for_each_entry_rcu(p__, \
1841 				&(rq__)->sched.signalers_list, \
1842 				signal_link)
1843 
1844 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1845 {
1846 	LIST_HEAD(list);
1847 
1848 	/*
1849 	 * We want to move the interrupted request to the back of
1850 	 * the round-robin list (i.e. its priority level), but
1851 	 * in doing so, we must also move all requests that were in
1852 	 * flight and were waiting for the interrupted request so
1853 	 * that they run after it again.
1854 	 */
1855 	do {
1856 		struct i915_dependency *p;
1857 
1858 		GEM_BUG_ON(i915_request_is_active(rq));
1859 		list_move_tail(&rq->sched.link, pl);
1860 
1861 		for_each_waiter(p, rq) {
1862 			struct i915_request *w =
1863 				container_of(p->waiter, typeof(*w), sched);
1864 
1865 			if (p->flags & I915_DEPENDENCY_WEAK)
1866 				continue;
1867 
1868 			/* Leave semaphores spinning on the other engines */
1869 			if (w->engine != rq->engine)
1870 				continue;
1871 
1872 			/* No waiter should start before its signaler */
1873 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1874 				   i915_request_started(w) &&
1875 				   !i915_request_completed(rq));
1876 
1877 			GEM_BUG_ON(i915_request_is_active(w));
1878 			if (!i915_request_is_ready(w))
1879 				continue;
1880 
1881 			if (rq_prio(w) < rq_prio(rq))
1882 				continue;
1883 
1884 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1885 			list_move_tail(&w->sched.link, &list);
1886 		}
1887 
1888 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1889 	} while (rq);
1890 }
1891 
1892 static void defer_active(struct intel_engine_cs *engine)
1893 {
1894 	struct i915_request *rq;
1895 
1896 	rq = __unwind_incomplete_requests(engine);
1897 	if (!rq)
1898 		return;
1899 
1900 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1901 }
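
/*
 * Editorial illustration (not part of the original source): the assumed
 * flow is that when a timeslice expires, execlists_dequeue() pauses the
 * ring and calls defer_active(), which unwinds the still-running request
 * and requeues it, along with its ready same-engine waiters, at the tail
 * of its priority level so an equal-priority rival is picked next, e.g.
 * roughly:
 *
 *	ring_set_paused(engine, 1);
 *	defer_active(engine); /* round-robin within the priority level */
 */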
1902 
1903 static bool
1904 need_timeslice(const struct intel_engine_cs *engine,
1905 	       const struct i915_request *rq,
1906 	       const struct rb_node *rb)
1907 {
1908 	int hint;
1909 
1910 	if (!intel_engine_has_timeslices(engine))
1911 		return false;
1912 
1913 	hint = engine->execlists.queue_priority_hint;
1914 
1915 	if (rb) {
1916 		const struct virtual_engine *ve =
1917 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1918 		const struct intel_engine_cs *inflight =
1919 			intel_context_inflight(&ve->context);
1920 
1921 		if (!inflight || inflight == engine) {
1922 			struct i915_request *next;
1923 
1924 			rcu_read_lock();
1925 			next = READ_ONCE(ve->request);
1926 			if (next)
1927 				hint = max(hint, rq_prio(next));
1928 			rcu_read_unlock();
1929 		}
1930 	}
1931 
1932 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1933 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1934 
1935 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1936 	return hint >= effective_prio(rq);
1937 }
1938 
1939 static bool
1940 timeslice_yield(const struct intel_engine_execlists *el,
1941 		const struct i915_request *rq)
1942 {
1943 	/*
1944 	 * Once bitten, forever smitten!
1945 	 *
1946 	 * If the active context ever busy-waited on a semaphore,
1947 	 * it will be treated as a hog until the end of its timeslice (i.e.
1948 	 * until it is scheduled out and replaced by a new submission,
1949 	 * possibly even its own lite-restore). The HW only sends an interrupt
1950 	 * on the first miss, and we do not know if that semaphore has been
1951 	 * signaled, or even if it is now stuck on another semaphore. Play
1952 	 * safe, yield if it might be stuck -- it will be given a fresh
1953 	 * timeslice in the near future.
1954 	 */
1955 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1956 }
1957 
1958 static bool
1959 timeslice_expired(const struct intel_engine_execlists *el,
1960 		  const struct i915_request *rq)
1961 {
1962 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1963 }
1964 
1965 static int
1966 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1967 {
1968 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1969 		return engine->execlists.queue_priority_hint;
1970 
1971 	return rq_prio(list_next_entry(rq, sched.link));
1972 }
1973 
1974 static inline unsigned long
1975 timeslice(const struct intel_engine_cs *engine)
1976 {
1977 	return READ_ONCE(engine->props.timeslice_duration_ms);
1978 }
1979 
1980 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1981 {
1982 	const struct intel_engine_execlists *execlists = &engine->execlists;
1983 	const struct i915_request *rq = *execlists->active;
1984 
1985 	if (!rq || i915_request_completed(rq))
1986 		return 0;
1987 
1988 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1989 		return 0;
1990 
1991 	return timeslice(engine);
1992 }
1993 
1994 static void set_timeslice(struct intel_engine_cs *engine)
1995 {
1996 	unsigned long duration;
1997 
1998 	if (!intel_engine_has_timeslices(engine))
1999 		return;
2000 
2001 	duration = active_timeslice(engine);
2002 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2003 
2004 	set_timer_ms(&engine->execlists.timer, duration);
2005 }
2006 
2007 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2008 {
2009 	struct intel_engine_execlists *execlists = &engine->execlists;
2010 	unsigned long duration;
2011 
2012 	if (!intel_engine_has_timeslices(engine))
2013 		return;
2014 
2015 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2016 	if (prio == INT_MIN)
2017 		return;
2018 
2019 	if (timer_pending(&execlists->timer))
2020 		return;
2021 
2022 	duration = timeslice(engine);
2023 	ENGINE_TRACE(engine,
2024 		     "start timeslicing, prio:%d, interval:%lu",
2025 		     prio, duration);
2026 
2027 	set_timer_ms(&execlists->timer, duration);
2028 }
2029 
2030 static void record_preemption(struct intel_engine_execlists *execlists)
2031 {
2032 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2033 }
2034 
2035 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2036 					    const struct i915_request *rq)
2037 {
2038 	if (!rq)
2039 		return 0;
2040 
2041 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2042 	if (unlikely(intel_context_is_banned(rq->context)))
2043 		return 1;
2044 
2045 	return READ_ONCE(engine->props.preempt_timeout_ms);
2046 }
2047 
2048 static void set_preempt_timeout(struct intel_engine_cs *engine,
2049 				const struct i915_request *rq)
2050 {
2051 	if (!intel_engine_has_preempt_reset(engine))
2052 		return;
2053 
2054 	set_timer_ms(&engine->execlists.preempt,
2055 		     active_preempt_timeout(engine, rq));
2056 }
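
/*
 * Editorial illustration (not from the original source): the assumed
 * chain when a forced preemption is not acknowledged in time is,
 * roughly,
 *
 *	set_preempt_timeout() -> timer expires -> execlists_preempt()
 *	  -> tasklet -> preempt_timeout() -> execlists_reset()
 *
 * with banned contexts forcing a 1ms timeout regardless of sysfs.
 */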
2057 
2058 static inline void clear_ports(struct i915_request **ports, int count)
2059 {
2060 	memset_p((void **)ports, NULL, count);
2061 }
2062 
2063 static void execlists_dequeue(struct intel_engine_cs *engine)
2064 {
2065 	struct intel_engine_execlists * const execlists = &engine->execlists;
2066 	struct i915_request **port = execlists->pending;
2067 	struct i915_request ** const last_port = port + execlists->port_mask;
2068 	struct i915_request * const *active;
2069 	struct i915_request *last;
2070 	struct rb_node *rb;
2071 	bool submit = false;
2072 
2073 	/*
2074 	 * Hardware submission is through 2 ports. Conceptually each port
2075 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2076 	 * static for a context, and unique to each, so we only execute
2077 	 * requests belonging to a single context from each ring. RING_HEAD
2078 	 * is maintained by the CS in the context image, it marks the place
2079 	 * where it got up to last time, and through RING_TAIL we tell the CS
2080 	 * where we want to execute up to this time.
2081 	 *
2082 	 * In this list the requests are in order of execution. Consecutive
2083 	 * requests from the same context are adjacent in the ringbuffer. We
2084 	 * can combine these requests into a single RING_TAIL update:
2085 	 *
2086 	 *              RING_HEAD...req1...req2
2087 	 *                                    ^- RING_TAIL
2088 	 * since to execute req2 the CS must first execute req1.
2089 	 *
2090 	 * Our goal then is to point each port at the end of a consecutive
2091 	 * sequence of requests, as that is the optimal (fewest wake ups
2092 	 * and context switches) submission.
2093 	 */
2094 
2095 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2096 		struct virtual_engine *ve =
2097 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2098 		struct i915_request *rq = READ_ONCE(ve->request);
2099 
2100 		if (!rq) { /* lazily cleanup after another engine handled rq */
2101 			rb_erase_cached(rb, &execlists->virtual);
2102 			RB_CLEAR_NODE(rb);
2103 			rb = rb_first_cached(&execlists->virtual);
2104 			continue;
2105 		}
2106 
2107 		if (!virtual_matches(ve, rq, engine)) {
2108 			rb = rb_next(rb);
2109 			continue;
2110 		}
2111 
2112 		break;
2113 	}
2114 
2115 	/*
2116 	 * If the queue is higher priority than the last
2117 	 * request in the currently active context, submit afresh.
2118 	 * We will resubmit again afterwards in case we need to split
2119 	 * the active context to interject the preemption request,
2120 	 * i.e. we will retrigger preemption following the ack in case
2121 	 * of trouble.
2122 	 */
2123 	active = READ_ONCE(execlists->active);
2124 
2125 	/*
2126 	 * In theory we can skip over completed contexts that have not
2127 	 * yet been processed by events (as those events are in flight):
2128 	 *
2129 	 * while ((last = *active) && i915_request_completed(last))
2130 	 *	active++;
2131 	 *
2132 	 * However, the GPU cannot handle this as it will ultimately
2133 	 * find itself trying to jump back into a context it has just
2134 	 * completed and barf.
2135 	 */
2136 
2137 	if ((last = *active)) {
2138 		if (need_preempt(engine, last, rb)) {
2139 			if (i915_request_completed(last)) {
2140 				tasklet_hi_schedule(&execlists->tasklet);
2141 				return;
2142 			}
2143 
2144 			ENGINE_TRACE(engine,
2145 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2146 				     last->fence.context,
2147 				     last->fence.seqno,
2148 				     last->sched.attr.priority,
2149 				     execlists->queue_priority_hint);
2150 			record_preemption(execlists);
2151 
2152 			/*
2153 			 * Don't let the RING_HEAD advance past the breadcrumb
2154 			 * as we unwind (and until we resubmit) so that we do
2155 			 * not accidentally tell it to go backwards.
2156 			 */
2157 			ring_set_paused(engine, 1);
2158 
2159 			/*
2160 			 * Note that we have not stopped the GPU at this point,
2161 			 * so we are unwinding the incomplete requests as they
2162 			 * remain inflight and so by the time we do complete
2163 			 * the preemption, some of the unwound requests may
2164 			 * complete!
2165 			 */
2166 			__unwind_incomplete_requests(engine);
2167 
2168 			last = NULL;
2169 		} else if (need_timeslice(engine, last, rb) &&
2170 			   timeslice_expired(execlists, last)) {
2171 			if (i915_request_completed(last)) {
2172 				tasklet_hi_schedule(&execlists->tasklet);
2173 				return;
2174 			}
2175 
2176 			ENGINE_TRACE(engine,
2177 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2178 				     last->fence.context,
2179 				     last->fence.seqno,
2180 				     last->sched.attr.priority,
2181 				     execlists->queue_priority_hint,
2182 				     yesno(timeslice_yield(execlists, last)));
2183 
2184 			ring_set_paused(engine, 1);
2185 			defer_active(engine);
2186 
2187 			/*
2188 			 * Unlike for preemption, if we rewind and continue
2189 			 * executing the same context as previously active,
2190 			 * the order of execution will remain the same and
2191 			 * the tail will only advance. We do not need to
2192 			 * force a full context restore, as a lite-restore
2193 			 * is sufficient to resample the monotonic TAIL.
2194 			 *
2195 			 * If we switch to any other context, similarly we
2196 			 * will not rewind TAIL of current context, and
2197 			 * normal save/restore will preserve state and allow
2198 			 * us to later continue executing the same request.
2199 			 */
2200 			last = NULL;
2201 		} else {
2202 			/*
2203 			 * Otherwise if we already have a request pending
2204 			 * for execution after the current one, we can
2205 			 * just wait until the next CS event before
2206 			 * queuing more. In either case we will force a
2207 			 * lite-restore preemption event, but if we wait
2208 			 * we hopefully coalesce several updates into a single
2209 			 * submission.
2210 			 */
2211 			if (!list_is_last(&last->sched.link,
2212 					  &engine->active.requests)) {
2213 				/*
2214 				 * Even if ELSP[1] is occupied and not worthy
2215 				 * of timeslices, our queue might be.
2216 				 */
2217 				start_timeslice(engine, queue_prio(execlists));
2218 				return;
2219 			}
2220 		}
2221 	}
2222 
2223 	while (rb) { /* XXX virtual is always taking precedence */
2224 		struct virtual_engine *ve =
2225 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2226 		struct i915_request *rq;
2227 
2228 		spin_lock(&ve->base.active.lock);
2229 
2230 		rq = ve->request;
2231 		if (unlikely(!rq)) { /* lost the race to a sibling */
2232 			spin_unlock(&ve->base.active.lock);
2233 			rb_erase_cached(rb, &execlists->virtual);
2234 			RB_CLEAR_NODE(rb);
2235 			rb = rb_first_cached(&execlists->virtual);
2236 			continue;
2237 		}
2238 
2239 		GEM_BUG_ON(rq != ve->request);
2240 		GEM_BUG_ON(rq->engine != &ve->base);
2241 		GEM_BUG_ON(rq->context != &ve->context);
2242 
2243 		if (rq_prio(rq) >= queue_prio(execlists)) {
2244 			if (!virtual_matches(ve, rq, engine)) {
2245 				spin_unlock(&ve->base.active.lock);
2246 				rb = rb_next(rb);
2247 				continue;
2248 			}
2249 
2250 			if (last && !can_merge_rq(last, rq)) {
2251 				spin_unlock(&ve->base.active.lock);
2252 				start_timeslice(engine, rq_prio(rq));
2253 				return; /* leave this for another sibling */
2254 			}
2255 
2256 			ENGINE_TRACE(engine,
2257 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2258 				     rq->fence.context,
2259 				     rq->fence.seqno,
2260 				     i915_request_completed(rq) ? "!" :
2261 				     i915_request_started(rq) ? "*" :
2262 				     "",
2263 				     yesno(engine != ve->siblings[0]));
2264 
2265 			WRITE_ONCE(ve->request, NULL);
2266 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2267 				   INT_MIN);
2268 			rb_erase_cached(rb, &execlists->virtual);
2269 			RB_CLEAR_NODE(rb);
2270 
2271 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2272 			WRITE_ONCE(rq->engine, engine);
2273 
2274 			if (engine != ve->siblings[0]) {
2275 				u32 *regs = ve->context.lrc_reg_state;
2276 				unsigned int n;
2277 
2278 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2279 
2280 				if (!intel_engine_has_relative_mmio(engine))
2281 					virtual_update_register_offsets(regs,
2282 									engine);
2283 
2284 				if (!list_empty(&ve->context.signals))
2285 					virtual_xfer_breadcrumbs(ve);
2286 
2287 				/*
2288 				 * Move the bound engine to the top of the list
2289 				 * for future execution. We then kick this
2290 				 * tasklet first before checking others, so that
2291 				 * we preferentially reuse this set of bound
2292 				 * registers.
2293 				 */
2294 				for (n = 1; n < ve->num_siblings; n++) {
2295 					if (ve->siblings[n] == engine) {
2296 						swap(ve->siblings[n],
2297 						     ve->siblings[0]);
2298 						break;
2299 					}
2300 				}
2301 
2302 				GEM_BUG_ON(ve->siblings[0] != engine);
2303 			}
2304 
2305 			if (__i915_request_submit(rq)) {
2306 				submit = true;
2307 				last = rq;
2308 			}
2309 			i915_request_put(rq);
2310 
2311 			/*
2312 			 * Hmm, we have a bunch of virtual engine requests,
2313 			 * but the first one was already completed (thanks
2314 			 * preempt-to-busy!). Keep looking at the veng queue
2315 			 * until we have no more relevant requests (i.e.
2316 			 * the normal submit queue has higher priority).
2317 			 */
2318 			if (!submit) {
2319 				spin_unlock(&ve->base.active.lock);
2320 				rb = rb_first_cached(&execlists->virtual);
2321 				continue;
2322 			}
2323 		}
2324 
2325 		spin_unlock(&ve->base.active.lock);
2326 		break;
2327 	}
2328 
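	/*
	 * Editorial illustration: if, say, the queue holds req1 and req2
	 * from context A followed by req3 from context B, the loop below
	 * ends with pending[0] pointing at req2 (one RING_TAIL update
	 * covering req1+req2) and pending[1] at req3, i.e. one ELSP port
	 * per context.
	 */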
2329 	while ((rb = rb_first_cached(&execlists->queue))) {
2330 		struct i915_priolist *p = to_priolist(rb);
2331 		struct i915_request *rq, *rn;
2332 		int i;
2333 
2334 		priolist_for_each_request_consume(rq, rn, p, i) {
2335 			bool merge = true;
2336 
2337 			/*
2338 			 * Can we combine this request with the current port?
2339 			 * It has to be the same context/ringbuffer and not
2340 			 * have any exceptions (e.g. GVT saying never to
2341 			 * combine contexts).
2342 			 *
2343 			 * If we can combine the requests, we can execute both
2344 			 * by updating the RING_TAIL to point to the end of the
2345 			 * second request, and so we never need to tell the
2346 			 * hardware about the first.
2347 			 */
2348 			if (last && !can_merge_rq(last, rq)) {
2349 				/*
2350 				 * If we are on the second port and cannot
2351 				 * combine this request with the last, then we
2352 				 * are done.
2353 				 */
2354 				if (port == last_port)
2355 					goto done;
2356 
2357 				/*
2358 				 * We must not populate both ELSP[] with the
2359 				 * same LRCA, i.e. we must submit 2 different
2360 				 * contexts if we submit 2 ELSP.
2361 				 */
2362 				if (last->context == rq->context)
2363 					goto done;
2364 
2365 				if (i915_request_has_sentinel(last))
2366 					goto done;
2367 
2368 				/*
2369 				 * If GVT overrides us we only ever submit
2370 				 * port[0], leaving port[1] empty. Note that we
2371 				 * also have to be careful that we don't queue
2372 				 * the same context (even though a different
2373 				 * request) to the second port.
2374 				 */
2375 				if (ctx_single_port_submission(last->context) ||
2376 				    ctx_single_port_submission(rq->context))
2377 					goto done;
2378 
2379 				merge = false;
2380 			}
2381 
2382 			if (__i915_request_submit(rq)) {
2383 				if (!merge) {
2384 					*port = execlists_schedule_in(last, port - execlists->pending);
2385 					port++;
2386 					last = NULL;
2387 				}
2388 
2389 				GEM_BUG_ON(last &&
2390 					   !can_merge_ctx(last->context,
2391 							  rq->context));
2392 				GEM_BUG_ON(last &&
2393 					   i915_seqno_passed(last->fence.seqno,
2394 							     rq->fence.seqno));
2395 
2396 				submit = true;
2397 				last = rq;
2398 			}
2399 		}
2400 
2401 		rb_erase_cached(&p->node, &execlists->queue);
2402 		i915_priolist_free(p);
2403 	}
2404 
2405 done:
2406 	/*
2407 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2408 	 *
2409 	 * We choose the priority hint such that if we add a request of greater
2410 	 * priority than this, we kick the submission tasklet to decide on
2411 	 * the right order of submitting the requests to hardware. We must
2412 	 * also be prepared to reorder requests as they are in-flight on the
2413 	 * HW. We derive the priority hint then as the first "hole" in
2414 	 * the HW submission ports and if there are no available slots,
2415 	 * the priority of the lowest executing request, i.e. last.
2416 	 *
2417 	 * When we do receive a higher priority request ready to run from the
2418 	 * user, see queue_request(), the priority hint is bumped to that
2419 	 * request triggering preemption on the next dequeue (or subsequent
2420 	 * interrupt for secondary ports).
2421 	 */
2422 	execlists->queue_priority_hint = queue_prio(execlists);
2423 
2424 	if (submit) {
2425 		*port = execlists_schedule_in(last, port - execlists->pending);
2426 		execlists->switch_priority_hint =
2427 			switch_prio(engine, *execlists->pending);
2428 
2429 		/*
2430 		 * Skip if we ended up with exactly the same set of requests,
2431 		 * e.g. trying to timeslice a pair of ordered contexts
2432 		 */
2433 		if (!memcmp(active, execlists->pending,
2434 			    (port - execlists->pending + 1) * sizeof(*port))) {
2435 			do
2436 				execlists_schedule_out(fetch_and_zero(port));
2437 			while (port-- != execlists->pending);
2438 
2439 			goto skip_submit;
2440 		}
2441 		clear_ports(port + 1, last_port - port);
2442 
2443 		WRITE_ONCE(execlists->yield, -1);
2444 		set_preempt_timeout(engine, *active);
2445 		execlists_submit_ports(engine);
2446 	} else {
2447 		start_timeslice(engine, execlists->queue_priority_hint);
2448 skip_submit:
2449 		ring_set_paused(engine, 0);
2450 	}
2451 }
2452 
2453 static void
2454 cancel_port_requests(struct intel_engine_execlists * const execlists)
2455 {
2456 	struct i915_request * const *port;
2457 
2458 	for (port = execlists->pending; *port; port++)
2459 		execlists_schedule_out(*port);
2460 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2461 
2462 	/* Mark the end of active before we overwrite *active */
2463 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2464 		execlists_schedule_out(*port);
2465 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2466 
2467 	smp_wmb(); /* complete the seqlock for execlists_active() */
2468 	WRITE_ONCE(execlists->active, execlists->inflight);
2469 }
2470 
2471 static inline void
2472 invalidate_csb_entries(const u32 *first, const u32 *last)
2473 {
2474 	clflush((void *)first);
2475 	clflush((void *)last);
2476 }
2477 
2478 /*
2479  * Starting with Gen12, the status has a new format:
2480  *
2481  *     bit  0:     switched to new queue
2482  *     bit  1:     reserved
2483  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2484  *                 switch detail is set to "wait on semaphore"
2485  *     bits 3-5:   engine class
2486  *     bits 6-11:  engine instance
2487  *     bits 12-14: reserved
2488  *     bits 15-25: sw context id of the lrc the GT switched to
2489  *     bits 26-31: sw counter of the lrc the GT switched to
2490  *     bits 32-35: context switch detail
2491  *                  - 0: ctx complete
2492  *                  - 1: wait on sync flip
2493  *                  - 2: wait on vblank
2494  *                  - 3: wait on scanline
2495  *                  - 4: wait on semaphore
2496  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2497  *                       WAIT_FOR_EVENT)
2498  *     bit  36:    reserved
2499  *     bits 37-43: wait detail (for switch detail 1 to 4)
2500  *     bits 44-46: reserved
2501  *     bits 47-57: sw context id of the lrc the GT switched away from
2502  *     bits 58-63: sw counter of the lrc the GT switched away from
2503  */
2504 static inline bool
2505 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2506 {
2507 	u32 lower_dw = csb[0];
2508 	u32 upper_dw = csb[1];
2509 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2510 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2511 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2512 
2513 	/*
2514 	 * The context switch detail is not guaranteed to be 5 when a preemption
2515 	 * occurs, so we can't just check for that. The check below works for
2516 	 * all the cases we care about, including preemptions of WAIT
2517 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2518 	 * would require some extra handling, but we don't support that.
2519 	 */
2520 	if (!ctx_away_valid || new_queue) {
2521 		GEM_BUG_ON(!ctx_to_valid);
2522 		return true;
2523 	}
2524 
2525 	/*
2526 	 * switch detail = 5 is covered by the case above and we do not expect a
2527 	 * context switch on an unsuccessful wait instruction since we always
2528 	 * use polling mode.
2529 	 */
2530 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2531 	return false;
2532 }
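
/*
 * Editorial sketch of the decision above (illustration only): the
 * promotion test reduces to
 *
 *	promote = !GEN12_CSB_CTX_VALID(csb[1]) ||
 *		  (csb[0] & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE);
 *
 * i.e. either the outgoing context reads as idle, or bit 0 reports a
 * switch to a new queue.
 */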
2533 
2534 static inline bool
2535 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2536 {
2537 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2538 }
2539 
2540 static void process_csb(struct intel_engine_cs *engine)
2541 {
2542 	struct intel_engine_execlists * const execlists = &engine->execlists;
2543 	const u32 * const buf = execlists->csb_status;
2544 	const u8 num_entries = execlists->csb_size;
2545 	u8 head, tail;
2546 
2547 	/*
2548 	 * As we modify our execlists state tracking we require exclusive
2549 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2550 	 * and we assume that is only inside the reset paths and so serialised.
2551 	 */
2552 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2553 		   !reset_in_progress(execlists));
2554 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2555 
2556 	/*
2557 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2558 	 * When reading from the csb_write mmio register, we have to be
2559 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2560 	 * the low 4 bits. As it happens we know the next 4 bits are always
2561 	 * zero and so we can simply mask off the low u8 of the register
2562 	 * and treat it identically to reading from the HWSP (without having
2563 	 * to use explicit shifting and masking, and probably bifurcating
2564 	 * the code to handle the legacy mmio read).
2565 	 */
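	/*
	 * Editorial illustration: for the mmio case the strict form would be
	 *
	 *	tail = READ_ONCE(*execlists->csb_write) & GENMASK(3, 0);
	 *
	 * but as bits 7-4 read back as zero, the plain u8 load below is
	 * equivalent.
	 */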
2566 	head = execlists->csb_head;
2567 	tail = READ_ONCE(*execlists->csb_write);
2568 	if (unlikely(head == tail))
2569 		return;
2570 
2571 	/*
2572 	 * We will consume all events from HW, or at least pretend to.
2573 	 *
2574 	 * The sequence of events from the HW is deterministic, and derived
2575 	 * from our writes to the ELSP, with a smidgen of variability for
2576 	 * the arrival of the asynchronous requests wrt the inflight
2577 	 * execution. If the HW sends an event that does not correspond with
2578 	 * the one we are expecting, we have to abandon all hope as we lose
2579 	 * all tracking of what the engine is actually executing. We will
2580 	 * only detect we are out of sequence with the HW when we get an
2581 	 * 'impossible' event because we have already drained our own
2582 	 * preemption/promotion queue. If this occurs, we know that we likely
2583 	 * lost track of execution earlier and must unwind and restart; the
2584 	 * simplest way is to stop processing the event queue and force the
2585 	 * engine to reset.
2586 	 */
2587 	execlists->csb_head = tail;
2588 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2589 
2590 	/*
2591 	 * Hopefully paired with a wmb() in HW!
2592 	 *
2593 	 * We must complete the read of the write pointer before any reads
2594 	 * from the CSB, so that we do not see stale values. Without an rmb
2595 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2596 	 * we perform the READ_ONCE(*csb_write).
2597 	 */
2598 	rmb();
2599 	do {
2600 		bool promote;
2601 
2602 		if (++head == num_entries)
2603 			head = 0;
2604 
2605 		/*
2606 		 * We are flying near dragons again.
2607 		 *
2608 		 * We hold a reference to the request in execlist_port[]
2609 		 * but no more than that. We are operating in softirq
2610 		 * context and so cannot hold any mutex or sleep. That
2611 		 * prevents us stopping the requests we are processing
2612 		 * in port[] from being retired simultaneously (the
2613 		 * breadcrumb will be complete before we see the
2614 		 * context-switch). As we only hold the reference to the
2615 		 * request, any pointer chasing underneath the request
2616 		 * is subject to a potential use-after-free. Thus we
2617 		 * store all of the bookkeeping within port[] as
2618 		 * required, and avoid using unguarded pointers beneath
2619 		 * request itself. The same applies to the atomic
2620 		 * status notifier.
2621 		 */
2622 
2623 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2624 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2625 
2626 		if (INTEL_GEN(engine->i915) >= 12)
2627 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2628 		else
2629 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2630 		if (promote) {
2631 			struct i915_request * const *old = execlists->active;
2632 
2633 			if (GEM_WARN_ON(!*execlists->pending)) {
2634 				execlists->error_interrupt |= ERROR_CSB;
2635 				break;
2636 			}
2637 
2638 			ring_set_paused(engine, 0);
2639 
2640 			/* Point active to the new ELSP; prevent overwriting */
2641 			WRITE_ONCE(execlists->active, execlists->pending);
2642 			smp_wmb(); /* notify execlists_active() */
2643 
2644 			/* cancel old inflight, prepare for switch */
2645 			trace_ports(execlists, "preempted", old);
2646 			while (*old)
2647 				execlists_schedule_out(*old++);
2648 
2649 			/* switch pending to inflight */
2650 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2651 			memcpy(execlists->inflight,
2652 			       execlists->pending,
2653 			       execlists_num_ports(execlists) *
2654 			       sizeof(*execlists->pending));
2655 			smp_wmb(); /* complete the seqlock */
2656 			WRITE_ONCE(execlists->active, execlists->inflight);
2657 
2658 			WRITE_ONCE(execlists->pending[0], NULL);
2659 		} else {
2660 			if (GEM_WARN_ON(!*execlists->active)) {
2661 				execlists->error_interrupt |= ERROR_CSB;
2662 				break;
2663 			}
2664 
2665 			/* port0 completed, advanced to port1 */
2666 			trace_ports(execlists, "completed", execlists->active);
2667 
2668 			/*
2669 			 * We rely on the hardware being strongly
2670 			 * ordered, that the breadcrumb write is
2671 			 * coherent (visible from the CPU) before the
2672 			 * user interrupt is processed. One might assume
2673 			 * that, since the breadcrumb write precedes the
2674 			 * user interrupt, which precedes the CS event for
2675 			 * the context switch, the breadcrumb would be
2676 			 * visible before the CS event itself...
2677 			 */
2678 			if (GEM_SHOW_DEBUG() &&
2679 			    !i915_request_completed(*execlists->active)) {
2680 				struct i915_request *rq = *execlists->active;
2681 				const u32 *regs __maybe_unused =
2682 					rq->context->lrc_reg_state;
2683 
2684 				ENGINE_TRACE(engine,
2685 					     "context completed before request!\n");
2686 				ENGINE_TRACE(engine,
2687 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2688 					     ENGINE_READ(engine, RING_START),
2689 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2690 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2691 					     ENGINE_READ(engine, RING_CTL),
2692 					     ENGINE_READ(engine, RING_MI_MODE));
2693 				ENGINE_TRACE(engine,
2694 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2695 					     i915_ggtt_offset(rq->ring->vma),
2696 					     rq->head, rq->tail,
2697 					     rq->fence.context,
2698 					     lower_32_bits(rq->fence.seqno),
2699 					     hwsp_seqno(rq));
2700 				ENGINE_TRACE(engine,
2701 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2702 					     regs[CTX_RING_START],
2703 					     regs[CTX_RING_HEAD],
2704 					     regs[CTX_RING_TAIL]);
2705 			}
2706 
2707 			execlists_schedule_out(*execlists->active++);
2708 
2709 			GEM_BUG_ON(execlists->active - execlists->inflight >
2710 				   execlists_num_ports(execlists));
2711 		}
2712 	} while (head != tail);
2713 
2714 	set_timeslice(engine);
2715 
2716 	/*
2717 	 * Gen11 has proven to fail, wrt the global observation point, on
2718 	 * the ordering between the CSB entry write and the tail update,
2719 	 * and thus we may see a stale entry in the context status buffer.
2720 	 *
2721 	 * Forcibly evict the entries before the next gpu csb update,
2722 	 * to increase the odds that we get fresh entries even with
2723 	 * non-working hardware. The cost of doing so comes out mostly in
2724 	 * the wash as hardware, working or not, will need to do the
2725 	 * invalidation before.
2726 	 */
2727 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2728 }
2729 
2730 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2731 {
2732 	lockdep_assert_held(&engine->active.lock);
2733 	if (!READ_ONCE(engine->execlists.pending[0])) {
2734 		rcu_read_lock(); /* protect peeking at execlists->active */
2735 		execlists_dequeue(engine);
2736 		rcu_read_unlock();
2737 	}
2738 }
2739 
2740 static void __execlists_hold(struct i915_request *rq)
2741 {
2742 	LIST_HEAD(list);
2743 
2744 	do {
2745 		struct i915_dependency *p;
2746 
2747 		if (i915_request_is_active(rq))
2748 			__i915_request_unsubmit(rq);
2749 
2750 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2751 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2752 		i915_request_set_hold(rq);
2753 		RQ_TRACE(rq, "on hold\n");
2754 
2755 		for_each_waiter(p, rq) {
2756 			struct i915_request *w =
2757 				container_of(p->waiter, typeof(*w), sched);
2758 
2759 			/* Leave semaphores spinning on the other engines */
2760 			if (w->engine != rq->engine)
2761 				continue;
2762 
2763 			if (!i915_request_is_ready(w))
2764 				continue;
2765 
2766 			if (i915_request_completed(w))
2767 				continue;
2768 
2769 			if (i915_request_on_hold(w))
2770 				continue;
2771 
2772 			list_move_tail(&w->sched.link, &list);
2773 		}
2774 
2775 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2776 	} while (rq);
2777 }
2778 
2779 static bool execlists_hold(struct intel_engine_cs *engine,
2780 			   struct i915_request *rq)
2781 {
2782 	spin_lock_irq(&engine->active.lock);
2783 
2784 	if (i915_request_completed(rq)) { /* too late! */
2785 		rq = NULL;
2786 		goto unlock;
2787 	}
2788 
2789 	if (rq->engine != engine) { /* preempted virtual engine */
2790 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2791 
2792 		/*
2793 		 * intel_context_inflight() is only protected by virtue
2794 		 * of process_csb() being called only by the tasklet (or
2795 		 * directly from inside reset while the tasklet is suspended).
2796 		 * Assert that neither of those are allowed to run while we
2797 		 * poke at the request queues.
2798 		 */
2799 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2800 
2801 		/*
2802 		 * An unsubmitted request along a virtual engine will
2803 		 * remain on the active (this) engine until we are able
2804 		 * to process the context switch away (and so mark the
2805 		 * context as no longer in flight). That cannot have happened
2806 		 * yet, otherwise we would not be hanging!
2807 		 */
2808 		spin_lock(&ve->base.active.lock);
2809 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2810 		GEM_BUG_ON(ve->request != rq);
2811 		ve->request = NULL;
2812 		spin_unlock(&ve->base.active.lock);
2813 		i915_request_put(rq);
2814 
2815 		rq->engine = engine;
2816 	}
2817 
2818 	/*
2819 	 * Transfer this request onto the hold queue to prevent it
2820 	 * being resubmitted to HW (and potentially completed) before we have
2821 	 * released it. Since we may have already submitted following
2822 	 * requests, we need to remove those as well.
2823 	 */
2824 	GEM_BUG_ON(i915_request_on_hold(rq));
2825 	GEM_BUG_ON(rq->engine != engine);
2826 	__execlists_hold(rq);
2827 	GEM_BUG_ON(list_empty(&engine->active.hold));
2828 
2829 unlock:
2830 	spin_unlock_irq(&engine->active.lock);
2831 	return rq;
2832 }
2833 
2834 static bool hold_request(const struct i915_request *rq)
2835 {
2836 	struct i915_dependency *p;
2837 	bool result = false;
2838 
2839 	/*
2840 	 * If one of our ancestors is on hold, we must also be on hold,
2841 	 * otherwise we will bypass it and execute before it.
2842 	 */
2843 	rcu_read_lock();
2844 	for_each_signaler(p, rq) {
2845 		const struct i915_request *s =
2846 			container_of(p->signaler, typeof(*s), sched);
2847 
2848 		if (s->engine != rq->engine)
2849 			continue;
2850 
2851 		result = i915_request_on_hold(s);
2852 		if (result)
2853 			break;
2854 	}
2855 	rcu_read_unlock();
2856 
2857 	return result;
2858 }
2859 
2860 static void __execlists_unhold(struct i915_request *rq)
2861 {
2862 	LIST_HEAD(list);
2863 
2864 	do {
2865 		struct i915_dependency *p;
2866 
2867 		RQ_TRACE(rq, "hold release\n");
2868 
2869 		GEM_BUG_ON(!i915_request_on_hold(rq));
2870 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2871 
2872 		i915_request_clear_hold(rq);
2873 		list_move_tail(&rq->sched.link,
2874 			       i915_sched_lookup_priolist(rq->engine,
2875 							  rq_prio(rq)));
2876 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2877 
2878 		/* Also release any children on this engine that are ready */
2879 		for_each_waiter(p, rq) {
2880 			struct i915_request *w =
2881 				container_of(p->waiter, typeof(*w), sched);
2882 
2883 			/* Propagate any change in error status */
2884 			if (rq->fence.error)
2885 				i915_request_set_error_once(w, rq->fence.error);
2886 
2887 			if (w->engine != rq->engine)
2888 				continue;
2889 
2890 			if (!i915_request_on_hold(w))
2891 				continue;
2892 
2893 			/* Check that no other parents are also on hold */
2894 			if (hold_request(w))
2895 				continue;
2896 
2897 			list_move_tail(&w->sched.link, &list);
2898 		}
2899 
2900 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2901 	} while (rq);
2902 }
2903 
2904 static void execlists_unhold(struct intel_engine_cs *engine,
2905 			     struct i915_request *rq)
2906 {
2907 	spin_lock_irq(&engine->active.lock);
2908 
2909 	/*
2910 	 * Move this request back to the priority queue, and all of its
2911 	 * children and grandchildren that were suspended along with it.
2912 	 */
2913 	__execlists_unhold(rq);
2914 
2915 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2916 		engine->execlists.queue_priority_hint = rq_prio(rq);
2917 		tasklet_hi_schedule(&engine->execlists.tasklet);
2918 	}
2919 
2920 	spin_unlock_irq(&engine->active.lock);
2921 }
2922 
2923 struct execlists_capture {
2924 	struct work_struct work;
2925 	struct i915_request *rq;
2926 	struct i915_gpu_coredump *error;
2927 };
2928 
2929 static void execlists_capture_work(struct work_struct *work)
2930 {
2931 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2932 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2933 	struct intel_engine_cs *engine = cap->rq->engine;
2934 	struct intel_gt_coredump *gt = cap->error->gt;
2935 	struct intel_engine_capture_vma *vma;
2936 
2937 	/* Compress all the objects attached to the request, slow! */
2938 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2939 	if (vma) {
2940 		struct i915_vma_compress *compress =
2941 			i915_vma_capture_prepare(gt);
2942 
2943 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2944 		i915_vma_capture_finish(gt, compress);
2945 	}
2946 
2947 	gt->simulated = gt->engine->simulated;
2948 	cap->error->simulated = gt->simulated;
2949 
2950 	/* Publish the error state, and announce it to the world */
2951 	i915_error_state_store(cap->error);
2952 	i915_gpu_coredump_put(cap->error);
2953 
2954 	/* Return this request and all that depend upon it for signaling */
2955 	execlists_unhold(engine, cap->rq);
2956 	i915_request_put(cap->rq);
2957 
2958 	kfree(cap);
2959 }
2960 
2961 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2962 {
2963 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2964 	struct execlists_capture *cap;
2965 
2966 	cap = kmalloc(sizeof(*cap), gfp);
2967 	if (!cap)
2968 		return NULL;
2969 
2970 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2971 	if (!cap->error)
2972 		goto err_cap;
2973 
2974 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2975 	if (!cap->error->gt)
2976 		goto err_gpu;
2977 
2978 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2979 	if (!cap->error->gt->engine)
2980 		goto err_gt;
2981 
2982 	return cap;
2983 
2984 err_gt:
2985 	kfree(cap->error->gt);
2986 err_gpu:
2987 	kfree(cap->error);
2988 err_cap:
2989 	kfree(cap);
2990 	return NULL;
2991 }
2992 
2993 static struct i915_request *
2994 active_context(struct intel_engine_cs *engine, u32 ccid)
2995 {
2996 	const struct intel_engine_execlists * const el = &engine->execlists;
2997 	struct i915_request * const *port, *rq;
2998 
2999 	/*
3000 	 * Use the most recent result from process_csb(), but just in case
3001 	 * we trigger an error (via interrupt) before the first CS event has
3002 	 * been written, peek at the next submission.
3003 	 */
3004 
3005 	for (port = el->active; (rq = *port); port++) {
3006 		if (rq->context->lrc.ccid == ccid) {
3007 			ENGINE_TRACE(engine,
3008 				     "ccid found at active:%zd\n",
3009 				     port - el->active);
3010 			return rq;
3011 		}
3012 	}
3013 
3014 	for (port = el->pending; (rq = *port); port++) {
3015 		if (rq->context->lrc.ccid == ccid) {
3016 			ENGINE_TRACE(engine,
3017 				     "ccid found at pending:%zd\n",
3018 				     port - el->pending);
3019 			return rq;
3020 		}
3021 	}
3022 
3023 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3024 	return NULL;
3025 }
3026 
3027 static u32 active_ccid(struct intel_engine_cs *engine)
3028 {
3029 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3030 }
3031 
3032 static void execlists_capture(struct intel_engine_cs *engine)
3033 {
3034 	struct execlists_capture *cap;
3035 
3036 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3037 		return;
3038 
3039 	/*
3040 	 * We need to _quickly_ capture the engine state before we reset.
3041 	 * We are inside an atomic section (softirq) here and we are delaying
3042 	 * the forced preemption event.
3043 	 */
3044 	cap = capture_regs(engine);
3045 	if (!cap)
3046 		return;
3047 
3048 	spin_lock_irq(&engine->active.lock);
3049 	cap->rq = active_context(engine, active_ccid(engine));
3050 	if (cap->rq) {
3051 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3052 		cap->rq = i915_request_get_rcu(cap->rq);
3053 	}
3054 	spin_unlock_irq(&engine->active.lock);
3055 	if (!cap->rq)
3056 		goto err_free;
3057 
3058 	/*
3059 	 * Remove the request from the execlists queue, and take ownership
3060 	 * of the request. We pass it to our worker who will _slowly_ compress
3061 	 * all the pages the _user_ requested for debugging their batch, after
3062 	 * which we return it to the queue for signaling.
3063 	 *
3064 	 * By removing them from the execlists queue, we also remove the
3065 	 * requests from being processed by __unwind_incomplete_requests()
3066 	 * during the intel_engine_reset(), and so they will *not* be replayed
3067 	 * afterwards.
3068 	 *
3069 	 * Note that because we have not yet reset the engine at this point,
3070 	 * it is possible that the request we have identified as being
3071 	 * guilty did in fact complete, and we will then hit an arbitration
3072 	 * point allowing the outstanding preemption to succeed. The likelihood
3073 	 * of that is very low (as capturing of the engine registers should be
3074 	 * fast enough to run inside an irq-off atomic section!), so we will
3075 	 * simply hold that request accountable for being non-preemptible
3076 	 * long enough to force the reset.
3077 	 */
3078 	if (!execlists_hold(engine, cap->rq))
3079 		goto err_rq;
3080 
3081 	INIT_WORK(&cap->work, execlists_capture_work);
3082 	schedule_work(&cap->work);
3083 	return;
3084 
3085 err_rq:
3086 	i915_request_put(cap->rq);
3087 err_free:
3088 	i915_gpu_coredump_put(cap->error);
3089 	kfree(cap);
3090 }
3091 
3092 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3093 {
3094 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3095 	unsigned long *lock = &engine->gt->reset.flags;
3096 
3097 	if (!intel_has_reset_engine(engine->gt))
3098 		return;
3099 
3100 	if (test_and_set_bit(bit, lock))
3101 		return;
3102 
3103 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3104 
3105 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3106 	tasklet_disable_nosync(&engine->execlists.tasklet);
3107 
3108 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3109 	execlists_capture(engine);
3110 	intel_engine_reset(engine, msg);
3111 
3112 	tasklet_enable(&engine->execlists.tasklet);
3113 	clear_and_wake_up_bit(bit, lock);
3114 }
3115 
3116 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3117 {
3118 	const struct timer_list *t = &engine->execlists.preempt;
3119 
3120 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3121 		return false;
3122 
3123 	if (!timer_expired(t))
3124 		return false;
3125 
3126 	return READ_ONCE(engine->execlists.pending[0]);
3127 }
3128 
3129 /*
3130  * Check the unread Context Status Buffers and manage the submission of new
3131  * contexts to the ELSP accordingly.
3132  */
3133 static void execlists_submission_tasklet(unsigned long data)
3134 {
3135 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3136 	bool timeout = preempt_timeout(engine);
3137 
3138 	process_csb(engine);
3139 
3140 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3141 		const char *msg;
3142 
3143 		/* Generate the error message in priority wrt the user! */
3144 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3145 			msg = "CS error"; /* thrown by a user payload */
3146 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3147 			msg = "invalid CSB event";
3148 		else
3149 			msg = "internal error";
3150 
3151 		engine->execlists.error_interrupt = 0;
3152 		execlists_reset(engine, msg);
3153 	}
3154 
3155 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3156 		unsigned long flags;
3157 
3158 		spin_lock_irqsave(&engine->active.lock, flags);
3159 		__execlists_submission_tasklet(engine);
3160 		spin_unlock_irqrestore(&engine->active.lock, flags);
3161 
3162 		/* Recheck after serialising with direct-submission */
3163 		if (unlikely(timeout && preempt_timeout(engine)))
3164 			execlists_reset(engine, "preemption time out");
3165 	}
3166 }
3167 
3168 static void __execlists_kick(struct intel_engine_execlists *execlists)
3169 {
3170 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3171 	tasklet_hi_schedule(&execlists->tasklet);
3172 }
3173 
3174 #define execlists_kick(t, member) \
3175 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3176 
3177 static void execlists_timeslice(struct timer_list *timer)
3178 {
3179 	execlists_kick(timer, timer);
3180 }
3181 
3182 static void execlists_preempt(struct timer_list *timer)
3183 {
3184 	execlists_kick(timer, preempt);
3185 }
3186 
3187 static void queue_request(struct intel_engine_cs *engine,
3188 			  struct i915_request *rq)
3189 {
3190 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3191 	list_add_tail(&rq->sched.link,
3192 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3193 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3194 }
3195 
3196 static void __submit_queue_imm(struct intel_engine_cs *engine)
3197 {
3198 	struct intel_engine_execlists * const execlists = &engine->execlists;
3199 
3200 	if (reset_in_progress(execlists))
3201 		return; /* defer until we restart the engine following reset */
3202 
3203 	__execlists_submission_tasklet(engine);
3204 }
3205 
3206 static void submit_queue(struct intel_engine_cs *engine,
3207 			 const struct i915_request *rq)
3208 {
3209 	struct intel_engine_execlists *execlists = &engine->execlists;
3210 
3211 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3212 		return;
3213 
3214 	execlists->queue_priority_hint = rq_prio(rq);
3215 	__submit_queue_imm(engine);
3216 }
3217 
3218 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3219 			     const struct i915_request *rq)
3220 {
3221 	GEM_BUG_ON(i915_request_on_hold(rq));
3222 	return !list_empty(&engine->active.hold) && hold_request(rq);
3223 }
3224 
3225 static void flush_csb(struct intel_engine_cs *engine)
3226 {
3227 	struct intel_engine_execlists *el = &engine->execlists;
3228 
3229 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3230 		if (!reset_in_progress(el))
3231 			process_csb(engine);
3232 		tasklet_unlock(&el->tasklet);
3233 	}
3234 }
3235 
3236 static void execlists_submit_request(struct i915_request *request)
3237 {
3238 	struct intel_engine_cs *engine = request->engine;
3239 	unsigned long flags;
3240 
3241 	/* Hopefully we clear execlists->pending[] to let us through */
3242 	flush_csb(engine);
3243 
3244 	/* Will be called from irq-context when using foreign fences. */
3245 	spin_lock_irqsave(&engine->active.lock, flags);
3246 
3247 	if (unlikely(ancestor_on_hold(engine, request))) {
3248 		RQ_TRACE(request, "ancestor on hold\n");
3249 		list_add_tail(&request->sched.link, &engine->active.hold);
3250 		i915_request_set_hold(request);
3251 	} else {
3252 		queue_request(engine, request);
3253 
3254 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3255 		GEM_BUG_ON(list_empty(&request->sched.link));
3256 
3257 		submit_queue(engine, request);
3258 	}
3259 
3260 	spin_unlock_irqrestore(&engine->active.lock, flags);
3261 }
3262 
3263 static void __execlists_context_fini(struct intel_context *ce)
3264 {
3265 	intel_ring_put(ce->ring);
3266 	i915_vma_put(ce->state);
3267 }
3268 
3269 static void execlists_context_destroy(struct kref *kref)
3270 {
3271 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3272 
3273 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3274 	GEM_BUG_ON(intel_context_is_pinned(ce));
3275 
3276 	if (ce->state)
3277 		__execlists_context_fini(ce);
3278 
3279 	intel_context_fini(ce);
3280 	intel_context_free(ce);
3281 }
3282 
3283 static void
3284 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3285 {
3286 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3287 		return;
3288 
3289 	vaddr += engine->context_size;
3290 
3291 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3292 }
3293 
3294 static void
3295 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3296 {
3297 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3298 		return;
3299 
3300 	vaddr += engine->context_size;
3301 
3302 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3303 		drm_err_once(&engine->i915->drm,
3304 			     "%s context redzone overwritten!\n",
3305 			     engine->name);
3306 }
3307 
3308 static void execlists_context_unpin(struct intel_context *ce)
3309 {
3310 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3311 		      ce->engine);
3312 
3313 	i915_gem_object_unpin_map(ce->state->obj);
3314 }
3315 
3316 static u32 *
3317 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3318 {
3319 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3320 		MI_SRM_LRM_GLOBAL_GTT |
3321 		MI_LRI_LRM_CS_MMIO;
3322 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3323 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3324 		CTX_TIMESTAMP * sizeof(u32);
3325 	*cs++ = 0;
3326 
3327 	*cs++ = MI_LOAD_REGISTER_REG |
3328 		MI_LRR_SOURCE_CS_MMIO |
3329 		MI_LRI_LRM_CS_MMIO;
3330 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3331 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3332 
3333 	*cs++ = MI_LOAD_REGISTER_REG |
3334 		MI_LRR_SOURCE_CS_MMIO |
3335 		MI_LRI_LRM_CS_MMIO;
3336 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3337 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3338 
3339 	return cs;
3340 }
3341 
3342 static u32 *
3343 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3344 {
3345 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3346 
3347 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3348 		MI_SRM_LRM_GLOBAL_GTT |
3349 		MI_LRI_LRM_CS_MMIO;
3350 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3351 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3352 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3353 	*cs++ = 0;
3354 
3355 	return cs;
3356 }
3357 
3358 static u32 *
3359 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3360 {
3361 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3362 
3363 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3364 		MI_SRM_LRM_GLOBAL_GTT |
3365 		MI_LRI_LRM_CS_MMIO;
3366 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3367 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3368 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3369 	*cs++ = 0;
3370 
3371 	*cs++ = MI_LOAD_REGISTER_REG |
3372 		MI_LRR_SOURCE_CS_MMIO |
3373 		MI_LRI_LRM_CS_MMIO;
3374 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3375 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3376 
3377 	return cs;
3378 }
3379 
3380 static u32 *
3381 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3382 {
3383 	cs = gen12_emit_timestamp_wa(ce, cs);
3384 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3385 	cs = gen12_emit_restore_scratch(ce, cs);
3386 
3387 	return cs;
3388 }
3389 
3390 static u32 *
3391 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3392 {
3393 	cs = gen12_emit_timestamp_wa(ce, cs);
3394 	cs = gen12_emit_restore_scratch(ce, cs);
3395 
3396 	return cs;
3397 }
3398 
3399 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3400 {
3401 	return PAGE_SIZE * ce->wa_bb_page;
3402 }
3403 
3404 static u32 *context_indirect_bb(const struct intel_context *ce)
3405 {
3406 	void *ptr;
3407 
3408 	GEM_BUG_ON(!ce->wa_bb_page);
3409 
3410 	ptr = ce->lrc_reg_state;
3411 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3412 	ptr += context_wa_bb_offset(ce);
3413 
3414 	return ptr;
3415 }
3416 
3417 static void
3418 setup_indirect_ctx_bb(const struct intel_context *ce,
3419 		      const struct intel_engine_cs *engine,
3420 		      u32 *(*emit)(const struct intel_context *, u32 *))
3421 {
3422 	u32 * const start = context_indirect_bb(ce);
3423 	u32 *cs;
3424 
3425 	cs = emit(ce, start);
3426 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3427 	while ((unsigned long)cs % CACHELINE_BYTES)
3428 		*cs++ = MI_NOOP;
3429 
3430 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3431 				    i915_ggtt_offset(ce->state) +
3432 				    context_wa_bb_offset(ce),
3433 				    (cs - start) * sizeof(*cs));
3434 }
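
/*
 * Editorial summary (illustration only): the per-context indirect batch
 * lives at PAGE_SIZE * ce->wa_bb_page into the context image. For the
 * render class the emitted sequence is gen12_emit_timestamp_wa(),
 * gen12_emit_cmd_buf_wa() and gen12_emit_restore_scratch(); the other
 * classes skip the CMD_BUF_CCTL restore. The buffer is padded with
 * MI_NOOP to a cacheline before being registered via
 * lrc_ring_setup_indirect_ctx().
 */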
3435 
3436 static void
3437 __execlists_update_reg_state(const struct intel_context *ce,
3438 			     const struct intel_engine_cs *engine,
3439 			     u32 head)
3440 {
3441 	struct intel_ring *ring = ce->ring;
3442 	u32 *regs = ce->lrc_reg_state;
3443 
3444 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3445 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3446 
3447 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3448 	regs[CTX_RING_HEAD] = head;
3449 	regs[CTX_RING_TAIL] = ring->tail;
3450 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3451 
3452 	/* RPCS */
3453 	if (engine->class == RENDER_CLASS) {
3454 		regs[CTX_R_PWR_CLK_STATE] =
3455 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3456 
3457 		i915_oa_init_reg_state(ce, engine);
3458 	}
3459 
3460 	if (ce->wa_bb_page) {
3461 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3462 
3463 		fn = gen12_emit_indirect_ctx_xcs;
3464 		if (ce->engine->class == RENDER_CLASS)
3465 			fn = gen12_emit_indirect_ctx_rcs;
3466 
3467 		/* Mutually exclusive wrt to global indirect bb */
3468 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3469 		setup_indirect_ctx_bb(ce, engine, fn);
3470 	}
3471 }
3472 
3473 static int
3474 __execlists_context_pin(struct intel_context *ce,
3475 			struct intel_engine_cs *engine)
3476 {
3477 	void *vaddr;
3478 
3479 	GEM_BUG_ON(!ce->state);
3480 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3481 
3482 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3483 					i915_coherent_map_type(engine->i915) |
3484 					I915_MAP_OVERRIDE);
3485 	if (IS_ERR(vaddr))
3486 		return PTR_ERR(vaddr);
3487 
3488 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3489 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3490 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3491 
3492 	return 0;
3493 }
3494 
3495 static int execlists_context_pin(struct intel_context *ce)
3496 {
3497 	return __execlists_context_pin(ce, ce->engine);
3498 }
3499 
3500 static int execlists_context_alloc(struct intel_context *ce)
3501 {
3502 	return __execlists_context_alloc(ce, ce->engine);
3503 }
3504 
3505 static void execlists_context_reset(struct intel_context *ce)
3506 {
3507 	CE_TRACE(ce, "reset\n");
3508 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3509 
3510 	intel_ring_reset(ce->ring, ce->ring->emit);
3511 
3512 	/* Scrub away the garbage */
3513 	execlists_init_reg_state(ce->lrc_reg_state,
3514 				 ce, ce->engine, ce->ring, true);
3515 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3516 
3517 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3518 }
3519 
3520 static const struct intel_context_ops execlists_context_ops = {
3521 	.alloc = execlists_context_alloc,
3522 
3523 	.pin = execlists_context_pin,
3524 	.unpin = execlists_context_unpin,
3525 
3526 	.enter = intel_context_enter_engine,
3527 	.exit = intel_context_exit_engine,
3528 
3529 	.reset = execlists_context_reset,
3530 	.destroy = execlists_context_destroy,
3531 };
3532 
3533 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3534 {
3535 	u32 *cs;
3536 
3537 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3538 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3539 		return 0;
3540 
3541 	cs = intel_ring_begin(rq, 6);
3542 	if (IS_ERR(cs))
3543 		return PTR_ERR(cs);
3544 
3545 	/*
3546 	 * Check if we have been preempted before we even get started.
3547 	 *
3548 	 * After this point i915_request_started() reports true, even if
3549 	 * we get preempted and so are no longer running.
3550 	 */
3551 	*cs++ = MI_ARB_CHECK;
3552 	*cs++ = MI_NOOP;
3553 
3554 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3555 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3556 	*cs++ = 0;
3557 	*cs++ = rq->fence.seqno - 1;
3558 
3559 	intel_ring_advance(rq, cs);
3560 
3561 	/* Record the updated position of the request's payload */
3562 	rq->infix = intel_ring_offset(rq, cs);
3563 
3564 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3565 
3566 	return 0;
3567 }
3568 
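/*
 * With a 3-level (32b) ppgtt the four PDP registers are reloaded from the
 * request itself, as the page directories may have been (re)allocated since
 * the context image was initialised. Only used for a !4-level vm, see
 * execlists_request_alloc().
 */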
3569 static int emit_pdps(struct i915_request *rq)
3570 {
3571 	const struct intel_engine_cs * const engine = rq->engine;
3572 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3573 	int err, i;
3574 	u32 *cs;
3575 
3576 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3577 
3578 	/*
3579 	 * Beware ye of the dragons, this sequence is magic!
3580 	 *
3581 	 * Small changes to this sequence can cause anything from
3582 	 * GPU hangs to forcewake errors and machine lockups!
3583 	 */
3584 
3585 	/* Flush any residual operations from the context load */
3586 	err = engine->emit_flush(rq, EMIT_FLUSH);
3587 	if (err)
3588 		return err;
3589 
3590 	/* Magic required to prevent forcewake errors! */
3591 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3592 	if (err)
3593 		return err;
3594 
3595 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3596 	if (IS_ERR(cs))
3597 		return PTR_ERR(cs);
3598 
3599 	/* Ensure the LRI have landed before we invalidate & continue */
3600 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3601 	for (i = GEN8_3LVL_PDPES; i--; ) {
3602 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3603 		u32 base = engine->mmio_base;
3604 
3605 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3606 		*cs++ = upper_32_bits(pd_daddr);
3607 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3608 		*cs++ = lower_32_bits(pd_daddr);
3609 	}
3610 	*cs++ = MI_NOOP;
3611 
3612 	intel_ring_advance(rq, cs);
3613 
3614 	return 0;
3615 }
3616 
3617 static int execlists_request_alloc(struct i915_request *request)
3618 {
3619 	int ret;
3620 
3621 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3622 
3623 	/*
3624 	 * Flush enough space to reduce the likelihood of waiting after
3625 	 * we start building the request - in which case we will just
3626 	 * have to repeat work.
3627 	 */
3628 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3629 
3630 	/*
3631 	 * Note that after this point, we have committed to using
3632 	 * this request as it is being used to both track the
3633 	 * state of engine initialisation and liveness of the
3634 	 * golden renderstate above. Think twice before you try
3635 	 * to cancel/unwind this request now.
3636 	 */
3637 
3638 	if (!i915_vm_is_4lvl(request->context->vm)) {
3639 		ret = emit_pdps(request);
3640 		if (ret)
3641 			return ret;
3642 	}
3643 
3644 	/* Unconditionally invalidate GPU caches and TLBs. */
3645 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3646 	if (ret)
3647 		return ret;
3648 
3649 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3650 	return 0;
3651 }
3652 
3653 /*
3654  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3655  * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
3656  * but there is a slight complication as this is applied in a WA batch where the
3657  * values are only initialized once so we cannot read the register value at the
3658  * beginning and reuse it further; hence we save its value to memory, upload a
3659  * constant value with bit21 set and then restore the saved value afterwards.
3660  * To simplify the WA, a constant value is formed by using the default value
3661  * of this register. This shouldn't be a problem because we are only modifying
3662  * it for a short period and this batch is non-preemptible. We could of course
3663  * use additional instructions that read the actual value of the register
3664  * at that time and set our bit of interest, but that makes the WA complicated.
3665  *
3666  * This WA is also required for Gen9 so extracting as a function avoids
3667  * code duplication.
3668  */
3669 static u32 *
3670 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3671 {
3672 	/* NB no one else is allowed to scribble over scratch + 256! */
3673 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3674 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3675 	*batch++ = intel_gt_scratch_offset(engine->gt,
3676 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3677 	*batch++ = 0;
3678 
3679 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3680 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3681 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3682 
3683 	batch = gen8_emit_pipe_control(batch,
3684 				       PIPE_CONTROL_CS_STALL |
3685 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3686 				       0);
3687 
3688 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3689 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3690 	*batch++ = intel_gt_scratch_offset(engine->gt,
3691 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3692 	*batch++ = 0;
3693 
3694 	return batch;
3695 }
3696 
3697 /*
3698  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3699  * initialized at the beginning and shared across all contexts, but this field
3700  * helps us to have multiple batches at different offsets and select them based
3701  * on some criteria. At the moment this batch always starts at the beginning of
3702  * the page and we don't have multiple wa_ctx batch buffers.
3703  *
3704  * The number of WAs applied is not known at the beginning; we use this field
3705  * to return the number of DWORDs written.
3706  *
3707  * Note that this batch does not contain MI_BATCH_BUFFER_END, so NOOPs are
3708  * added as padding to make it cacheline aligned.
3709  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
3710  * together make a complete batch buffer.
3711  */
3712 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3713 {
3714 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3715 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3716 
3717 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3718 	if (IS_BROADWELL(engine->i915))
3719 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3720 
3721 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3722 	/* Actual scratch location is at 128 bytes offset */
3723 	batch = gen8_emit_pipe_control(batch,
3724 				       PIPE_CONTROL_FLUSH_L3 |
3725 				       PIPE_CONTROL_STORE_DATA_INDEX |
3726 				       PIPE_CONTROL_CS_STALL |
3727 				       PIPE_CONTROL_QW_WRITE,
3728 				       LRC_PPHWSP_SCRATCH_ADDR);
3729 
3730 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3731 
3732 	/* Pad to end of cacheline */
3733 	while ((unsigned long)batch % CACHELINE_BYTES)
3734 		*batch++ = MI_NOOP;
3735 
3736 	/*
3737 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3738 	 * execution depends on the length specified in terms of cache lines
3739 	 * in the register CTX_RCS_INDIRECT_CTX
3740 	 */
3741 
3742 	return batch;
3743 }
3744 
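/*
 * Helper for emitting a small table of (register, value) pairs as a single
 * MI_LOAD_REGISTER_IMM; the command encodes at most 63 pairs, hence the
 * bounds check in emit_lri(). Used as:
 *	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
 */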
3745 struct lri {
3746 	i915_reg_t reg;
3747 	u32 value;
3748 };
3749 
3750 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3751 {
3752 	GEM_BUG_ON(!count || count > 63);
3753 
3754 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3755 	do {
3756 		*batch++ = i915_mmio_reg_offset(lri->reg);
3757 		*batch++ = lri->value;
3758 	} while (lri++, --count);
3759 	*batch++ = MI_NOOP;
3760 
3761 	return batch;
3762 }
3763 
3764 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3765 {
3766 	static const struct lri lri[] = {
3767 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3768 		{
3769 			COMMON_SLICE_CHICKEN2,
3770 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3771 				       0),
3772 		},
3773 
3774 		/* BSpec: 11391 */
3775 		{
3776 			FF_SLICE_CHICKEN,
3777 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3778 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3779 		},
3780 
3781 		/* BSpec: 11299 */
3782 		{
3783 			_3D_CHICKEN3,
3784 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3785 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3786 		}
3787 	};
3788 
3789 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3790 
3791 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3792 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3793 
3794 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3795 	batch = gen8_emit_pipe_control(batch,
3796 				       PIPE_CONTROL_FLUSH_L3 |
3797 				       PIPE_CONTROL_STORE_DATA_INDEX |
3798 				       PIPE_CONTROL_CS_STALL |
3799 				       PIPE_CONTROL_QW_WRITE,
3800 				       LRC_PPHWSP_SCRATCH_ADDR);
3801 
3802 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3803 
3804 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3805 	if (HAS_POOLED_EU(engine->i915)) {
3806 		/*
3807 		 * EU pool configuration is set up along with the golden
3808 		 * context during context initialization. This value depends
3809 		 * on the device type (2x6 or 3x6) and needs to be updated
3810 		 * based on which subslice is disabled, especially for 2x6
3811 		 * devices. However, it is safe to load the default 3x6
3812 		 * configuration instead of masking off the corresponding
3813 		 * bits, because the HW ignores the bits of a disabled
3814 		 * subslice and drops down to the appropriate config. Please
3815 		 * see render_state_setup() in i915_gem_render_state.c for
3816 		 * the possible configurations; to avoid duplication they
3817 		 * are not shown here again.
3818 		 */
3819 		*batch++ = GEN9_MEDIA_POOL_STATE;
3820 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3821 		*batch++ = 0x00777000;
3822 		*batch++ = 0;
3823 		*batch++ = 0;
3824 		*batch++ = 0;
3825 	}
3826 
3827 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3828 
3829 	/* Pad to end of cacheline */
3830 	while ((unsigned long)batch % CACHELINE_BYTES)
3831 		*batch++ = MI_NOOP;
3832 
3833 	return batch;
3834 }
3835 
3836 static u32 *
3837 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3838 {
3839 	int i;
3840 
3841 	/*
3842 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3843 	 *
3844 	 * Ensure the engine is idle prior to programming a
3845 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3846 	 */
3847 	batch = gen8_emit_pipe_control(batch,
3848 				       PIPE_CONTROL_CS_STALL,
3849 				       0);
3850 	/*
3851 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3852 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3853 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3854 	 * confusing. Since gen8_emit_pipe_control() already advances the
3855 	 * batch by 6 dwords, we advance the other 10 here, completing a
3856 	 * cacheline. It's not clear if the workaround requires this padding
3857 	 * before other commands, or if it's just the regular padding we would
3858 	 * already have for the workaround bb, so leave it here for now.
3859 	 */
3860 	for (i = 0; i < 10; i++)
3861 		*batch++ = MI_NOOP;
3862 
3863 	/* Pad to end of cacheline */
3864 	while ((unsigned long)batch % CACHELINE_BYTES)
3865 		*batch++ = MI_NOOP;
3866 
3867 	return batch;
3868 }
3869 
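/*
 * The per-engine workaround batch buffers (indirect_ctx and per_ctx) share
 * a single page-sized shmem object, mapped through the global GTT since the
 * context image references them by GGTT offset.
 */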
3870 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3871 
3872 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3873 {
3874 	struct drm_i915_gem_object *obj;
3875 	struct i915_vma *vma;
3876 	int err;
3877 
3878 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3879 	if (IS_ERR(obj))
3880 		return PTR_ERR(obj);
3881 
3882 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3883 	if (IS_ERR(vma)) {
3884 		err = PTR_ERR(vma);
3885 		goto err;
3886 	}
3887 
3888 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3889 	if (err)
3890 		goto err;
3891 
3892 	engine->wa_ctx.vma = vma;
3893 	return 0;
3894 
3895 err:
3896 	i915_gem_object_put(obj);
3897 	return err;
3898 }
3899 
3900 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3901 {
3902 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3903 }
3904 
3905 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3906 
3907 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3908 {
3909 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3910 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3911 					    &wa_ctx->per_ctx };
3912 	wa_bb_func_t wa_bb_fn[2];
3913 	void *batch, *batch_ptr;
3914 	unsigned int i;
3915 	int ret;
3916 
3917 	if (engine->class != RENDER_CLASS)
3918 		return 0;
3919 
3920 	switch (INTEL_GEN(engine->i915)) {
3921 	case 12:
3922 	case 11:
3923 		return 0;
3924 	case 10:
3925 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3926 		wa_bb_fn[1] = NULL;
3927 		break;
3928 	case 9:
3929 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3930 		wa_bb_fn[1] = NULL;
3931 		break;
3932 	case 8:
3933 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3934 		wa_bb_fn[1] = NULL;
3935 		break;
3936 	default:
3937 		MISSING_CASE(INTEL_GEN(engine->i915));
3938 		return 0;
3939 	}
3940 
3941 	ret = lrc_setup_wa_ctx(engine);
3942 	if (ret) {
3943 		drm_dbg(&engine->i915->drm,
3944 			"Failed to setup context WA page: %d\n", ret);
3945 		return ret;
3946 	}
3947 
3948 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		/* Failed to map the WA object; release it and bail out */
		lrc_destroy_wa_ctx(engine);
		return PTR_ERR(batch);
	}
3949 
3950 	/*
3951 	 * Emit the two workaround batch buffers, recording the offset from the
3952 	 * start of the workaround batch buffer object for each and their
3953 	 * respective sizes.
3954 	 */
3955 	batch_ptr = batch;
3956 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3957 		wa_bb[i]->offset = batch_ptr - batch;
3958 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3959 						  CACHELINE_BYTES))) {
3960 			ret = -EINVAL;
3961 			break;
3962 		}
3963 		if (wa_bb_fn[i])
3964 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3965 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3966 	}
3967 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3968 
3969 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
3970 	__i915_gem_object_release_map(wa_ctx->vma->obj);
3971 	if (ret)
3972 		lrc_destroy_wa_ctx(engine);
3973 
3974 	return ret;
3975 }
3976 
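/*
 * Reset both the HW and our cached view of the CSB ring: program the
 * read/write pointers in RING_CONTEXT_STATUS_PTR to the last entry
 * (csb_size - 1) and mirror that in csb_head and *csb_write, so the first
 * event after the reset is seen at entry 0.
 */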
3977 static void reset_csb_pointers(struct intel_engine_cs *engine)
3978 {
3979 	struct intel_engine_execlists * const execlists = &engine->execlists;
3980 	const unsigned int reset_value = execlists->csb_size - 1;
3981 
3982 	ring_set_paused(engine, 0);
3983 
3984 	/*
3985 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3986 	 * Bludgeon them with a mmio update to be sure.
3987 	 */
3988 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3989 		     0xffff << 16 | reset_value << 8 | reset_value);
3990 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3991 
3992 	/*
3993 	 * After a reset, the HW starts writing into CSB entry [0]. We
3994 	 * therefore have to set our HEAD pointer back one entry so that
3995 	 * the *first* entry we check is entry 0. To complicate this further,
3996 	 * as we don't wait for the first interrupt after reset, we have to
3997 	 * fake the HW write to point back to the last entry so that our
3998 	 * inline comparison of our cached head position against the last HW
3999 	 * write works even before the first interrupt.
4000 	 */
4001 	execlists->csb_head = reset_value;
4002 	WRITE_ONCE(*execlists->csb_write, reset_value);
4003 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4004 
4005 	invalidate_csb_entries(&execlists->csb_status[0],
4006 			       &execlists->csb_status[reset_value]);
4007 
4008 	/* Once more for luck and our trusty paranoia */
4009 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4010 		     0xffff << 16 | reset_value << 8 | reset_value);
4011 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4012 
4013 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4014 }
4015 
4016 static void execlists_sanitize(struct intel_engine_cs *engine)
4017 {
4018 	/*
4019 	 * Poison residual state on resume, in case the suspend didn't!
4020 	 *
4021 	 * We have to assume that across suspend/resume (or other loss
4022 	 * of control) that the contents of our pinned buffers has been
4023 	 * lost, replaced by garbage. Since this doesn't always happen,
4024 	 * let's poison such state so that we more quickly spot when
4025 	 * we falsely assume it has been preserved.
4026 	 */
4027 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4028 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4029 
4030 	reset_csb_pointers(engine);
4031 
4032 	/*
4033 	 * The kernel_context HWSP is stored in the status_page. As above,
4034 	 * that may be lost on resume/initialisation, and so we need to
4035 	 * reset the value in the HWSP.
4036 	 */
4037 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4038 
4039 	/* And scrub the dirty cachelines for the HWSP */
4040 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4041 }
4042 
4043 static void enable_error_interrupt(struct intel_engine_cs *engine)
4044 {
4045 	u32 status;
4046 
4047 	engine->execlists.error_interrupt = 0;
4048 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4049 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4050 
4051 	status = ENGINE_READ(engine, RING_ESR);
4052 	if (unlikely(status)) {
4053 		drm_err(&engine->i915->drm,
4054 			"engine '%s' resumed still in error: %08x\n",
4055 			engine->name, status);
4056 		__intel_gt_reset(engine->gt, engine->mask);
4057 	}
4058 
4059 	/*
4060 	 * On current gen8+, we have 2 signals to play with
4061 	 *
4062 	 * - I915_ERROR_INSTRUCTION (bit 0)
4063 	 *
4064 	 *    Generate an error if the command parser encounters an invalid
4065 	 *    instruction
4066 	 *
4067 	 *    This is a fatal error.
4068 	 *
4069 	 * - CP_PRIV (bit 2)
4070 	 *
4071 	 *    Generate an error on privilege violation (where the CP replaces
4072 	 *    the instruction with a no-op). This also fires for writes into
4073 	 *    read-only scratch pages.
4074 	 *
4075 	 *    This is a non-fatal error, parsing continues.
4076 	 *
4077 	 * * there are a few others defined for odd HW that we do not use
4078 	 *
4079 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4080 	 * error (as the HW is validating and suppressing the mistakes), we
4081 	 * only unmask the instruction error bit.
4082 	 */
4083 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4084 }
4085 
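/*
 * Put the engine into execlists submission mode: mask off legacy HWSTAM
 * status writes, enable the run list (or disable legacy mode on Gen11+),
 * clear STOP_RING, point RING_HWS_PGA at the status page and unmask the
 * CS instruction-error interrupt.
 */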
4086 static void enable_execlists(struct intel_engine_cs *engine)
4087 {
4088 	u32 mode;
4089 
4090 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4091 
4092 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4093 
4094 	if (INTEL_GEN(engine->i915) >= 11)
4095 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4096 	else
4097 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4098 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4099 
4100 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4101 
4102 	ENGINE_WRITE_FW(engine,
4103 			RING_HWS_PGA,
4104 			i915_ggtt_offset(engine->status_page.vma));
4105 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4106 
4107 	enable_error_interrupt(engine);
4108 
4109 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4110 }
4111 
4112 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4113 {
4114 	bool unexpected = false;
4115 
4116 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4117 		drm_dbg(&engine->i915->drm,
4118 			"STOP_RING still set in RING_MI_MODE\n");
4119 		unexpected = true;
4120 	}
4121 
4122 	return unexpected;
4123 }
4124 
4125 static int execlists_resume(struct intel_engine_cs *engine)
4126 {
4127 	intel_mocs_init_engine(engine);
4128 
4129 	intel_engine_reset_breadcrumbs(engine);
4130 
4131 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4132 		struct drm_printer p = drm_debug_printer(__func__);
4133 
4134 		intel_engine_dump(engine, &p, NULL);
4135 	}
4136 
4137 	enable_execlists(engine);
4138 
4139 	return 0;
4140 }
4141 
4142 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4143 {
4144 	struct intel_engine_execlists * const execlists = &engine->execlists;
4145 	unsigned long flags;
4146 
4147 	ENGINE_TRACE(engine, "depth<-%d\n",
4148 		     atomic_read(&execlists->tasklet.count));
4149 
4150 	/*
4151 	 * Prevent request submission to the hardware until we have
4152 	 * completed the reset in i915_gem_reset_finish(). If a request
4153 	 * is completed by one engine, it may then queue a request
4154 	 * to a second via its execlists->tasklet *just* as we are
4155 	 * calling engine->resume() and also writing the ELSP.
4156 	 * Turning off the execlists->tasklet until the reset is over
4157 	 * prevents the race.
4158 	 */
4159 	__tasklet_disable_sync_once(&execlists->tasklet);
4160 	GEM_BUG_ON(!reset_in_progress(execlists));
4161 
4162 	/* And flush any current direct submission. */
4163 	spin_lock_irqsave(&engine->active.lock, flags);
4164 	spin_unlock_irqrestore(&engine->active.lock, flags);
4165 
4166 	/*
4167 	 * We stop the engines, otherwise we might get a failed reset and a
4168 	 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
4169 	 * from a system hang if a batchbuffer is progressing when
4170 	 * the reset is issued, regardless of the READY_TO_RESET ack.
4171 	 * Thus assume it is best to stop the engines on all gens
4172 	 * where we have a gpu reset.
4173 	 *
4174 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4175 	 *
4176 	 * FIXME: Wa for more modern gens needs to be validated
4177 	 */
4178 	ring_set_paused(engine, 1);
4179 	intel_engine_stop_cs(engine);
4180 
4181 	engine->execlists.reset_ccid = active_ccid(engine);
4182 }
4183 
4184 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4185 {
4186 	int x;
4187 
4188 	x = lrc_ring_mi_mode(engine);
4189 	if (x != -1) {
4190 		regs[x + 1] &= ~STOP_RING;
4191 		regs[x + 1] |= STOP_RING << 16;
4192 	}
4193 }
4194 
4195 static void __execlists_reset_reg_state(const struct intel_context *ce,
4196 					const struct intel_engine_cs *engine)
4197 {
4198 	u32 *regs = ce->lrc_reg_state;
4199 
4200 	__reset_stop_ring(regs, engine);
4201 }
4202 
4203 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4204 {
4205 	struct intel_engine_execlists * const execlists = &engine->execlists;
4206 	struct intel_context *ce;
4207 	struct i915_request *rq;
4208 	u32 head;
4209 
4210 	mb(); /* paranoia: read the CSB pointers from after the reset */
4211 	clflush(execlists->csb_write);
4212 	mb();
4213 
4214 	process_csb(engine); /* drain preemption events */
4215 
4216 	/* Following the reset, we need to reload the CSB read/write pointers */
4217 	reset_csb_pointers(engine);
4218 
4219 	/*
4220 	 * Save the currently executing context, even if we completed
4221 	 * its request, it was still running at the time of the
4222 	 * reset and will have been clobbered.
4223 	 */
4224 	rq = active_context(engine, engine->execlists.reset_ccid);
4225 	if (!rq)
4226 		goto unwind;
4227 
4228 	ce = rq->context;
4229 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4230 
4231 	if (i915_request_completed(rq)) {
4232 		/* Idle context; tidy up the ring so we can restart afresh */
4233 		head = intel_ring_wrap(ce->ring, rq->tail);
4234 		goto out_replay;
4235 	}
4236 
4237 	/* We still have requests in-flight; the engine should be active */
4238 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4239 
4240 	/* Context has requests still in-flight; it should not be idle! */
4241 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4242 
4243 	rq = active_request(ce->timeline, rq);
4244 	head = intel_ring_wrap(ce->ring, rq->head);
4245 	GEM_BUG_ON(head == ce->ring->tail);
4246 
4247 	/*
4248 	 * If this request hasn't started yet, e.g. it is waiting on a
4249 	 * semaphore, we need to avoid skipping the request or else we
4250 	 * break the signaling chain. However, if the context is corrupt
4251 	 * the request will not restart and we will be stuck with a wedged
4252 	 * device. It is quite often the case that if we issue a reset
4253 	 * while the GPU is loading the context image, that the context
4254 	 * image becomes corrupt.
4255 	 *
4256 	 * Otherwise, if we have not started yet, the request should replay
4257 	 * perfectly and we do not need to flag the result as being erroneous.
4258 	 */
4259 	if (!i915_request_started(rq))
4260 		goto out_replay;
4261 
4262 	/*
4263 	 * If the request was innocent, we leave the request in the ELSP
4264 	 * and will try to replay it on restarting. The context image may
4265 	 * have been corrupted by the reset, in which case we may have
4266 	 * to service a new GPU hang, but more likely we can continue on
4267 	 * without impact.
4268 	 *
4269 	 * If the request was guilty, we presume the context is corrupt
4270 	 * and have to at least restore the RING register in the context
4271 	 * image back to the expected values to skip over the guilty request.
4272 	 */
4273 	__i915_request_reset(rq, stalled);
4274 
4275 	/*
4276 	 * We want a simple context + ring to execute the breadcrumb update.
4277 	 * We cannot rely on the context being intact across the GPU hang,
4278 	 * so clear it and rebuild just what we need for the breadcrumb.
4279 	 * All pending requests for this context will be zapped, and any
4280 	 * future request will be after userspace has had the opportunity
4281 	 * to recreate its own state.
4282 	 */
4283 out_replay:
4284 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4285 		     head, ce->ring->tail);
4286 	__execlists_reset_reg_state(ce, engine);
4287 	__execlists_update_reg_state(ce, engine, head);
4288 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4289 
4290 unwind:
4291 	/* Push back any incomplete requests for replay after the reset. */
4292 	cancel_port_requests(execlists);
4293 	__unwind_incomplete_requests(engine);
4294 }
4295 
4296 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4297 {
4298 	unsigned long flags;
4299 
4300 	ENGINE_TRACE(engine, "\n");
4301 
4302 	spin_lock_irqsave(&engine->active.lock, flags);
4303 
4304 	__execlists_reset(engine, stalled);
4305 
4306 	spin_unlock_irqrestore(&engine->active.lock, flags);
4307 }
4308 
4309 static void nop_submission_tasklet(unsigned long data)
4310 {
4311 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4312 
4313 	/* The driver is wedged; don't process any more events. */
4314 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4315 }
4316 
4317 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4318 {
4319 	struct intel_engine_execlists * const execlists = &engine->execlists;
4320 	struct i915_request *rq, *rn;
4321 	struct rb_node *rb;
4322 	unsigned long flags;
4323 
4324 	ENGINE_TRACE(engine, "\n");
4325 
4326 	/*
4327 	 * Before we call engine->cancel_requests(), we should have exclusive
4328 	 * access to the submission state. This is arranged for us by the
4329 	 * caller disabling the interrupt generation, the tasklet and other
4330 	 * threads that may then access the same state, giving us a free hand
4331 	 * to reset state. However, we still need to let lockdep be aware that
4332 	 * we know this state may be accessed in hardirq context, so we
4333 	 * disable the irq around this manipulation and we want to keep
4334 	 * the spinlock focused on its duties and not accidentally conflate
4335 	 * coverage to the submission's irq state. (Similarly, although we
4336 	 * shouldn't need to disable irq around the manipulation of the
4337 	 * submission's irq state, we also wish to remind ourselves that
4338 	 * it is irq state.)
4339 	 */
4340 	spin_lock_irqsave(&engine->active.lock, flags);
4341 
4342 	__execlists_reset(engine, true);
4343 
4344 	/* Mark all executing requests as skipped. */
4345 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4346 		mark_eio(rq);
4347 
4348 	/* Flush the queued requests to the timeline list (for retiring). */
4349 	while ((rb = rb_first_cached(&execlists->queue))) {
4350 		struct i915_priolist *p = to_priolist(rb);
4351 		int i;
4352 
4353 		priolist_for_each_request_consume(rq, rn, p, i) {
4354 			mark_eio(rq);
4355 			__i915_request_submit(rq);
4356 		}
4357 
4358 		rb_erase_cached(&p->node, &execlists->queue);
4359 		i915_priolist_free(p);
4360 	}
4361 
4362 	/* On-hold requests will be flushed to timeline upon their release */
4363 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4364 		mark_eio(rq);
4365 
4366 	/* Cancel all attached virtual engines */
4367 	while ((rb = rb_first_cached(&execlists->virtual))) {
4368 		struct virtual_engine *ve =
4369 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4370 
4371 		rb_erase_cached(rb, &execlists->virtual);
4372 		RB_CLEAR_NODE(rb);
4373 
4374 		spin_lock(&ve->base.active.lock);
4375 		rq = fetch_and_zero(&ve->request);
4376 		if (rq) {
4377 			mark_eio(rq);
4378 
4379 			rq->engine = engine;
4380 			__i915_request_submit(rq);
4381 			i915_request_put(rq);
4382 
4383 			ve->base.execlists.queue_priority_hint = INT_MIN;
4384 		}
4385 		spin_unlock(&ve->base.active.lock);
4386 	}
4387 
4388 	/* Remaining _unready_ requests will be nop'ed when submitted */
4389 
4390 	execlists->queue_priority_hint = INT_MIN;
4391 	execlists->queue = RB_ROOT_CACHED;
4392 
4393 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4394 	execlists->tasklet.func = nop_submission_tasklet;
4395 
4396 	spin_unlock_irqrestore(&engine->active.lock, flags);
4397 }
4398 
4399 static void execlists_reset_finish(struct intel_engine_cs *engine)
4400 {
4401 	struct intel_engine_execlists * const execlists = &engine->execlists;
4402 
4403 	/*
4404 	 * After a GPU reset, we may have requests to replay. Do so now while
4405 	 * we still have the forcewake to be sure that the GPU is not allowed
4406 	 * to sleep before we restart and reload a context.
4407 	 */
4408 	GEM_BUG_ON(!reset_in_progress(execlists));
4409 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4410 		execlists->tasklet.func(execlists->tasklet.data);
4411 
4412 	if (__tasklet_enable(&execlists->tasklet))
4413 		/* And kick in case we missed a new request submission. */
4414 		tasklet_hi_schedule(&execlists->tasklet);
4415 	ENGINE_TRACE(engine, "depth->%d\n",
4416 		     atomic_read(&execlists->tasklet.count));
4417 }
4418 
4419 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4420 				    u64 offset, u32 len,
4421 				    const unsigned int flags)
4422 {
4423 	u32 *cs;
4424 
4425 	cs = intel_ring_begin(rq, 4);
4426 	if (IS_ERR(cs))
4427 		return PTR_ERR(cs);
4428 
4429 	/*
4430 	 * WaDisableCtxRestoreArbitration:bdw,chv
4431 	 *
4432 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4433 	 * particular all the gen that do not need the w/a at all!), if we
4434 	 * took care to make sure that on every switch into this context
4435 	 * (both ordinary and for preemption) that arbitration was enabled
4436 	 * we would be fine.  However, for gen8 there is another w/a that
4437 	 * requires us to not preempt inside GPGPU execution, so we keep
4438 	 * arbitration disabled for gen8 batches. Arbitration will be
4439 	 * re-enabled before we close the request
4440 	 * (engine->emit_fini_breadcrumb).
4441 	 */
4442 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4443 
4444 	/* FIXME(BDW+): Address space and security selectors. */
4445 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4446 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4447 	*cs++ = lower_32_bits(offset);
4448 	*cs++ = upper_32_bits(offset);
4449 
4450 	intel_ring_advance(rq, cs);
4451 
4452 	return 0;
4453 }
4454 
4455 static int gen8_emit_bb_start(struct i915_request *rq,
4456 			      u64 offset, u32 len,
4457 			      const unsigned int flags)
4458 {
4459 	u32 *cs;
4460 
4461 	cs = intel_ring_begin(rq, 6);
4462 	if (IS_ERR(cs))
4463 		return PTR_ERR(cs);
4464 
4465 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4466 
4467 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4468 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4469 	*cs++ = lower_32_bits(offset);
4470 	*cs++ = upper_32_bits(offset);
4471 
4472 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4473 	*cs++ = MI_NOOP;
4474 
4475 	intel_ring_advance(rq, cs);
4476 
4477 	return 0;
4478 }
4479 
4480 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4481 {
4482 	ENGINE_WRITE(engine, RING_IMR,
4483 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4484 	ENGINE_POSTING_READ(engine, RING_IMR);
4485 }
4486 
4487 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4488 {
4489 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4490 }
4491 
4492 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4493 {
4494 	u32 cmd, *cs;
4495 
4496 	cs = intel_ring_begin(request, 4);
4497 	if (IS_ERR(cs))
4498 		return PTR_ERR(cs);
4499 
4500 	cmd = MI_FLUSH_DW + 1;
4501 
4502 	/* We always require a command barrier so that subsequent
4503 	 * commands, such as breadcrumb interrupts, are strictly ordered
4504 	 * wrt the contents of the write cache being flushed to memory
4505 	 * (and thus being coherent from the CPU).
4506 	 */
4507 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4508 
4509 	if (mode & EMIT_INVALIDATE) {
4510 		cmd |= MI_INVALIDATE_TLB;
4511 		if (request->engine->class == VIDEO_DECODE_CLASS)
4512 			cmd |= MI_INVALIDATE_BSD;
4513 	}
4514 
4515 	*cs++ = cmd;
4516 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4517 	*cs++ = 0; /* upper addr */
4518 	*cs++ = 0; /* value */
4519 	intel_ring_advance(request, cs);
4520 
4521 	return 0;
4522 }
4523 
4524 static int gen8_emit_flush_render(struct i915_request *request,
4525 				  u32 mode)
4526 {
4527 	bool vf_flush_wa = false, dc_flush_wa = false;
4528 	u32 *cs, flags = 0;
4529 	int len;
4530 
4531 	flags |= PIPE_CONTROL_CS_STALL;
4532 
4533 	if (mode & EMIT_FLUSH) {
4534 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4535 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4536 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4537 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4538 	}
4539 
4540 	if (mode & EMIT_INVALIDATE) {
4541 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4542 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4543 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4544 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4545 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4546 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4547 		flags |= PIPE_CONTROL_QW_WRITE;
4548 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4549 
4550 		/*
4551 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4552 		 * pipe control.
4553 		 */
4554 		if (IS_GEN(request->engine->i915, 9))
4555 			vf_flush_wa = true;
4556 
4557 		/* WaForGAMHang:kbl */
4558 		if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0))
4559 			dc_flush_wa = true;
4560 	}
4561 
4562 	len = 6;
4563 
4564 	if (vf_flush_wa)
4565 		len += 6;
4566 
4567 	if (dc_flush_wa)
4568 		len += 12;
4569 
4570 	cs = intel_ring_begin(request, len);
4571 	if (IS_ERR(cs))
4572 		return PTR_ERR(cs);
4573 
4574 	if (vf_flush_wa)
4575 		cs = gen8_emit_pipe_control(cs, 0, 0);
4576 
4577 	if (dc_flush_wa)
4578 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4579 					    0);
4580 
4581 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4582 
4583 	if (dc_flush_wa)
4584 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4585 
4586 	intel_ring_advance(request, cs);
4587 
4588 	return 0;
4589 }
4590 
4591 static int gen11_emit_flush_render(struct i915_request *request,
4592 				   u32 mode)
4593 {
4594 	if (mode & EMIT_FLUSH) {
4595 		u32 *cs;
4596 		u32 flags = 0;
4597 
4598 		flags |= PIPE_CONTROL_CS_STALL;
4599 
4600 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4601 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4602 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4603 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4604 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4605 		flags |= PIPE_CONTROL_QW_WRITE;
4606 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4607 
4608 		cs = intel_ring_begin(request, 6);
4609 		if (IS_ERR(cs))
4610 			return PTR_ERR(cs);
4611 
4612 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4613 		intel_ring_advance(request, cs);
4614 	}
4615 
4616 	if (mode & EMIT_INVALIDATE) {
4617 		u32 *cs;
4618 		u32 flags = 0;
4619 
4620 		flags |= PIPE_CONTROL_CS_STALL;
4621 
4622 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4623 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4624 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4625 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4626 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4627 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4628 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4629 		flags |= PIPE_CONTROL_QW_WRITE;
4630 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4631 
4632 		cs = intel_ring_begin(request, 6);
4633 		if (IS_ERR(cs))
4634 			return PTR_ERR(cs);
4635 
4636 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4637 		intel_ring_advance(request, cs);
4638 	}
4639 
4640 	return 0;
4641 }
4642 
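/*
 * MI_ARB_CHECK on Gen12 doubles as the pre-parser control: bit 8 appears to
 * act as the mask/enable and bit 0 as the pre-fetch disable value, so the
 * returned dword brackets a sequence the pre-parser must not read ahead of,
 * e.g.
 *	*cs++ = preparser_disable(true);
 *	... TLB-invalidating PIPE_CONTROL ...
 *	*cs++ = preparser_disable(false);
 */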
4643 static u32 preparser_disable(bool state)
4644 {
4645 	return MI_ARB_CHECK | 1 << 8 | state;
4646 }
4647 
4648 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4649 {
4650 	static const i915_reg_t vd[] = {
4651 		GEN12_VD0_AUX_NV,
4652 		GEN12_VD1_AUX_NV,
4653 		GEN12_VD2_AUX_NV,
4654 		GEN12_VD3_AUX_NV,
4655 	};
4656 
4657 	static const i915_reg_t ve[] = {
4658 		GEN12_VE0_AUX_NV,
4659 		GEN12_VE1_AUX_NV,
4660 	};
4661 
4662 	if (engine->class == VIDEO_DECODE_CLASS)
4663 		return vd[engine->instance];
4664 
4665 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4666 		return ve[engine->instance];
4667 
4668 	GEM_BUG_ON("unknown aux_inv_reg\n");
4669 
4670 	return INVALID_MMIO_REG;
4671 }
4672 
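/*
 * Write AUX_INV into the engine's AUX_NV register via an LRI, which is
 * understood to invalidate the engine's auxiliary (compression metadata)
 * table. Used by the EMIT_INVALIDATE paths below; see the hsdes reference
 * 1809175790 at the call sites.
 */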
4673 static u32 *
4674 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4675 {
4676 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4677 	*cs++ = i915_mmio_reg_offset(inv_reg);
4678 	*cs++ = AUX_INV;
4679 	*cs++ = MI_NOOP;
4680 
4681 	return cs;
4682 }
4683 
4684 static int gen12_emit_flush_render(struct i915_request *request,
4685 				   u32 mode)
4686 {
4687 	if (mode & EMIT_FLUSH) {
4688 		u32 flags = 0;
4689 		u32 *cs;
4690 
4691 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4692 		flags |= PIPE_CONTROL_FLUSH_L3;
4693 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4694 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4695 		/* Wa_1409600907:tgl */
4696 		flags |= PIPE_CONTROL_DEPTH_STALL;
4697 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4698 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4699 
4700 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4701 		flags |= PIPE_CONTROL_QW_WRITE;
4702 
4703 		flags |= PIPE_CONTROL_CS_STALL;
4704 
4705 		cs = intel_ring_begin(request, 6);
4706 		if (IS_ERR(cs))
4707 			return PTR_ERR(cs);
4708 
4709 		cs = gen12_emit_pipe_control(cs,
4710 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4711 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4712 		intel_ring_advance(request, cs);
4713 	}
4714 
4715 	if (mode & EMIT_INVALIDATE) {
4716 		u32 flags = 0;
4717 		u32 *cs;
4718 
4719 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4720 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4721 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4722 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4723 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4724 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4725 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4726 
4727 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4728 		flags |= PIPE_CONTROL_QW_WRITE;
4729 
4730 		flags |= PIPE_CONTROL_CS_STALL;
4731 
4732 		cs = intel_ring_begin(request, 8 + 4);
4733 		if (IS_ERR(cs))
4734 			return PTR_ERR(cs);
4735 
4736 		/*
4737 		 * Prevent the pre-parser from skipping past the TLB
4738 		 * invalidate and loading a stale page for the batch
4739 		 * buffer / request payload.
4740 		 */
4741 		*cs++ = preparser_disable(true);
4742 
4743 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4744 
4745 		/* hsdes: 1809175790 */
4746 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4747 
4748 		*cs++ = preparser_disable(false);
4749 		intel_ring_advance(request, cs);
4750 	}
4751 
4752 	return 0;
4753 }
4754 
4755 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4756 {
4757 	intel_engine_mask_t aux_inv = 0;
4758 	u32 cmd, *cs;
4759 
4760 	if (mode & EMIT_INVALIDATE)
4761 		aux_inv = request->engine->mask & ~BIT(BCS0);
4762 
4763 	cs = intel_ring_begin(request,
4764 			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4765 	if (IS_ERR(cs))
4766 		return PTR_ERR(cs);
4767 
4768 	cmd = MI_FLUSH_DW + 1;
4769 
4770 	/* We always require a command barrier so that subsequent
4771 	 * commands, such as breadcrumb interrupts, are strictly ordered
4772 	 * wrt the contents of the write cache being flushed to memory
4773 	 * (and thus being coherent from the CPU).
4774 	 */
4775 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4776 
4777 	if (mode & EMIT_INVALIDATE) {
4778 		cmd |= MI_INVALIDATE_TLB;
4779 		if (request->engine->class == VIDEO_DECODE_CLASS)
4780 			cmd |= MI_INVALIDATE_BSD;
4781 	}
4782 
4783 	*cs++ = cmd;
4784 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4785 	*cs++ = 0; /* upper addr */
4786 	*cs++ = 0; /* value */
4787 
4788 	if (aux_inv) { /* hsdes: 1809175790 */
4789 		struct intel_engine_cs *engine;
4790 		unsigned int tmp;
4791 
4792 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4793 		for_each_engine_masked(engine, request->engine->gt,
4794 				       aux_inv, tmp) {
4795 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4796 			*cs++ = AUX_INV;
4797 		}
4798 		*cs++ = MI_NOOP;
4799 	}
4800 	intel_ring_advance(request, cs);
4801 
4802 	return 0;
4803 }
4804 
4805 static void assert_request_valid(struct i915_request *rq)
4806 {
4807 	struct intel_ring *ring __maybe_unused = rq->ring;
4808 
4809 	/* Can we unwind this request without appearing to go forwards? */
4810 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4811 }
4812 
4813 /*
4814  * Reserve space for 2 NOOPs at the end of each request to be
4815  * used as a workaround for not being allowed to do lite
4816  * restore with HEAD==TAIL (WaIdleLiteRestore).
4817  */
4818 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4819 {
4820 	/* Ensure there's always at least one preemption point per-request. */
4821 	*cs++ = MI_ARB_CHECK;
4822 	*cs++ = MI_NOOP;
4823 	request->wa_tail = intel_ring_offset(request, cs);
4824 
4825 	/* Check that entire request is less than half the ring */
4826 	assert_request_valid(request);
4827 
4828 	return cs;
4829 }
4830 
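/*
 * Poll the preempt slot in the HWSP until it reads back as zero. While
 * ring_set_paused() holds that slot non-zero, completed requests spin here
 * instead of running past their breadcrumb, which underpins the
 * preempt-to-busy handling.
 */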
4831 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4832 {
4833 	*cs++ = MI_SEMAPHORE_WAIT |
4834 		MI_SEMAPHORE_GLOBAL_GTT |
4835 		MI_SEMAPHORE_POLL |
4836 		MI_SEMAPHORE_SAD_EQ_SDD;
4837 	*cs++ = 0;
4838 	*cs++ = intel_hws_preempt_address(request->engine);
4839 	*cs++ = 0;
4840 
4841 	return cs;
4842 }
4843 
4844 static __always_inline u32*
4845 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4846 {
4847 	*cs++ = MI_USER_INTERRUPT;
4848 
4849 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4850 	if (intel_engine_has_semaphores(request->engine))
4851 		cs = emit_preempt_busywait(request, cs);
4852 
4853 	request->tail = intel_ring_offset(request, cs);
4854 	assert_ring_tail_valid(request->ring, request->tail);
4855 
4856 	return gen8_emit_wa_tail(request, cs);
4857 }
4858 
4859 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4860 {
4861 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4862 
4863 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4864 }
4865 
4866 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4867 {
4868 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4869 }
4870 
4871 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4872 {
4873 	cs = gen8_emit_pipe_control(cs,
4874 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4875 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4876 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4877 				    0);
4878 
4879 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4880 	cs = gen8_emit_ggtt_write_rcs(cs,
4881 				      request->fence.seqno,
4882 				      i915_request_active_timeline(request)->hwsp_offset,
4883 				      PIPE_CONTROL_FLUSH_ENABLE |
4884 				      PIPE_CONTROL_CS_STALL);
4885 
4886 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4887 }
4888 
4889 static u32 *
4890 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4891 {
4892 	cs = gen8_emit_ggtt_write_rcs(cs,
4893 				      request->fence.seqno,
4894 				      i915_request_active_timeline(request)->hwsp_offset,
4895 				      PIPE_CONTROL_CS_STALL |
4896 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4897 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4898 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4899 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4900 				      PIPE_CONTROL_FLUSH_ENABLE);
4901 
4902 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4903 }
4904 
4905 /*
4906  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4907  * flush and will continue pre-fetching the instructions after it before the
4908  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4909  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4910  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4911  * we won't access the batch itself too early.
4912  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4913  * so, if the current request is modifying an instruction in the next request on
4914  * the same intel_context, we might pre-fetch and then execute the pre-update
4915  * instruction. To avoid this, the users of self-modifying code should either
4916  * disable the parser around the code emitting the memory writes, via a new flag
4917  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4918  * the in-kernel use-cases we've opted to use a separate context, see
4919  * reloc_gpu() as an example.
4920  * All the above applies only to the instructions themselves. Non-inline data
4921  * used by the instructions is not pre-fetched.
4922  */
4923 
4924 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4925 {
4926 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4927 		MI_SEMAPHORE_GLOBAL_GTT |
4928 		MI_SEMAPHORE_POLL |
4929 		MI_SEMAPHORE_SAD_EQ_SDD;
4930 	*cs++ = 0;
4931 	*cs++ = intel_hws_preempt_address(request->engine);
4932 	*cs++ = 0;
4933 	*cs++ = 0;
4934 	*cs++ = MI_NOOP;
4935 
4936 	return cs;
4937 }
4938 
4939 static __always_inline u32*
4940 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4941 {
4942 	*cs++ = MI_USER_INTERRUPT;
4943 
4944 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4945 	if (intel_engine_has_semaphores(request->engine))
4946 		cs = gen12_emit_preempt_busywait(request, cs);
4947 
4948 	request->tail = intel_ring_offset(request, cs);
4949 	assert_ring_tail_valid(request->ring, request->tail);
4950 
4951 	return gen8_emit_wa_tail(request, cs);
4952 }
4953 
4954 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4955 {
4956 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4957 }
4958 
4959 static u32 *
4960 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4961 {
4962 	cs = gen12_emit_ggtt_write_rcs(cs,
4963 				       request->fence.seqno,
4964 				       i915_request_active_timeline(request)->hwsp_offset,
4965 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4966 				       PIPE_CONTROL_CS_STALL |
4967 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4968 				       PIPE_CONTROL_FLUSH_L3 |
4969 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4970 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4971 				       /* Wa_1409600907:tgl */
4972 				       PIPE_CONTROL_DEPTH_STALL |
4973 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4974 				       PIPE_CONTROL_FLUSH_ENABLE);
4975 
4976 	return gen12_emit_fini_breadcrumb_tail(request, cs);
4977 }
4978 
4979 static void execlists_park(struct intel_engine_cs *engine)
4980 {
4981 	cancel_timer(&engine->execlists.timer);
4982 	cancel_timer(&engine->execlists.preempt);
4983 }
4984 
4985 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4986 {
4987 	engine->submit_request = execlists_submit_request;
4988 	engine->schedule = i915_schedule;
4989 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4990 
4991 	engine->reset.prepare = execlists_reset_prepare;
4992 	engine->reset.rewind = execlists_reset_rewind;
4993 	engine->reset.cancel = execlists_reset_cancel;
4994 	engine->reset.finish = execlists_reset_finish;
4995 
4996 	engine->park = execlists_park;
4997 	engine->unpark = NULL;
4998 
4999 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5000 	if (!intel_vgpu_active(engine->i915)) {
5001 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5002 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5003 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5004 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5005 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5006 		}
5007 	}
5008 
5009 	if (INTEL_GEN(engine->i915) >= 12)
5010 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5011 
5012 	if (intel_engine_has_preemption(engine))
5013 		engine->emit_bb_start = gen8_emit_bb_start;
5014 	else
5015 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5016 }
5017 
5018 static void execlists_shutdown(struct intel_engine_cs *engine)
5019 {
5020 	/* Synchronise with residual timers and any softirq they raise */
5021 	del_timer_sync(&engine->execlists.timer);
5022 	del_timer_sync(&engine->execlists.preempt);
5023 	tasklet_kill(&engine->execlists.tasklet);
5024 }
5025 
5026 static void execlists_release(struct intel_engine_cs *engine)
5027 {
5028 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5029 
5030 	execlists_shutdown(engine);
5031 
5032 	intel_engine_cleanup_common(engine);
5033 	lrc_destroy_wa_ctx(engine);
5034 }
5035 
5036 static void
5037 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5038 {
5039 	/* Default vfuncs which can be overridden by each engine. */
5040 
5041 	engine->resume = execlists_resume;
5042 
5043 	engine->cops = &execlists_context_ops;
5044 	engine->request_alloc = execlists_request_alloc;
5045 
5046 	engine->emit_flush = gen8_emit_flush;
5047 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5048 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5049 	if (INTEL_GEN(engine->i915) >= 12) {
5050 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5051 		engine->emit_flush = gen12_emit_flush;
5052 	}
5053 	engine->set_default_submission = intel_execlists_set_default_submission;
5054 
5055 	if (INTEL_GEN(engine->i915) < 11) {
5056 		engine->irq_enable = gen8_logical_ring_enable_irq;
5057 		engine->irq_disable = gen8_logical_ring_disable_irq;
5058 	} else {
5059 		/*
5060 		 * TODO: On Gen11 the interrupt masks need to be clear
5061 		 * to allow C6 entry. Keep interrupts enabled at all times
5062 		 * and take the hit of generating extra interrupts
5063 		 * until a more refined solution exists.
5064 		 */
5065 	}
5066 }
5067 
5068 static inline void
5069 logical_ring_default_irqs(struct intel_engine_cs *engine)
5070 {
5071 	unsigned int shift = 0;
5072 
5073 	if (INTEL_GEN(engine->i915) < 11) {
5074 		const u8 irq_shifts[] = {
5075 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5076 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5077 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5078 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5079 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5080 		};
5081 
5082 		shift = irq_shifts[engine->id];
5083 	}
5084 
5085 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5086 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5087 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5088 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5089 }
5090 
5091 static void rcs_submission_override(struct intel_engine_cs *engine)
5092 {
5093 	switch (INTEL_GEN(engine->i915)) {
5094 	case 12:
5095 		engine->emit_flush = gen12_emit_flush_render;
5096 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5097 		break;
5098 	case 11:
5099 		engine->emit_flush = gen11_emit_flush_render;
5100 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5101 		break;
5102 	default:
5103 		engine->emit_flush = gen8_emit_flush_render;
5104 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5105 		break;
5106 	}
5107 }
5108 
5109 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5110 {
5111 	struct intel_engine_execlists * const execlists = &engine->execlists;
5112 	struct drm_i915_private *i915 = engine->i915;
5113 	struct intel_uncore *uncore = engine->uncore;
5114 	u32 base = engine->mmio_base;
5115 
5116 	tasklet_init(&engine->execlists.tasklet,
5117 		     execlists_submission_tasklet, (unsigned long)engine);
5118 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5119 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5120 
5121 	logical_ring_default_vfuncs(engine);
5122 	logical_ring_default_irqs(engine);
5123 
5124 	if (engine->class == RENDER_CLASS)
5125 		rcs_submission_override(engine);
5126 
5127 	if (intel_init_workaround_bb(engine))
5128 		/*
5129 		 * We continue even if we fail to initialize the WA batch
5130 		 * because we only expect rare glitches, and nothing
5131 		 * critical that would prevent us from using the GPU.
5132 		 */
5133 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5134 
5135 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5136 		execlists->submit_reg = uncore->regs +
5137 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5138 		execlists->ctrl_reg = uncore->regs +
5139 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5140 	} else {
5141 		execlists->submit_reg = uncore->regs +
5142 			i915_mmio_reg_offset(RING_ELSP(base));
5143 	}
5144 
5145 	execlists->csb_status =
5146 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5147 
5148 	execlists->csb_write =
5149 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5150 
5151 	if (INTEL_GEN(i915) < 11)
5152 		execlists->csb_size = GEN8_CSB_ENTRIES;
5153 	else
5154 		execlists->csb_size = GEN11_CSB_ENTRIES;
5155 
5156 	if (INTEL_GEN(engine->i915) >= 11) {
5157 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5158 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5159 	}
5160 
5161 	/* Finally, take ownership and responsibility for cleanup! */
5162 	engine->sanitize = execlists_sanitize;
5163 	engine->release = execlists_release;
5164 
5165 	return 0;
5166 }
5167 
5168 static void init_common_reg_state(u32 * const regs,
5169 				  const struct intel_engine_cs *engine,
5170 				  const struct intel_ring *ring,
5171 				  bool inhibit)
5172 {
5173 	u32 ctl;
5174 
5175 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5176 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5177 	if (inhibit)
5178 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5179 	if (INTEL_GEN(engine->i915) < 11)
5180 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5181 					   CTX_CTRL_RS_CTX_ENABLE);
5182 	regs[CTX_CONTEXT_CONTROL] = ctl;
5183 
5184 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5185 	regs[CTX_TIMESTAMP] = 0;
5186 }
5187 
5188 static void init_wa_bb_reg_state(u32 * const regs,
5189 				 const struct intel_engine_cs *engine)
5190 {
5191 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5192 
5193 	if (wa_ctx->per_ctx.size) {
5194 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5195 
5196 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5197 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5198 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5199 	}
5200 
5201 	if (wa_ctx->indirect_ctx.size) {
5202 		lrc_ring_setup_indirect_ctx(regs, engine,
5203 					    i915_ggtt_offset(wa_ctx->vma) +
5204 					    wa_ctx->indirect_ctx.offset,
5205 					    wa_ctx->indirect_ctx.size);
5206 	}
5207 }
5208 
5209 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5210 {
5211 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5212 		/* 64b PPGTT (48bit canonical):
5213 		 * PDP0_DESCRIPTOR contains the base address of the PML4;
5214 		 * the other PDP descriptors are ignored.
5215 		 */
5216 		ASSIGN_CTX_PML4(ppgtt, regs);
5217 	} else {
5218 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5219 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5220 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5221 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5222 	}
5223 }
5224 
5225 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5226 {
5227 	if (i915_is_ggtt(vm))
5228 		return i915_vm_to_ggtt(vm)->alias;
5229 	else
5230 		return i915_vm_to_ppgtt(vm);
5231 }
5232 
5233 static void execlists_init_reg_state(u32 *regs,
5234 				     const struct intel_context *ce,
5235 				     const struct intel_engine_cs *engine,
5236 				     const struct intel_ring *ring,
5237 				     bool inhibit)
5238 {
5239 	/*
5240 	 * A context is actually a big batch buffer with several
5241 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5242 	 * values we are setting here are only for the first context restore:
5243 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5244 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5245 	 * we are not initializing here).
5246 	 *
5247 	 * Must keep consistent with virtual_update_register_offsets().
5248 	 */
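	/*
	 * A rough sketch of that layout (the authoritative table is
	 * reg_offsets(engine), applied by set_offsets() below):
	 *
	 *   MI_LOAD_REGISTER_IMM(N)
	 *   <reg offset> <value>
	 *   <reg offset> <value>
	 *   ...
	 *   MI_LOAD_REGISTER_IMM(M)
	 *   ...
	 */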
5249 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5250 
5251 	init_common_reg_state(regs, engine, ring, inhibit);
5252 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5253 
5254 	init_wa_bb_reg_state(regs, engine);
5255 
5256 	__reset_stop_ring(regs, engine);
5257 }
5258 
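/*
 * Fill in the context image backing @ctx_obj: map it, copy in the engine's
 * default state (if one has been recorded), clear the per-process HWSP page
 * and then write the register state used for the first restore.
 */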
5259 static int
5260 populate_lr_context(struct intel_context *ce,
5261 		    struct drm_i915_gem_object *ctx_obj,
5262 		    struct intel_engine_cs *engine,
5263 		    struct intel_ring *ring)
5264 {
5265 	bool inhibit = true;
5266 	void *vaddr;
5267 
5268 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5269 	if (IS_ERR(vaddr)) {
5270 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5271 		return PTR_ERR(vaddr);
5272 	}
5273 
5274 	set_redzone(vaddr, engine);
5275 
5276 	if (engine->default_state) {
5277 		shmem_read(engine->default_state, 0,
5278 			   vaddr, engine->context_size);
5279 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5280 		inhibit = false;
5281 	}
5282 
5283 	/* Clear the ppHWSP (inc. per-context counters) */
5284 	memset(vaddr, 0, PAGE_SIZE);
5285 
5286 	/*
5287 	 * The second page of the context object contains some registers which
5288 	 * must be set up prior to the first execution.
5289 	 */
5290 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5291 				 ce, engine, ring, inhibit);
5292 
5293 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5294 	i915_gem_object_unpin_map(ctx_obj);
5295 	return 0;
5296 }
5297 
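/*
 * Allocate everything the context needs before it can be pinned: the shmem
 * object holding the context image (plus an optional debug redzone and, on
 * gen12, an extra page for the per-context workaround batch), a GGTT vma for
 * it, a timeline (using the engine's status page for barrier/kernel
 * contexts) and the ring, then populate the initial image.
 */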
5298 static int __execlists_context_alloc(struct intel_context *ce,
5299 				     struct intel_engine_cs *engine)
5300 {
5301 	struct drm_i915_gem_object *ctx_obj;
5302 	struct intel_ring *ring;
5303 	struct i915_vma *vma;
5304 	u32 context_size;
5305 	int ret;
5306 
5307 	GEM_BUG_ON(ce->state);
5308 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5309 
5310 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5311 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5312 
5313 	if (INTEL_GEN(engine->i915) == 12) {
5314 		ce->wa_bb_page = context_size / PAGE_SIZE;
5315 		context_size += PAGE_SIZE;
5316 	}
5317 
5318 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5319 	if (IS_ERR(ctx_obj))
5320 		return PTR_ERR(ctx_obj);
5321 
5322 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5323 	if (IS_ERR(vma)) {
5324 		ret = PTR_ERR(vma);
5325 		goto error_deref_obj;
5326 	}
5327 
5328 	if (!ce->timeline) {
5329 		struct intel_timeline *tl;
5330 		struct i915_vma *hwsp;
5331 
5332 		/*
5333 		 * Use the static global HWSP for the kernel context, and
5334 		 * a dynamically allocated cacheline for everyone else.
5335 		 */
5336 		hwsp = NULL;
5337 		if (unlikely(intel_context_is_barrier(ce)))
5338 			hwsp = engine->status_page.vma;
5339 
5340 		tl = intel_timeline_create(engine->gt, hwsp);
5341 		if (IS_ERR(tl)) {
5342 			ret = PTR_ERR(tl);
5343 			goto error_deref_obj;
5344 		}
5345 
5346 		ce->timeline = tl;
5347 	}
5348 
5349 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5350 	if (IS_ERR(ring)) {
5351 		ret = PTR_ERR(ring);
5352 		goto error_deref_obj;
5353 	}
5354 
5355 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5356 	if (ret) {
5357 		drm_dbg(&engine->i915->drm,
5358 			"Failed to populate LRC: %d\n", ret);
5359 		goto error_ring_free;
5360 	}
5361 
5362 	ce->ring = ring;
5363 	ce->state = vma;
5364 
5365 	return 0;
5366 
5367 error_ring_free:
5368 	intel_ring_put(ring);
5369 error_deref_obj:
5370 	i915_gem_object_put(ctx_obj);
5371 	return ret;
5372 }
5373 
5374 static struct list_head *virtual_queue(struct virtual_engine *ve)
5375 {
5376 	return &ve->base.execlists.default_priolist.requests[0];
5377 }
5378 
5379 static void virtual_context_destroy(struct kref *kref)
5380 {
5381 	struct virtual_engine *ve =
5382 		container_of(kref, typeof(*ve), context.ref);
5383 	unsigned int n;
5384 
5385 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5386 	GEM_BUG_ON(ve->request);
5387 	GEM_BUG_ON(ve->context.inflight);
5388 
5389 	for (n = 0; n < ve->num_siblings; n++) {
5390 		struct intel_engine_cs *sibling = ve->siblings[n];
5391 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5392 		unsigned long flags;
5393 
5394 		if (RB_EMPTY_NODE(node))
5395 			continue;
5396 
5397 		spin_lock_irqsave(&sibling->active.lock, flags);
5398 
5399 		/* Detachment is lazily performed in the execlists tasklet */
5400 		if (!RB_EMPTY_NODE(node))
5401 			rb_erase_cached(node, &sibling->execlists.virtual);
5402 
5403 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5404 	}
5405 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5406 
5407 	if (ve->context.state)
5408 		__execlists_context_fini(&ve->context);
5409 	intel_context_fini(&ve->context);
5410 
5411 	intel_engine_free_request_pool(&ve->base);
5412 
5413 	kfree(ve->bonds);
5414 	kfree(ve);
5415 }
5416 
5417 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5418 {
5419 	int swp;
5420 
5421 	/*
5422 	 * Pick a random sibling when starting, to help spread the load around.
5423 	 *
5424 	 * New contexts are typically created with exactly the same order
5425 	 * of siblings, and often started in batches. Due to the way we iterate
5426 	 * the array of siblings when submitting requests, sibling[0] is
5427 	 * prioritised for dequeuing. By making sure that sibling[0] is fairly
5428 	 * randomised across the system, we also help spread the load, as the
5429 	 * first engine we inspect is different each time.
5430 	 *
5431 	 * NB: This does not force us to execute on this engine; it will just
5432 	 * typically be the first one we inspect for submission.
5433 	 */
5434 	swp = prandom_u32_max(ve->num_siblings);
5435 	if (swp)
5436 		swap(ve->siblings[swp], ve->siblings[0]);
5437 }
5438 
5439 static int virtual_context_alloc(struct intel_context *ce)
5440 {
5441 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5442 
5443 	return __execlists_context_alloc(ce, ve->siblings[0]);
5444 }
5445 
5446 static int virtual_context_pin(struct intel_context *ce)
5447 {
5448 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5449 
5450 	/* Note: we must use a real engine class for setting up reg state */
5451 	return __execlists_context_pin(ce, ve->siblings[0]);
5452 }
5453 
5454 static void virtual_context_enter(struct intel_context *ce)
5455 {
5456 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5457 	unsigned int n;
5458 
5459 	for (n = 0; n < ve->num_siblings; n++)
5460 		intel_engine_pm_get(ve->siblings[n]);
5461 
5462 	intel_timeline_enter(ce->timeline);
5463 }
5464 
5465 static void virtual_context_exit(struct intel_context *ce)
5466 {
5467 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5468 	unsigned int n;
5469 
5470 	intel_timeline_exit(ce->timeline);
5471 
5472 	for (n = 0; n < ve->num_siblings; n++)
5473 		intel_engine_pm_put(ve->siblings[n]);
5474 }
5475 
5476 static const struct intel_context_ops virtual_context_ops = {
5477 	.alloc = virtual_context_alloc,
5478 
5479 	.pin = virtual_context_pin,
5480 	.unpin = execlists_context_unpin,
5481 
5482 	.enter = virtual_context_enter,
5483 	.exit = virtual_context_exit,
5484 
5485 	.destroy = virtual_context_destroy,
5486 };
5487 
5488 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5489 {
5490 	struct i915_request *rq;
5491 	intel_engine_mask_t mask;
5492 
5493 	rq = READ_ONCE(ve->request);
5494 	if (!rq)
5495 		return 0;
5496 
5497 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5498 	mask = rq->execution_mask;
5499 	if (unlikely(!mask)) {
5500 		/* Invalid selection, submit to a random engine in error */
5501 		/* Invalid selection: flag the error and fall back to an arbitrary engine */
5502 		mask = ve->siblings[0]->mask;
5503 	}
5504 
5505 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5506 		     rq->fence.context, rq->fence.seqno,
5507 		     mask, ve->base.execlists.queue_priority_hint);
5508 
5509 	return mask;
5510 }
5511 
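/*
 * Offer the single queued virtual request to every sibling that may run it:
 * insert (or reuse) this virtual engine's node in each sibling's
 * execlists.virtual rbtree, ordered by priority, and kick the sibling's
 * submission tasklet if we have become its highest priority work.
 */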
5512 static void virtual_submission_tasklet(unsigned long data)
5513 {
5514 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5515 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5516 	intel_engine_mask_t mask;
5517 	unsigned int n;
5518 
5519 	rcu_read_lock();
5520 	mask = virtual_submission_mask(ve);
5521 	rcu_read_unlock();
5522 	if (unlikely(!mask))
5523 		return;
5524 
5525 	local_irq_disable();
5526 	for (n = 0; n < ve->num_siblings; n++) {
5527 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5528 		struct ve_node * const node = &ve->nodes[sibling->id];
5529 		struct rb_node **parent, *rb;
5530 		bool first;
5531 
5532 		if (!READ_ONCE(ve->request))
5533 			break; /* already handled by a sibling's tasklet */
5534 
5535 		if (unlikely(!(mask & sibling->mask))) {
5536 			if (!RB_EMPTY_NODE(&node->rb)) {
5537 				spin_lock(&sibling->active.lock);
5538 				rb_erase_cached(&node->rb,
5539 						&sibling->execlists.virtual);
5540 				RB_CLEAR_NODE(&node->rb);
5541 				spin_unlock(&sibling->active.lock);
5542 			}
5543 			continue;
5544 		}
5545 
5546 		spin_lock(&sibling->active.lock);
5547 
5548 		if (!RB_EMPTY_NODE(&node->rb)) {
5549 			/*
5550 			 * Cheat and avoid rebalancing the tree if we can
5551 			 * reuse this node in situ.
5552 			 */
5553 			first = rb_first_cached(&sibling->execlists.virtual) ==
5554 				&node->rb;
5555 			if (prio == node->prio || (prio > node->prio && first))
5556 				goto submit_engine;
5557 
5558 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5559 		}
5560 
5561 		rb = NULL;
5562 		first = true;
5563 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5564 		while (*parent) {
5565 			struct ve_node *other;
5566 
5567 			rb = *parent;
5568 			other = rb_entry(rb, typeof(*other), rb);
5569 			if (prio > other->prio) {
5570 				parent = &rb->rb_left;
5571 			} else {
5572 				parent = &rb->rb_right;
5573 				first = false;
5574 			}
5575 		}
5576 
5577 		rb_link_node(&node->rb, rb, parent);
5578 		rb_insert_color_cached(&node->rb,
5579 				       &sibling->execlists.virtual,
5580 				       first);
5581 
5582 submit_engine:
5583 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5584 		node->prio = prio;
5585 		if (first && prio > sibling->execlists.queue_priority_hint)
5586 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5587 
5588 		spin_unlock(&sibling->active.lock);
5589 	}
5590 	local_irq_enable();
5591 }
5592 
5593 static void virtual_submit_request(struct i915_request *rq)
5594 {
5595 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5596 	struct i915_request *old;
5597 	unsigned long flags;
5598 
5599 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5600 		     rq->fence.context,
5601 		     rq->fence.seqno);
5602 
5603 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5604 
5605 	spin_lock_irqsave(&ve->base.active.lock, flags);
5606 
5607 	old = ve->request;
5608 	if (old) { /* background completion event from preempt-to-busy */
5609 		GEM_BUG_ON(!i915_request_completed(old));
5610 		__i915_request_submit(old);
5611 		i915_request_put(old);
5612 	}
5613 
5614 	if (i915_request_completed(rq)) {
5615 		__i915_request_submit(rq);
5616 
5617 		ve->base.execlists.queue_priority_hint = INT_MIN;
5618 		ve->request = NULL;
5619 	} else {
5620 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5621 		ve->request = i915_request_get(rq);
5622 
5623 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5624 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5625 
5626 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5627 	}
5628 
5629 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5630 }
5631 
5632 static struct ve_bond *
5633 virtual_find_bond(struct virtual_engine *ve,
5634 		  const struct intel_engine_cs *master)
5635 {
5636 	int i;
5637 
5638 	for (i = 0; i < ve->num_bonds; i++) {
5639 		if (ve->bonds[i].master == master)
5640 			return &ve->bonds[i];
5641 	}
5642 
5643 	return NULL;
5644 }
5645 
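/*
 * ve->base.bond_execute: @signal is the master request's fence. Narrow the
 * bonded request's execution_mask to the siblings bonded to the master's
 * engine, and prevent the master itself from being re-run on those engines.
 */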
5646 static void
5647 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5648 {
5649 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5650 	intel_engine_mask_t allowed, exec;
5651 	struct ve_bond *bond;
5652 
5653 	allowed = ~to_request(signal)->engine->mask;
5654 
5655 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5656 	if (bond)
5657 		allowed &= bond->sibling_mask;
5658 
5659 	/* Restrict the bonded request to run on only the available engines */
5660 	exec = READ_ONCE(rq->execution_mask);
5661 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5662 		;
5663 
5664 	/* Prevent the master from being re-run on the bonded engines */
5665 	to_request(signal)->execution_mask &= ~allowed;
5666 }
5667 
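/*
 * Create a virtual engine that load-balances across @count physical
 * @siblings. A single sibling degenerates to an ordinary context on that
 * engine; otherwise the siblings must all use execlists submission and share
 * the same engine class.
 */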
5668 struct intel_context *
5669 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5670 			       unsigned int count)
5671 {
5672 	struct virtual_engine *ve;
5673 	unsigned int n;
5674 	int err;
5675 
5676 	if (count == 0)
5677 		return ERR_PTR(-EINVAL);
5678 
5679 	if (count == 1)
5680 		return intel_context_create(siblings[0]);
5681 
5682 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5683 	if (!ve)
5684 		return ERR_PTR(-ENOMEM);
5685 
5686 	ve->base.i915 = siblings[0]->i915;
5687 	ve->base.gt = siblings[0]->gt;
5688 	ve->base.uncore = siblings[0]->uncore;
5689 	ve->base.id = -1;
5690 
5691 	ve->base.class = OTHER_CLASS;
5692 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5693 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5694 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5695 
5696 	/*
5697 	 * The decision on whether to submit a request using semaphores
5698 	 * depends on the saturated state of the engine. We only compute
5699 	 * this during HW submission of the request, and we need this
5700 	 * state to be applied globally to all requests being submitted
5701 	 * to this engine. Virtual engines encompass more than one physical
5702 	 * engine and so we cannot accurately tell in advance if one of those
5703 	 * engines is already saturated and so cannot afford to use a semaphore
5704 	 * and be pessimized in priority for doing so -- if we are the only
5705 	 * context using semaphores after all other clients have stopped, we
5706 	 * will be starved on the saturated system. Such a global switch for
5707 	 * semaphores is less than ideal, but alas is the current compromise.
5708 	 */
5709 	ve->base.saturated = ALL_ENGINES;
5710 
5711 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5712 
5713 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5714 	intel_engine_init_breadcrumbs(&ve->base);
5715 	intel_engine_init_execlists(&ve->base);
5716 	ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */
5717 
5718 	ve->base.cops = &virtual_context_ops;
5719 	ve->base.request_alloc = execlists_request_alloc;
5720 
5721 	ve->base.schedule = i915_schedule;
5722 	ve->base.submit_request = virtual_submit_request;
5723 	ve->base.bond_execute = virtual_bond_execute;
5724 
5725 	INIT_LIST_HEAD(virtual_queue(ve));
5726 	ve->base.execlists.queue_priority_hint = INT_MIN;
5727 	tasklet_init(&ve->base.execlists.tasklet,
5728 		     virtual_submission_tasklet,
5729 		     (unsigned long)ve);
5730 
5731 	intel_context_init(&ve->context, &ve->base);
5732 
5733 	for (n = 0; n < count; n++) {
5734 		struct intel_engine_cs *sibling = siblings[n];
5735 
5736 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5737 		if (sibling->mask & ve->base.mask) {
5738 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5739 				  sibling->name);
5740 			err = -EINVAL;
5741 			goto err_put;
5742 		}
5743 
5744 		/*
5745 		 * The virtual engine implementation is tightly coupled to
5746 		 * the execlists backend -- we push requests directly
5747 		 * into a tree inside each physical engine. We could support
5748 		 * layering if we handle cloning of the requests and
5749 		 * submitting a copy into each backend.
5750 		 */
5751 		if (sibling->execlists.tasklet.func !=
5752 		    execlists_submission_tasklet) {
5753 			err = -ENODEV;
5754 			goto err_put;
5755 		}
5756 
5757 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5758 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5759 
5760 		ve->siblings[ve->num_siblings++] = sibling;
5761 		ve->base.mask |= sibling->mask;
5762 
5763 		/*
5764 		 * All physical engines must be compatible for their emission
5765 		 * functions (as we build the instructions during request
5766 		 * construction and do not alter them before submission
5767 		 * on the physical engine). We use the engine class as a guide
5768 		 * here, although that could be refined.
5769 		 */
5770 		if (ve->base.class != OTHER_CLASS) {
5771 			if (ve->base.class != sibling->class) {
5772 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5773 					  sibling->class, ve->base.class);
5774 				err = -EINVAL;
5775 				goto err_put;
5776 			}
5777 			continue;
5778 		}
5779 
5780 		ve->base.class = sibling->class;
5781 		ve->base.uabi_class = sibling->uabi_class;
5782 		snprintf(ve->base.name, sizeof(ve->base.name),
5783 			 "v%dx%d", ve->base.class, count);
5784 		ve->base.context_size = sibling->context_size;
5785 
5786 		ve->base.emit_bb_start = sibling->emit_bb_start;
5787 		ve->base.emit_flush = sibling->emit_flush;
5788 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5789 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5790 		ve->base.emit_fini_breadcrumb_dw =
5791 			sibling->emit_fini_breadcrumb_dw;
5792 
5793 		ve->base.flags = sibling->flags;
5794 	}
5795 
5796 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5797 
5798 	virtual_engine_initial_hint(ve);
5799 	return &ve->context;
5800 
5801 err_put:
5802 	intel_context_put(&ve->context);
5803 	return ERR_PTR(err);
5804 }
5805 
5806 struct intel_context *
5807 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5808 {
5809 	struct virtual_engine *se = to_virtual_engine(src);
5810 	struct intel_context *dst;
5811 
5812 	dst = intel_execlists_create_virtual(se->siblings,
5813 					     se->num_siblings);
5814 	if (IS_ERR(dst))
5815 		return dst;
5816 
5817 	if (se->num_bonds) {
5818 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5819 
5820 		de->bonds = kmemdup(se->bonds,
5821 				    sizeof(*se->bonds) * se->num_bonds,
5822 				    GFP_KERNEL);
5823 		if (!de->bonds) {
5824 			intel_context_put(dst);
5825 			return ERR_PTR(-ENOMEM);
5826 		}
5827 
5828 		de->num_bonds = se->num_bonds;
5829 	}
5830 
5831 	return dst;
5832 }
5833 
5834 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5835 				     const struct intel_engine_cs *master,
5836 				     const struct intel_engine_cs *sibling)
5837 {
5838 	struct virtual_engine *ve = to_virtual_engine(engine);
5839 	struct ve_bond *bond;
5840 	int n;
5841 
5842 	/* Sanity check the sibling is part of the virtual engine */
5843 	for (n = 0; n < ve->num_siblings; n++)
5844 		if (sibling == ve->siblings[n])
5845 			break;
5846 	if (n == ve->num_siblings)
5847 		return -EINVAL;
5848 
5849 	bond = virtual_find_bond(ve, master);
5850 	if (bond) {
5851 		bond->sibling_mask |= sibling->mask;
5852 		return 0;
5853 	}
5854 
5855 	bond = krealloc(ve->bonds,
5856 			sizeof(*bond) * (ve->num_bonds + 1),
5857 			GFP_KERNEL);
5858 	if (!bond)
5859 		return -ENOMEM;
5860 
5861 	bond[ve->num_bonds].master = master;
5862 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5863 
5864 	ve->bonds = bond;
5865 	ve->num_bonds++;
5866 
5867 	return 0;
5868 }
5869 
5870 struct intel_engine_cs *
5871 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5872 				 unsigned int sibling)
5873 {
5874 	struct virtual_engine *ve = to_virtual_engine(engine);
5875 
5876 	if (sibling >= ve->num_siblings)
5877 		return NULL;
5878 
5879 	return ve->siblings[sibling];
5880 }
5881 
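/*
 * Debug dump of the requests known to this engine: up to @max entries each
 * from the executing list (E), the priority queue (Q) and the virtual
 * engines (V) feeding this engine.
 */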
5882 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5883 				   struct drm_printer *m,
5884 				   void (*show_request)(struct drm_printer *m,
5885 							struct i915_request *rq,
5886 							const char *prefix),
5887 				   unsigned int max)
5888 {
5889 	const struct intel_engine_execlists *execlists = &engine->execlists;
5890 	struct i915_request *rq, *last;
5891 	unsigned long flags;
5892 	unsigned int count;
5893 	struct rb_node *rb;
5894 
5895 	spin_lock_irqsave(&engine->active.lock, flags);
5896 
5897 	last = NULL;
5898 	count = 0;
5899 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5900 		if (count++ < max - 1)
5901 			show_request(m, rq, "\t\tE ");
5902 		else
5903 			last = rq;
5904 	}
5905 	if (last) {
5906 		if (count > max) {
5907 			drm_printf(m,
5908 				   "\t\t...skipping %d executing requests...\n",
5909 				   count - max);
5910 		}
5911 		show_request(m, last, "\t\tE ");
5912 	}
5913 
5914 	if (execlists->switch_priority_hint != INT_MIN)
5915 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5916 			   READ_ONCE(execlists->switch_priority_hint));
5917 	if (execlists->queue_priority_hint != INT_MIN)
5918 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5919 			   READ_ONCE(execlists->queue_priority_hint));
5920 
5921 	last = NULL;
5922 	count = 0;
5923 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5924 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5925 		int i;
5926 
5927 		priolist_for_each_request(rq, p, i) {
5928 			if (count++ < max - 1)
5929 				show_request(m, rq, "\t\tQ ");
5930 			else
5931 				last = rq;
5932 		}
5933 	}
5934 	if (last) {
5935 		if (count > max) {
5936 			drm_printf(m,
5937 				   "\t\t...skipping %d queued requests...\n",
5938 				   count - max);
5939 		}
5940 		show_request(m, last, "\t\tQ ");
5941 	}
5942 
5943 	last = NULL;
5944 	count = 0;
5945 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5946 		struct virtual_engine *ve =
5947 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5948 		struct i915_request *rq = READ_ONCE(ve->request);
5949 
5950 		if (rq) {
5951 			if (count++ < max - 1)
5952 				show_request(m, rq, "\t\tV ");
5953 			else
5954 				last = rq;
5955 		}
5956 	}
5957 	if (last) {
5958 		if (count > max) {
5959 			drm_printf(m,
5960 				   "\t\t...skipping %d virtual requests...\n",
5961 				   count - max);
5962 		}
5963 		show_request(m, last, "\t\tV ");
5964 	}
5965 
5966 	spin_unlock_irqrestore(&engine->active.lock, flags);
5967 }
5968 
5969 void intel_lr_context_reset(struct intel_engine_cs *engine,
5970 			    struct intel_context *ce,
5971 			    u32 head,
5972 			    bool scrub)
5973 {
5974 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5975 
5976 	/*
5977 	 * We want a simple context + ring to execute the breadcrumb update.
5978 	 * We cannot rely on the context being intact across the GPU hang,
5979 	 * so clear it and rebuild just what we need for the breadcrumb.
5980 	 * All pending requests for this context will be zapped, and any
5981 	 * future request will be after userspace has had the opportunity
5982 	 * to recreate its own state.
5983 	 */
5984 	if (scrub)
5985 		restore_default_state(ce, engine);
5986 
5987 	/* Rerun the request; its payload has been neutered (if guilty). */
5988 	__execlists_update_reg_state(ce, engine, head);
5989 }
5990 
5991 bool
5992 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5993 {
5994 	return engine->set_default_submission ==
5995 	       intel_execlists_set_default_submission;
5996 }
5997 
5998 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5999 #include "selftest_lrc.c"
6000 #endif
6001