xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 45e50f48)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't a single set of those per engine command streamer be enough? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers are per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time; instead, it is kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
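
/*
 * Illustrative sketch only (hypothetical helpers peek_head_of_queue() and
 * pop_head_of_queue(), not the driver's actual dequeue path): the pairing
 * rule described above amounts to popping requests off the head of the
 * queue, folding requests that share a context into a single slot, and
 * writing at most two distinct contexts to the ELSP. The second slot may
 * legitimately remain NULL when only one context is ready.
 *
 *	struct i915_request *elsp[2] = { NULL, NULL };
 *	struct i915_request *rq;
 *	int port = 0;
 *
 *	while ((rq = peek_head_of_queue(engine))) {
 *		if (elsp[port] && rq->context == elsp[port]->context) {
 *			elsp[port] = rq;	// same context: coalesce, keep later tail
 *		} else {
 *			if (elsp[port])
 *				port++;
 *			if (port == ARRAY_SIZE(elsp))
 *				break;		// both slots taken, leave the rest queued
 *			elsp[port] = rq;
 *		}
 *		pop_head_of_queue(engine);
 *	}
 */
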
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152 
153 #define RING_EXECLIST_QFULL		(1 << 0x2)
154 #define RING_EXECLIST1_VALID		(1 << 0x3)
155 #define RING_EXECLIST0_VALID		(1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
159 
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
166 
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169 
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171 
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID		0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
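
/*
 * Decoding example (values chosen purely for illustration): an upper CSB
 * dword of 0x00028000 carries a SW context ID of 5 in bits 25:15, so
 * GEN12_CSB_CTX_VALID() is true; a dword with bits 25:15 all set
 * (0x03ff8000) decodes to GEN12_IDLE_CTX_ID (0x7ff) and therefore does
 * not name a valid context.
 */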
178 
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of the physical engines in sibling_mask.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
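/*
 * The lrc_ring_*() helpers below return a dword index into the saved
 * register state: regs[x] holds the register offset written by the LRI
 * and regs[x + 1] holds its value (see execlists_check_context()), or -1
 * if the register is not present in the context image for this engine.
 */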
243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
244 {
245 	if (INTEL_GEN(engine->i915) >= 12)
246 		return 0x60;
247 	else if (INTEL_GEN(engine->i915) >= 9)
248 		return 0x54;
249 	else if (engine->class == RENDER_CLASS)
250 		return 0x58;
251 	else
252 		return -1;
253 }
254 
255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
256 {
257 	if (INTEL_GEN(engine->i915) >= 12)
258 		return 0x74;
259 	else if (INTEL_GEN(engine->i915) >= 9)
260 		return 0x68;
261 	else if (engine->class == RENDER_CLASS)
262 		return 0xd8;
263 	else
264 		return -1;
265 }
266 
267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
268 {
269 	if (INTEL_GEN(engine->i915) >= 12)
270 		return 0x12;
271 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
272 		return 0x18;
273 	else
274 		return -1;
275 }
276 
277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
278 {
279 	int x;
280 
281 	x = lrc_ring_wa_bb_per_ctx(engine);
282 	if (x < 0)
283 		return x;
284 
285 	return x + 2;
286 }
287 
288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
289 {
290 	int x;
291 
292 	x = lrc_ring_indirect_ptr(engine);
293 	if (x < 0)
294 		return x;
295 
296 	return x + 2;
297 }
298 
299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
300 {
301 	if (engine->class != RENDER_CLASS)
302 		return -1;
303 
304 	if (INTEL_GEN(engine->i915) >= 12)
305 		return 0xb6;
306 	else if (INTEL_GEN(engine->i915) >= 11)
307 		return 0xaa;
308 	else
309 		return -1;
310 }
311 
312 static u32
313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
314 {
315 	switch (INTEL_GEN(engine->i915)) {
316 	default:
317 		MISSING_CASE(INTEL_GEN(engine->i915));
318 		fallthrough;
319 	case 12:
320 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
321 	case 11:
322 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
323 	case 10:
324 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
325 	case 9:
326 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
327 	case 8:
328 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
329 	}
330 }
331 
332 static void
333 lrc_ring_setup_indirect_ctx(u32 *regs,
334 			    const struct intel_engine_cs *engine,
335 			    u32 ctx_bb_ggtt_addr,
336 			    u32 size)
337 {
338 	GEM_BUG_ON(!size);
339 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
340 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
341 	regs[lrc_ring_indirect_ptr(engine) + 1] =
342 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
343 
344 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
345 	regs[lrc_ring_indirect_offset(engine) + 1] =
346 		lrc_ring_indirect_offset_default(engine) << 6;
347 }
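
/*
 * Worked example (illustrative numbers): a 4KiB per-context batch buffer
 * at GGTT offset 0x2000, with CACHELINE_BYTES == 64, is encoded as
 * 0x2000 | (4096 / 64) == 0x2040 in the INDIRECT_CTX slot, while the
 * INDIRECT_CTX_OFFSET slot receives the per-gen default shifted up by
 * 6 bits.
 */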
348 
349 static u32 intel_context_get_runtime(const struct intel_context *ce)
350 {
351 	/*
352 	 * We can use either ppHWSP[16] which is recorded before the context
353 	 * switch (and so excludes the cost of context switches) or use the
354 	 * value from the context image itself, which is saved/restored earlier
355 	 * and so includes the cost of the save.
356 	 */
357 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
358 }
359 
360 static void mark_eio(struct i915_request *rq)
361 {
362 	if (i915_request_completed(rq))
363 		return;
364 
365 	GEM_BUG_ON(i915_request_signaled(rq));
366 
367 	i915_request_set_error_once(rq, -EIO);
368 	i915_request_mark_complete(rq);
369 }
370 
371 static struct i915_request *
372 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
373 {
374 	struct i915_request *active = rq;
375 
376 	rcu_read_lock();
377 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
378 		if (i915_request_completed(rq))
379 			break;
380 
381 		active = rq;
382 	}
383 	rcu_read_unlock();
384 
385 	return active;
386 }
387 
388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
389 {
390 	return (i915_ggtt_offset(engine->status_page.vma) +
391 		I915_GEM_HWS_PREEMPT_ADDR);
392 }
393 
394 static inline void
395 ring_set_paused(const struct intel_engine_cs *engine, int state)
396 {
397 	/*
398 	 * We inspect HWS_PREEMPT with a semaphore inside
399 	 * engine->emit_fini_breadcrumb. If the dword is true,
400 	 * the ring is paused as the semaphore will busywait
401 	 * until the dword is false.
402 	 */
403 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
404 	if (state)
405 		wmb();
406 }
407 
408 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
409 {
410 	return rb_entry(rb, struct i915_priolist, node);
411 }
412 
413 static inline int rq_prio(const struct i915_request *rq)
414 {
415 	return READ_ONCE(rq->sched.attr.priority);
416 }
417 
418 static int effective_prio(const struct i915_request *rq)
419 {
420 	int prio = rq_prio(rq);
421 
422 	/*
423 	 * If this request is special and must not be interrupted at any
424 	 * cost, so be it. Note we are only checking the most recent request
425 	 * in the context and so may be masking an earlier vip request. It
426 	 * is hoped that under the conditions where nopreempt is used, this
427 	 * will not matter (i.e. all requests to that context will be
428 	 * nopreempt for as long as desired).
429 	 */
430 	if (i915_request_has_nopreempt(rq))
431 		prio = I915_PRIORITY_UNPREEMPTABLE;
432 
433 	return prio;
434 }
435 
436 static int queue_prio(const struct intel_engine_execlists *execlists)
437 {
438 	struct i915_priolist *p;
439 	struct rb_node *rb;
440 
441 	rb = rb_first_cached(&execlists->queue);
442 	if (!rb)
443 		return INT_MIN;
444 
445 	/*
446 	 * As the priolist[] entries are inverted, with the highest priority in [0],
447 	 * we have to flip the index value back into a priority.
448 	 */
449 	p = to_priolist(rb);
450 	if (!I915_USER_PRIORITY_SHIFT)
451 		return p->priority;
452 
453 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
454 }
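
/*
 * Worked example (hypothetical shift value): with an I915_USER_PRIORITY_SHIFT
 * of 2, a priolist at base priority 1 whose lowest set bit in p->used is
 * bit 0 (ffs() == 1) decodes to ((1 + 1) << 2) - 1 == 7, the highest
 * sub-level of that user priority; were bit 2 the lowest set bit
 * (ffs() == 3), the effective priority would be 5.
 */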
455 
456 static inline bool need_preempt(const struct intel_engine_cs *engine,
457 				const struct i915_request *rq,
458 				struct rb_node *rb)
459 {
460 	int last_prio;
461 
462 	if (!intel_engine_has_semaphores(engine))
463 		return false;
464 
465 	/*
466 	 * Check if the current priority hint merits a preemption attempt.
467 	 *
468 	 * We record the highest priority value we saw during rescheduling
469 	 * prior to this dequeue, therefore we know that if it is strictly
470 	 * less than the current tail of ELSP[0], we do not need to force
471 	 * a preempt-to-idle cycle.
472 	 *
473 	 * However, the priority hint is a mere hint that we may need to
474 	 * preempt. If that hint is stale or we may be trying to preempt
475 	 * ourselves, ignore the request.
476 	 *
477 	 * More naturally we would write
478 	 *      prio >= max(0, last);
479 	 * except that we wish to prevent triggering preemption at the same
480 	 * priority level: the task that is running should remain running
481 	 * to preserve FIFO ordering of dependencies.
482 	 */
483 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
484 	if (engine->execlists.queue_priority_hint <= last_prio)
485 		return false;
486 
487 	/*
488 	 * Check against the first request in ELSP[1]; it will, thanks to the
489 	 * power of PI, be the highest priority of that context.
490 	 */
491 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
492 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
493 		return true;
494 
495 	if (rb) {
496 		struct virtual_engine *ve =
497 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
498 		bool preempt = false;
499 
500 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
501 			struct i915_request *next;
502 
503 			rcu_read_lock();
504 			next = READ_ONCE(ve->request);
505 			if (next)
506 				preempt = rq_prio(next) > last_prio;
507 			rcu_read_unlock();
508 		}
509 
510 		if (preempt)
511 			return preempt;
512 	}
513 
514 	/*
515 	 * If the inflight context did not trigger the preemption, then maybe
516 	 * it was the set of queued requests? Pick the highest priority in
517 	 * the queue (the first active priolist) and see if it deserves to be
518 	 * running instead of ELSP[0].
519 	 *
520 	 * The highest priority request in the queue cannot be either
521 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
522 	 * context, its priority would not exceed ELSP[0] aka last_prio.
523 	 */
524 	return queue_prio(&engine->execlists) > last_prio;
525 }
526 
527 __maybe_unused static inline bool
528 assert_priority_queue(const struct i915_request *prev,
529 		      const struct i915_request *next)
530 {
531 	/*
532 	 * Without preemption, the prev may refer to the still active element
533 	 * which we refuse to let go.
534 	 *
535 	 * Even with preemption, there are times when we think it is better not
536 	 * to preempt and leave an ostensibly lower priority request in flight.
537 	 */
538 	if (i915_request_is_active(prev))
539 		return true;
540 
541 	return rq_prio(prev) >= rq_prio(next);
542 }
543 
544 /*
545  * The context descriptor encodes various attributes of a context,
546  * including its GTT address and some flags. Because it's fairly
547  * expensive to calculate, we'll just do it once and cache the result,
548  * which remains valid until the context is unpinned.
549  *
550  * This is what a descriptor looks like, from LSB to MSB::
551  *
552  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
553  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
554  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
555  *      bits 53-54:    mbz, reserved for use by hardware
556  *      bits 55-63:    group ID, currently unused and set to 0
557  *
558  * Starting from Gen11, the upper dword of the descriptor has a new format:
559  *
560  *      bits 32-36:    reserved
561  *      bits 37-47:    SW context ID
562  *      bits 48-53:    engine instance
563  *      bit 54:        mbz, reserved for use by hardware
564  *      bits 55-60:    SW counter
565  *      bits 61-63:    engine class
566  *
567  * engine info, SW context ID and SW counter need to form a unique number
568  * (Context ID) per lrc.
569  */
570 static u32
571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
572 {
573 	u32 desc;
574 
575 	desc = INTEL_LEGACY_32B_CONTEXT;
576 	if (i915_vm_is_4lvl(ce->vm))
577 		desc = INTEL_LEGACY_64B_CONTEXT;
578 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
579 
580 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
581 	if (IS_GEN(engine->i915, 8))
582 		desc |= GEN8_CTX_L3LLC_COHERENT;
583 
584 	return i915_ggtt_offset(ce->state) | desc;
585 }
586 
587 static inline unsigned int dword_in_page(void *addr)
588 {
589 	return offset_in_page(addr) / sizeof(u32);
590 }
591 
592 static void set_offsets(u32 *regs,
593 			const u8 *data,
594 			const struct intel_engine_cs *engine,
595 			bool clear)
596 #define NOP(x) (BIT(7) | (x))
597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
598 #define POSTED BIT(0)
599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
600 #define REG16(x) \
601 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
602 	(((x) >> 2) & 0x7f)
603 #define END(total_state_size) 0, (total_state_size)
604 {
605 	const u32 base = engine->mmio_base;
606 
607 	while (*data) {
608 		u8 count, flags;
609 
610 		if (*data & BIT(7)) { /* skip */
611 			count = *data++ & ~BIT(7);
612 			if (clear)
613 				memset32(regs, MI_NOOP, count);
614 			regs += count;
615 			continue;
616 		}
617 
618 		count = *data & 0x3f;
619 		flags = *data >> 6;
620 		data++;
621 
622 		*regs = MI_LOAD_REGISTER_IMM(count);
623 		if (flags & POSTED)
624 			*regs |= MI_LRI_FORCE_POSTED;
625 		if (INTEL_GEN(engine->i915) >= 11)
626 			*regs |= MI_LRI_LRM_CS_MMIO;
627 		regs++;
628 
629 		GEM_BUG_ON(!count);
630 		do {
631 			u32 offset = 0;
632 			u8 v;
633 
634 			do {
635 				v = *data++;
636 				offset <<= 7;
637 				offset |= v & ~BIT(7);
638 			} while (v & BIT(7));
639 
640 			regs[0] = base + (offset << 2);
641 			if (clear)
642 				regs[1] = 0;
643 			regs += 2;
644 		} while (--count);
645 	}
646 
647 	if (clear) {
648 		u8 count = *++data;
649 
650 		/* Clear past the tail for HW access */
651 		GEM_BUG_ON(dword_in_page(regs) > count);
652 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
653 
654 		/* Close the batch; used mainly by live_lrc_layout() */
655 		*regs = MI_BATCH_BUFFER_END;
656 		if (INTEL_GEN(engine->i915) >= 10)
657 			*regs |= BIT(0);
658 	}
659 }
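
/*
 * Decoding sketch for the tables below, using the first entries of
 * gen8_xcs_offsets: NOP(1) skips one dword, LRI(11, 0) becomes
 * MI_LOAD_REGISTER_IMM(11), and REG16(0x244) is emitted as the two bytes
 * 0x81, 0x11, which reassemble as ((0x1 << 7) | 0x11) << 2 == 0x244, so
 * the register written is engine->mmio_base + 0x244. REG(0x034) is the
 * single byte 0x0d, giving mmio_base + 0x34.
 */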
660 
661 static const u8 gen8_xcs_offsets[] = {
662 	NOP(1),
663 	LRI(11, 0),
664 	REG16(0x244),
665 	REG(0x034),
666 	REG(0x030),
667 	REG(0x038),
668 	REG(0x03c),
669 	REG(0x168),
670 	REG(0x140),
671 	REG(0x110),
672 	REG(0x11c),
673 	REG(0x114),
674 	REG(0x118),
675 
676 	NOP(9),
677 	LRI(9, 0),
678 	REG16(0x3a8),
679 	REG16(0x28c),
680 	REG16(0x288),
681 	REG16(0x284),
682 	REG16(0x280),
683 	REG16(0x27c),
684 	REG16(0x278),
685 	REG16(0x274),
686 	REG16(0x270),
687 
688 	NOP(13),
689 	LRI(2, 0),
690 	REG16(0x200),
691 	REG(0x028),
692 
693 	END(80)
694 };
695 
696 static const u8 gen9_xcs_offsets[] = {
697 	NOP(1),
698 	LRI(14, POSTED),
699 	REG16(0x244),
700 	REG(0x034),
701 	REG(0x030),
702 	REG(0x038),
703 	REG(0x03c),
704 	REG(0x168),
705 	REG(0x140),
706 	REG(0x110),
707 	REG(0x11c),
708 	REG(0x114),
709 	REG(0x118),
710 	REG(0x1c0),
711 	REG(0x1c4),
712 	REG(0x1c8),
713 
714 	NOP(3),
715 	LRI(9, POSTED),
716 	REG16(0x3a8),
717 	REG16(0x28c),
718 	REG16(0x288),
719 	REG16(0x284),
720 	REG16(0x280),
721 	REG16(0x27c),
722 	REG16(0x278),
723 	REG16(0x274),
724 	REG16(0x270),
725 
726 	NOP(13),
727 	LRI(1, POSTED),
728 	REG16(0x200),
729 
730 	NOP(13),
731 	LRI(44, POSTED),
732 	REG(0x028),
733 	REG(0x09c),
734 	REG(0x0c0),
735 	REG(0x178),
736 	REG(0x17c),
737 	REG16(0x358),
738 	REG(0x170),
739 	REG(0x150),
740 	REG(0x154),
741 	REG(0x158),
742 	REG16(0x41c),
743 	REG16(0x600),
744 	REG16(0x604),
745 	REG16(0x608),
746 	REG16(0x60c),
747 	REG16(0x610),
748 	REG16(0x614),
749 	REG16(0x618),
750 	REG16(0x61c),
751 	REG16(0x620),
752 	REG16(0x624),
753 	REG16(0x628),
754 	REG16(0x62c),
755 	REG16(0x630),
756 	REG16(0x634),
757 	REG16(0x638),
758 	REG16(0x63c),
759 	REG16(0x640),
760 	REG16(0x644),
761 	REG16(0x648),
762 	REG16(0x64c),
763 	REG16(0x650),
764 	REG16(0x654),
765 	REG16(0x658),
766 	REG16(0x65c),
767 	REG16(0x660),
768 	REG16(0x664),
769 	REG16(0x668),
770 	REG16(0x66c),
771 	REG16(0x670),
772 	REG16(0x674),
773 	REG16(0x678),
774 	REG16(0x67c),
775 	REG(0x068),
776 
777 	END(176)
778 };
779 
780 static const u8 gen12_xcs_offsets[] = {
781 	NOP(1),
782 	LRI(13, POSTED),
783 	REG16(0x244),
784 	REG(0x034),
785 	REG(0x030),
786 	REG(0x038),
787 	REG(0x03c),
788 	REG(0x168),
789 	REG(0x140),
790 	REG(0x110),
791 	REG(0x1c0),
792 	REG(0x1c4),
793 	REG(0x1c8),
794 	REG(0x180),
795 	REG16(0x2b4),
796 
797 	NOP(5),
798 	LRI(9, POSTED),
799 	REG16(0x3a8),
800 	REG16(0x28c),
801 	REG16(0x288),
802 	REG16(0x284),
803 	REG16(0x280),
804 	REG16(0x27c),
805 	REG16(0x278),
806 	REG16(0x274),
807 	REG16(0x270),
808 
809 	END(80)
810 };
811 
812 static const u8 gen8_rcs_offsets[] = {
813 	NOP(1),
814 	LRI(14, POSTED),
815 	REG16(0x244),
816 	REG(0x034),
817 	REG(0x030),
818 	REG(0x038),
819 	REG(0x03c),
820 	REG(0x168),
821 	REG(0x140),
822 	REG(0x110),
823 	REG(0x11c),
824 	REG(0x114),
825 	REG(0x118),
826 	REG(0x1c0),
827 	REG(0x1c4),
828 	REG(0x1c8),
829 
830 	NOP(3),
831 	LRI(9, POSTED),
832 	REG16(0x3a8),
833 	REG16(0x28c),
834 	REG16(0x288),
835 	REG16(0x284),
836 	REG16(0x280),
837 	REG16(0x27c),
838 	REG16(0x278),
839 	REG16(0x274),
840 	REG16(0x270),
841 
842 	NOP(13),
843 	LRI(1, 0),
844 	REG(0x0c8),
845 
846 	END(80)
847 };
848 
849 static const u8 gen9_rcs_offsets[] = {
850 	NOP(1),
851 	LRI(14, POSTED),
852 	REG16(0x244),
853 	REG(0x34),
854 	REG(0x30),
855 	REG(0x38),
856 	REG(0x3c),
857 	REG(0x168),
858 	REG(0x140),
859 	REG(0x110),
860 	REG(0x11c),
861 	REG(0x114),
862 	REG(0x118),
863 	REG(0x1c0),
864 	REG(0x1c4),
865 	REG(0x1c8),
866 
867 	NOP(3),
868 	LRI(9, POSTED),
869 	REG16(0x3a8),
870 	REG16(0x28c),
871 	REG16(0x288),
872 	REG16(0x284),
873 	REG16(0x280),
874 	REG16(0x27c),
875 	REG16(0x278),
876 	REG16(0x274),
877 	REG16(0x270),
878 
879 	NOP(13),
880 	LRI(1, 0),
881 	REG(0xc8),
882 
883 	NOP(13),
884 	LRI(44, POSTED),
885 	REG(0x28),
886 	REG(0x9c),
887 	REG(0xc0),
888 	REG(0x178),
889 	REG(0x17c),
890 	REG16(0x358),
891 	REG(0x170),
892 	REG(0x150),
893 	REG(0x154),
894 	REG(0x158),
895 	REG16(0x41c),
896 	REG16(0x600),
897 	REG16(0x604),
898 	REG16(0x608),
899 	REG16(0x60c),
900 	REG16(0x610),
901 	REG16(0x614),
902 	REG16(0x618),
903 	REG16(0x61c),
904 	REG16(0x620),
905 	REG16(0x624),
906 	REG16(0x628),
907 	REG16(0x62c),
908 	REG16(0x630),
909 	REG16(0x634),
910 	REG16(0x638),
911 	REG16(0x63c),
912 	REG16(0x640),
913 	REG16(0x644),
914 	REG16(0x648),
915 	REG16(0x64c),
916 	REG16(0x650),
917 	REG16(0x654),
918 	REG16(0x658),
919 	REG16(0x65c),
920 	REG16(0x660),
921 	REG16(0x664),
922 	REG16(0x668),
923 	REG16(0x66c),
924 	REG16(0x670),
925 	REG16(0x674),
926 	REG16(0x678),
927 	REG16(0x67c),
928 	REG(0x68),
929 
930 	END(176)
931 };
932 
933 static const u8 gen11_rcs_offsets[] = {
934 	NOP(1),
935 	LRI(15, POSTED),
936 	REG16(0x244),
937 	REG(0x034),
938 	REG(0x030),
939 	REG(0x038),
940 	REG(0x03c),
941 	REG(0x168),
942 	REG(0x140),
943 	REG(0x110),
944 	REG(0x11c),
945 	REG(0x114),
946 	REG(0x118),
947 	REG(0x1c0),
948 	REG(0x1c4),
949 	REG(0x1c8),
950 	REG(0x180),
951 
952 	NOP(1),
953 	LRI(9, POSTED),
954 	REG16(0x3a8),
955 	REG16(0x28c),
956 	REG16(0x288),
957 	REG16(0x284),
958 	REG16(0x280),
959 	REG16(0x27c),
960 	REG16(0x278),
961 	REG16(0x274),
962 	REG16(0x270),
963 
964 	LRI(1, POSTED),
965 	REG(0x1b0),
966 
967 	NOP(10),
968 	LRI(1, 0),
969 	REG(0x0c8),
970 
971 	END(80)
972 };
973 
974 static const u8 gen12_rcs_offsets[] = {
975 	NOP(1),
976 	LRI(13, POSTED),
977 	REG16(0x244),
978 	REG(0x034),
979 	REG(0x030),
980 	REG(0x038),
981 	REG(0x03c),
982 	REG(0x168),
983 	REG(0x140),
984 	REG(0x110),
985 	REG(0x1c0),
986 	REG(0x1c4),
987 	REG(0x1c8),
988 	REG(0x180),
989 	REG16(0x2b4),
990 
991 	NOP(5),
992 	LRI(9, POSTED),
993 	REG16(0x3a8),
994 	REG16(0x28c),
995 	REG16(0x288),
996 	REG16(0x284),
997 	REG16(0x280),
998 	REG16(0x27c),
999 	REG16(0x278),
1000 	REG16(0x274),
1001 	REG16(0x270),
1002 
1003 	LRI(3, POSTED),
1004 	REG(0x1b0),
1005 	REG16(0x5a8),
1006 	REG16(0x5ac),
1007 
1008 	NOP(6),
1009 	LRI(1, 0),
1010 	REG(0x0c8),
1011 	NOP(3 + 9 + 1),
1012 
1013 	LRI(51, POSTED),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG16(0x588),
1020 	REG(0x028),
1021 	REG(0x09c),
1022 	REG(0x0c0),
1023 	REG(0x178),
1024 	REG(0x17c),
1025 	REG16(0x358),
1026 	REG(0x170),
1027 	REG(0x150),
1028 	REG(0x154),
1029 	REG(0x158),
1030 	REG16(0x41c),
1031 	REG16(0x600),
1032 	REG16(0x604),
1033 	REG16(0x608),
1034 	REG16(0x60c),
1035 	REG16(0x610),
1036 	REG16(0x614),
1037 	REG16(0x618),
1038 	REG16(0x61c),
1039 	REG16(0x620),
1040 	REG16(0x624),
1041 	REG16(0x628),
1042 	REG16(0x62c),
1043 	REG16(0x630),
1044 	REG16(0x634),
1045 	REG16(0x638),
1046 	REG16(0x63c),
1047 	REG16(0x640),
1048 	REG16(0x644),
1049 	REG16(0x648),
1050 	REG16(0x64c),
1051 	REG16(0x650),
1052 	REG16(0x654),
1053 	REG16(0x658),
1054 	REG16(0x65c),
1055 	REG16(0x660),
1056 	REG16(0x664),
1057 	REG16(0x668),
1058 	REG16(0x66c),
1059 	REG16(0x670),
1060 	REG16(0x674),
1061 	REG16(0x678),
1062 	REG16(0x67c),
1063 	REG(0x068),
1064 	REG(0x084),
1065 	NOP(1),
1066 
1067 	END(192)
1068 };
1069 
1070 #undef END
1071 #undef REG16
1072 #undef REG
1073 #undef LRI
1074 #undef NOP
1075 
1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1077 {
1078 	/*
1079 	 * The gen12+ lists only have the registers we program in the basic
1080 	 * default state. We rely on the context image using relative
1081 	 * addressing to automatically fix up the register state between the
1082 	 * physical engines for a virtual engine.
1083 	 */
1084 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1085 		   !intel_engine_has_relative_mmio(engine));
1086 
1087 	if (engine->class == RENDER_CLASS) {
1088 		if (INTEL_GEN(engine->i915) >= 12)
1089 			return gen12_rcs_offsets;
1090 		else if (INTEL_GEN(engine->i915) >= 11)
1091 			return gen11_rcs_offsets;
1092 		else if (INTEL_GEN(engine->i915) >= 9)
1093 			return gen9_rcs_offsets;
1094 		else
1095 			return gen8_rcs_offsets;
1096 	} else {
1097 		if (INTEL_GEN(engine->i915) >= 12)
1098 			return gen12_xcs_offsets;
1099 		else if (INTEL_GEN(engine->i915) >= 9)
1100 			return gen9_xcs_offsets;
1101 		else
1102 			return gen8_xcs_offsets;
1103 	}
1104 }
1105 
1106 static struct i915_request *
1107 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 {
1109 	struct i915_request *rq, *rn, *active = NULL;
1110 	struct list_head *pl;
1111 	int prio = I915_PRIORITY_INVALID;
1112 
1113 	lockdep_assert_held(&engine->active.lock);
1114 
1115 	list_for_each_entry_safe_reverse(rq, rn,
1116 					 &engine->active.requests,
1117 					 sched.link) {
1118 		if (i915_request_completed(rq))
1119 			continue; /* XXX */
1120 
1121 		__i915_request_unsubmit(rq);
1122 
1123 		/*
1124 		 * Push the request back into the queue for later resubmission.
1125 		 * If this request is not native to this physical engine (i.e.
1126 		 * it came from a virtual source), push it back onto the virtual
1127 		 * engine so that it can be moved across onto another physical
1128 		 * engine as load dictates.
1129 		 */
1130 		if (likely(rq->execution_mask == engine->mask)) {
1131 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1132 			if (rq_prio(rq) != prio) {
1133 				prio = rq_prio(rq);
1134 				pl = i915_sched_lookup_priolist(engine, prio);
1135 			}
1136 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137 
1138 			list_move(&rq->sched.link, pl);
1139 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140 
1141 			/* Check in case we roll back so far that we wrap [size/2] */
1142 			if (intel_ring_direction(rq->ring,
1143 						 rq->tail,
1144 						 rq->ring->tail + 8) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			WRITE_ONCE(rq->engine, owner);
1152 			owner->submit_request(rq);
1153 			active = NULL;
1154 		}
1155 	}
1156 
1157 	return active;
1158 }
1159 
1160 struct i915_request *
1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1162 {
1163 	struct intel_engine_cs *engine =
1164 		container_of(execlists, typeof(*engine), execlists);
1165 
1166 	return __unwind_incomplete_requests(engine);
1167 }
1168 
1169 static inline void
1170 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1171 {
1172 	/*
1173 	 * This is currently only used when GVT-g is enabled. When GVT-g is
1174 	 * disabled, the compiler should eliminate this function as dead code.
1175 	 */
1176 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1177 		return;
1178 
1179 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1180 				   status, rq);
1181 }
1182 
1183 static void intel_engine_context_in(struct intel_engine_cs *engine)
1184 {
1185 	unsigned long flags;
1186 
1187 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1188 		return;
1189 
1190 	write_seqlock_irqsave(&engine->stats.lock, flags);
1191 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1192 		engine->stats.start = ktime_get();
1193 		atomic_inc(&engine->stats.active);
1194 	}
1195 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1196 }
1197 
1198 static void intel_engine_context_out(struct intel_engine_cs *engine)
1199 {
1200 	unsigned long flags;
1201 
1202 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1203 
1204 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1205 		return;
1206 
1207 	write_seqlock_irqsave(&engine->stats.lock, flags);
1208 	if (atomic_dec_and_test(&engine->stats.active)) {
1209 		engine->stats.total =
1210 			ktime_add(engine->stats.total,
1211 				  ktime_sub(ktime_get(), engine->stats.start));
1212 	}
1213 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1214 }
1215 
1216 static void
1217 execlists_check_context(const struct intel_context *ce,
1218 			const struct intel_engine_cs *engine,
1219 			const char *when)
1220 {
1221 	const struct intel_ring *ring = ce->ring;
1222 	u32 *regs = ce->lrc_reg_state;
1223 	bool valid = true;
1224 	int x;
1225 
1226 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1227 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1228 		       engine->name,
1229 		       regs[CTX_RING_START],
1230 		       i915_ggtt_offset(ring->vma));
1231 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1232 		valid = false;
1233 	}
1234 
1235 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1236 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1237 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1238 		       engine->name,
1239 		       regs[CTX_RING_CTL],
1240 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1241 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1242 		valid = false;
1243 	}
1244 
1245 	x = lrc_ring_mi_mode(engine);
1246 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1247 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1248 		       engine->name, regs[x + 1]);
1249 		regs[x + 1] &= ~STOP_RING;
1250 		regs[x + 1] |= STOP_RING << 16;
1251 		valid = false;
1252 	}
1253 
1254 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1255 }
1256 
1257 static void restore_default_state(struct intel_context *ce,
1258 				  struct intel_engine_cs *engine)
1259 {
1260 	u32 *regs;
1261 
1262 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1263 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1264 
1265 	ce->runtime.last = intel_context_get_runtime(ce);
1266 }
1267 
1268 static void reset_active(struct i915_request *rq,
1269 			 struct intel_engine_cs *engine)
1270 {
1271 	struct intel_context * const ce = rq->context;
1272 	u32 head;
1273 
1274 	/*
1275 	 * The executing context has been cancelled. We want to prevent
1276 	 * further execution along this context and propagate the error on
1277 	 * to anything depending on its results.
1278 	 *
1279 	 * In __i915_request_submit(), we apply the -EIO and remove the
1280 	 * requests' payloads for any banned requests. But first, we must
1281 	 * rewind the context back to the start of the incomplete request so
1282 	 * that we do not jump back into the middle of the batch.
1283 	 *
1284 	 * We preserve the breadcrumbs and semaphores of the incomplete
1285 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1286 	 * remain correctly ordered. And we defer to __i915_request_submit()
1287 	 * so that all asynchronous waits are correctly handled.
1288 	 */
1289 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1290 		     rq->fence.context, rq->fence.seqno);
1291 
1292 	/* On resubmission of the active request, payload will be scrubbed */
1293 	if (i915_request_completed(rq))
1294 		head = rq->tail;
1295 	else
1296 		head = active_request(ce->timeline, rq)->head;
1297 	head = intel_ring_wrap(ce->ring, head);
1298 
1299 	/* Scrub the context image to prevent replaying the previous batch */
1300 	restore_default_state(ce, engine);
1301 	__execlists_update_reg_state(ce, engine, head);
1302 
1303 	/* We've switched away, so this should be a no-op, but intent matters */
1304 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1305 }
1306 
1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1308 {
1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1310 	ce->runtime.num_underflow++;
1311 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1312 #endif
1313 }
1314 
1315 static void intel_context_update_runtime(struct intel_context *ce)
1316 {
1317 	u32 old;
1318 	s32 dt;
1319 
1320 	if (intel_context_is_barrier(ce))
1321 		return;
1322 
1323 	old = ce->runtime.last;
1324 	ce->runtime.last = intel_context_get_runtime(ce);
1325 	dt = ce->runtime.last - old;
1326 
1327 	if (unlikely(dt < 0)) {
1328 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1329 			 old, ce->runtime.last, dt);
1330 		st_update_runtime_underflow(ce, dt);
1331 		return;
1332 	}
1333 
1334 	ewma_runtime_add(&ce->runtime.avg, dt);
1335 	ce->runtime.total += dt;
1336 }
1337 
1338 static inline struct intel_engine_cs *
1339 __execlists_schedule_in(struct i915_request *rq)
1340 {
1341 	struct intel_engine_cs * const engine = rq->engine;
1342 	struct intel_context * const ce = rq->context;
1343 
1344 	intel_context_get(ce);
1345 
1346 	if (unlikely(intel_context_is_banned(ce)))
1347 		reset_active(rq, engine);
1348 
1349 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1350 		execlists_check_context(ce, engine, "before");
1351 
1352 	if (ce->tag) {
1353 		/* Use a fixed tag for OA and friends */
1354 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1355 		ce->lrc.ccid = ce->tag;
1356 	} else {
1357 		/* We don't need a strict matching tag, just different values */
1358 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1359 
1360 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1361 		clear_bit(tag - 1, &engine->context_tag);
1362 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1363 
1364 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1365 	}
1366 
1367 	ce->lrc.ccid |= engine->execlists.ccid;
1368 
1369 	__intel_gt_pm_get(engine->gt);
1370 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1371 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1372 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1373 	intel_engine_context_in(engine);
1374 
1375 	return engine;
1376 }
1377 
1378 static inline struct i915_request *
1379 execlists_schedule_in(struct i915_request *rq, int idx)
1380 {
1381 	struct intel_context * const ce = rq->context;
1382 	struct intel_engine_cs *old;
1383 
1384 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1385 	trace_i915_request_in(rq, idx);
1386 
1387 	old = READ_ONCE(ce->inflight);
1388 	do {
1389 		if (!old) {
1390 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1391 			break;
1392 		}
1393 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1394 
1395 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1396 	return i915_request_get(rq);
1397 }
1398 
1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1400 {
1401 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1402 	struct i915_request *next = READ_ONCE(ve->request);
1403 
1404 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1405 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1406 }
1407 
1408 static inline void
1409 __execlists_schedule_out(struct i915_request *rq,
1410 			 struct intel_engine_cs * const engine,
1411 			 unsigned int ccid)
1412 {
1413 	struct intel_context * const ce = rq->context;
1414 
1415 	/*
1416 	 * NB process_csb() is not under the engine->active.lock and hence
1417 	 * schedule_out can race with schedule_in meaning that we should
1418 	 * refrain from doing non-trivial work here.
1419 	 */
1420 
1421 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1422 		execlists_check_context(ce, engine, "after");
1423 
1424 	/*
1425 	 * If we have just completed this context, the engine may now be
1426 	 * idle and we want to re-enter powersaving.
1427 	 */
1428 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1429 	    i915_request_completed(rq))
1430 		intel_engine_add_retire(engine, ce->timeline);
1431 
1432 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1433 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1434 	if (ccid < BITS_PER_LONG) {
1435 		GEM_BUG_ON(ccid == 0);
1436 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1437 		set_bit(ccid - 1, &engine->context_tag);
1438 	}
1439 
1440 	intel_context_update_runtime(ce);
1441 	intel_engine_context_out(engine);
1442 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1443 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1444 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1445 	intel_gt_pm_put_async(engine->gt);
1446 
1447 	/*
1448 	 * If this is part of a virtual engine, its next request may
1449 	 * have been blocked waiting for access to the active context.
1450 	 * We have to kick all the siblings again in case we need to
1451 	 * switch (e.g. the next request is not runnable on this
1452 	 * engine). Hopefully, we will already have submitted the next
1453 	 * request before the tasklet runs and do not need to rebuild
1454 	 * each virtual tree and kick everyone again.
1455 	 */
1456 	if (ce->engine != engine)
1457 		kick_siblings(rq, ce);
1458 
1459 	intel_context_put(ce);
1460 }
1461 
1462 static inline void
1463 execlists_schedule_out(struct i915_request *rq)
1464 {
1465 	struct intel_context * const ce = rq->context;
1466 	struct intel_engine_cs *cur, *old;
1467 	u32 ccid;
1468 
1469 	trace_i915_request_out(rq);
1470 
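	/*
	 * ce->inflight packs a small submission count into the low bits of
	 * the engine pointer (see ptr_inc/ptr_unmask_bits); the full
	 * schedule-out below only runs once that count returns to zero.
	 */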
1471 	ccid = rq->context->lrc.ccid;
1472 	old = READ_ONCE(ce->inflight);
1473 	do
1474 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1475 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1476 	if (!cur)
1477 		__execlists_schedule_out(rq, old, ccid);
1478 
1479 	i915_request_put(rq);
1480 }
1481 
1482 static u64 execlists_update_context(struct i915_request *rq)
1483 {
1484 	struct intel_context *ce = rq->context;
1485 	u64 desc = ce->lrc.desc;
1486 	u32 tail, prev;
1487 
1488 	/*
1489 	 * WaIdleLiteRestore:bdw,skl
1490 	 *
1491 	 * We should never submit the context with the same RING_TAIL twice
1492 	 * just in case we submit an empty ring, which confuses the HW.
1493 	 *
1494 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1495 	 * the normal request to be able to always advance the RING_TAIL on
1496 	 * subsequent resubmissions (for lite restore). Should that fail us,
1497 	 * and we try and submit the same tail again, force the context
1498 	 * reload.
1499 	 *
1500 	 * If we need to return to a preempted context, we need to skip the
1501 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1502 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1503 	 * an earlier request.
1504 	 */
1505 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1506 	prev = rq->ring->tail;
1507 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1508 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1509 		desc |= CTX_DESC_FORCE_RESTORE;
1510 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1511 	rq->tail = rq->wa_tail;
1512 
1513 	/*
1514 	 * Make sure the context image is complete before we submit it to HW.
1515 	 *
1516 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1517 	 * an uncached write such as our mmio register access, the empirical
1518 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1519 	 * may not be visible to the HW prior to the completion of the UC
1520 	 * register write and that we may begin execution from the context
1521 	 * before its image is complete leading to invalid PD chasing.
1522 	 */
1523 	wmb();
1524 
1525 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1526 	return desc;
1527 }
1528 
1529 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1530 {
1531 	if (execlists->ctrl_reg) {
1532 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1533 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1534 	} else {
1535 		writel(upper_32_bits(desc), execlists->submit_reg);
1536 		writel(lower_32_bits(desc), execlists->submit_reg);
1537 	}
1538 }
1539 
1540 static __maybe_unused char *
1541 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1542 {
1543 	if (!rq)
1544 		return "";
1545 
1546 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1547 		 prefix,
1548 		 rq->context->lrc.ccid,
1549 		 rq->fence.context, rq->fence.seqno,
1550 		 i915_request_completed(rq) ? "!" :
1551 		 i915_request_started(rq) ? "*" :
1552 		 "",
1553 		 rq_prio(rq));
1554 
1555 	return buf;
1556 }
1557 
1558 static __maybe_unused void
1559 trace_ports(const struct intel_engine_execlists *execlists,
1560 	    const char *msg,
1561 	    struct i915_request * const *ports)
1562 {
1563 	const struct intel_engine_cs *engine =
1564 		container_of(execlists, typeof(*engine), execlists);
1565 	char __maybe_unused p0[40], p1[40];
1566 
1567 	if (!ports[0])
1568 		return;
1569 
1570 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1571 		     dump_port(p0, sizeof(p0), "", ports[0]),
1572 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1573 }
1574 
1575 static inline bool
1576 reset_in_progress(const struct intel_engine_execlists *execlists)
1577 {
1578 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1579 }
1580 
1581 static __maybe_unused bool
1582 assert_pending_valid(const struct intel_engine_execlists *execlists,
1583 		     const char *msg)
1584 {
1585 	struct intel_engine_cs *engine =
1586 		container_of(execlists, typeof(*engine), execlists);
1587 	struct i915_request * const *port, *rq;
1588 	struct intel_context *ce = NULL;
1589 	bool sentinel = false;
1590 	u32 ccid = -1;
1591 
1592 	trace_ports(execlists, msg, execlists->pending);
1593 
1594 	/* We may be messing around with the lists during reset, lalala */
1595 	if (reset_in_progress(execlists))
1596 		return true;
1597 
1598 	if (!execlists->pending[0]) {
1599 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1600 			      engine->name);
1601 		return false;
1602 	}
1603 
1604 	if (execlists->pending[execlists_num_ports(execlists)]) {
1605 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1606 			      engine->name, execlists_num_ports(execlists));
1607 		return false;
1608 	}
1609 
1610 	for (port = execlists->pending; (rq = *port); port++) {
1611 		unsigned long flags;
1612 		bool ok = true;
1613 
1614 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1615 		GEM_BUG_ON(!i915_request_is_active(rq));
1616 
1617 		if (ce == rq->context) {
1618 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1619 				      engine->name,
1620 				      ce->timeline->fence_context,
1621 				      port - execlists->pending);
1622 			return false;
1623 		}
1624 		ce = rq->context;
1625 
1626 		if (ccid == ce->lrc.ccid) {
1627 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1628 				      engine->name,
1629 				      ccid, ce->timeline->fence_context,
1630 				      port - execlists->pending);
1631 			return false;
1632 		}
1633 		ccid = ce->lrc.ccid;
1634 
1635 		/*
1636 		 * Sentinels are supposed to be the last request so they flush
1637 		 * the current execution off the HW. Check that they are the only
1638 		 * request in the pending submission.
1639 		 */
1640 		if (sentinel) {
1641 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1642 				      engine->name,
1643 				      ce->timeline->fence_context,
1644 				      port - execlists->pending);
1645 			return false;
1646 		}
1647 		sentinel = i915_request_has_sentinel(rq);
1648 
1649 		/* Hold tightly onto the lock to prevent concurrent retires! */
1650 		if (!spin_trylock_irqsave(&rq->lock, flags))
1651 			continue;
1652 
1653 		if (i915_request_completed(rq))
1654 			goto unlock;
1655 
1656 		if (i915_active_is_idle(&ce->active) &&
1657 		    !intel_context_is_barrier(ce)) {
1658 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1659 				      engine->name,
1660 				      ce->timeline->fence_context,
1661 				      port - execlists->pending);
1662 			ok = false;
1663 			goto unlock;
1664 		}
1665 
1666 		if (!i915_vma_is_pinned(ce->state)) {
1667 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1668 				      engine->name,
1669 				      ce->timeline->fence_context,
1670 				      port - execlists->pending);
1671 			ok = false;
1672 			goto unlock;
1673 		}
1674 
1675 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1676 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1677 				      engine->name,
1678 				      ce->timeline->fence_context,
1679 				      port - execlists->pending);
1680 			ok = false;
1681 			goto unlock;
1682 		}
1683 
1684 unlock:
1685 		spin_unlock_irqrestore(&rq->lock, flags);
1686 		if (!ok)
1687 			return false;
1688 	}
1689 
1690 	return ce;
1691 }
1692 
1693 static void execlists_submit_ports(struct intel_engine_cs *engine)
1694 {
1695 	struct intel_engine_execlists *execlists = &engine->execlists;
1696 	unsigned int n;
1697 
1698 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1699 
1700 	/*
1701 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1702 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1703 	 * not be relinquished until the device is idle (see
1704 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1705 	 * that all ELSP are drained i.e. we have processed the CSB,
1706 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1707 	 */
1708 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1709 
1710 	/*
1711 	 * ELSQ note: the submit queue is not cleared after being submitted
1712 	 * to the HW so we need to make sure we always clean it up. This is
1713 	 * currently ensured by the fact that we always write the same number
1714 	 * of elsq entries; keep this in mind before changing the loop below.
1715 	 */
1716 	for (n = execlists_num_ports(execlists); n--; ) {
1717 		struct i915_request *rq = execlists->pending[n];
1718 
1719 		write_desc(execlists,
1720 			   rq ? execlists_update_context(rq) : 0,
1721 			   n);
1722 	}
1723 
1724 	/* we need to manually load the submit queue */
1725 	if (execlists->ctrl_reg)
1726 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1727 }
1728 
1729 static bool ctx_single_port_submission(const struct intel_context *ce)
1730 {
1731 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1732 		intel_context_force_single_submission(ce));
1733 }
1734 
1735 static bool can_merge_ctx(const struct intel_context *prev,
1736 			  const struct intel_context *next)
1737 {
1738 	if (prev != next)
1739 		return false;
1740 
1741 	if (ctx_single_port_submission(prev))
1742 		return false;
1743 
1744 	return true;
1745 }
1746 
1747 static unsigned long i915_request_flags(const struct i915_request *rq)
1748 {
1749 	return READ_ONCE(rq->fence.flags);
1750 }
1751 
1752 static bool can_merge_rq(const struct i915_request *prev,
1753 			 const struct i915_request *next)
1754 {
1755 	GEM_BUG_ON(prev == next);
1756 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1757 
1758 	/*
1759 	 * We do not submit known completed requests. Therefore if the next
1760 	 * request is already completed, we can pretend to merge it in
1761 	 * with the previous context (and we will skip updating the ELSP
1762 	 * and tracking). Thus hopefully keeping the ELSP full with active
1763 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1764 	 * us.
1765 	 */
1766 	if (i915_request_completed(next))
1767 		return true;
1768 
1769 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1770 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1771 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1772 		return false;
1773 
1774 	if (!can_merge_ctx(prev->context, next->context))
1775 		return false;
1776 
1777 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1778 	return true;
1779 }
1780 
1781 static void virtual_update_register_offsets(u32 *regs,
1782 					    struct intel_engine_cs *engine)
1783 {
1784 	set_offsets(regs, reg_offsets(engine), engine, false);
1785 }
1786 
1787 static bool virtual_matches(const struct virtual_engine *ve,
1788 			    const struct i915_request *rq,
1789 			    const struct intel_engine_cs *engine)
1790 {
1791 	const struct intel_engine_cs *inflight;
1792 
1793 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1794 		return false;
1795 
1796 	/*
1797 	 * We track when the HW has completed saving the context image
1798 	 * (i.e. when we have seen the final CS event switching out of
1799 	 * the context) and must not overwrite the context image before
1800 	 * then. This restricts us to only using the active engine
1801 	 * while the previous virtualized request is inflight (so
1802 	 * we reuse the register offsets). This is a very small
1803 	 * hysteresis on the greedy selection algorithm.
1804 	 */
1805 	inflight = intel_context_inflight(&ve->context);
1806 	if (inflight && inflight != engine)
1807 		return false;
1808 
1809 	return true;
1810 }
1811 
1812 static void virtual_xfer_context(struct virtual_engine *ve,
1813 				 struct intel_engine_cs *engine)
1814 {
1815 	unsigned int n;
1816 
1817 	if (likely(engine == ve->siblings[0]))
1818 		return;
1819 
1820 	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1821 	if (!intel_engine_has_relative_mmio(engine))
1822 		virtual_update_register_offsets(ve->context.lrc_reg_state,
1823 						engine);
1824 
1825 	/*
1826 	 * Move the bound engine to the top of the list for
1827 	 * future execution. We then kick this tasklet first
1828 	 * before checking others, so that we preferentially
1829 	 * reuse this set of bound registers.
1830 	 */
1831 	for (n = 1; n < ve->num_siblings; n++) {
1832 		if (ve->siblings[n] == engine) {
1833 			swap(ve->siblings[n], ve->siblings[0]);
1834 			break;
1835 		}
1836 	}
1837 }
1838 
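/*
 * Lockless iterators over the scheduler's dependency lists: the waiter walk
 * tolerates concurrent list updates, while the signaler walk is an RCU
 * iteration (see the rcu_read_lock() in hold_request() below).
 */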
1839 #define for_each_waiter(p__, rq__) \
1840 	list_for_each_entry_lockless(p__, \
1841 				     &(rq__)->sched.waiters_list, \
1842 				     wait_link)
1843 
1844 #define for_each_signaler(p__, rq__) \
1845 	list_for_each_entry_rcu(p__, \
1846 				&(rq__)->sched.signalers_list, \
1847 				signal_link)
1848 
1849 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1850 {
1851 	LIST_HEAD(list);
1852 
1853 	/*
1854 	 * We want to move the interrupted request to the back of
1855 	 * the round-robin list (i.e. its priority level), but
1856 	 * in doing so we must also move every request that was in
1857 	 * flight and waiting on the interrupted request, so that they
1858 	 * are run after it again.
1859 	 */
1860 	do {
1861 		struct i915_dependency *p;
1862 
1863 		GEM_BUG_ON(i915_request_is_active(rq));
1864 		list_move_tail(&rq->sched.link, pl);
1865 
1866 		for_each_waiter(p, rq) {
1867 			struct i915_request *w =
1868 				container_of(p->waiter, typeof(*w), sched);
1869 
1870 			if (p->flags & I915_DEPENDENCY_WEAK)
1871 				continue;
1872 
1873 			/* Leave semaphores spinning on the other engines */
1874 			if (w->engine != rq->engine)
1875 				continue;
1876 
1877 			/* No waiter should start before its signaler */
1878 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1879 				   i915_request_started(w) &&
1880 				   !i915_request_completed(rq));
1881 
1882 			GEM_BUG_ON(i915_request_is_active(w));
1883 			if (!i915_request_is_ready(w))
1884 				continue;
1885 
1886 			if (rq_prio(w) < rq_prio(rq))
1887 				continue;
1888 
1889 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1890 			list_move_tail(&w->sched.link, &list);
1891 		}
1892 
1893 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1894 	} while (rq);
1895 }
1896 
1897 static void defer_active(struct intel_engine_cs *engine)
1898 {
1899 	struct i915_request *rq;
1900 
1901 	rq = __unwind_incomplete_requests(engine);
1902 	if (!rq)
1903 		return;
1904 
1905 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1906 }
1907 
1908 static bool
1909 need_timeslice(const struct intel_engine_cs *engine,
1910 	       const struct i915_request *rq,
1911 	       const struct rb_node *rb)
1912 {
1913 	int hint;
1914 
1915 	if (!intel_engine_has_timeslices(engine))
1916 		return false;
1917 
1918 	hint = engine->execlists.queue_priority_hint;
1919 
1920 	if (rb) {
1921 		const struct virtual_engine *ve =
1922 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1923 		const struct intel_engine_cs *inflight =
1924 			intel_context_inflight(&ve->context);
1925 
1926 		if (!inflight || inflight == engine) {
1927 			struct i915_request *next;
1928 
1929 			rcu_read_lock();
1930 			next = READ_ONCE(ve->request);
1931 			if (next)
1932 				hint = max(hint, rq_prio(next));
1933 			rcu_read_unlock();
1934 		}
1935 	}
1936 
1937 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1938 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1939 
1940 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1941 	return hint >= effective_prio(rq);
1942 }
1943 
1944 static bool
1945 timeslice_yield(const struct intel_engine_execlists *el,
1946 		const struct i915_request *rq)
1947 {
1948 	/*
1949 	 * Once bitten, forever smitten!
1950 	 *
1951 	 * If the active context ever busy-waited on a semaphore,
1952 	 * it will be treated as a hog until the end of its timeslice (i.e.
1953 	 * until it is scheduled out and replaced by a new submission,
1954 	 * possibly even its own lite-restore). The HW only sends an interrupt
1955 	 * on the first miss, and we do know if that semaphore has been
1956 	 * on the first miss, and we do not know if that semaphore has been
1957 	 * safe, yield if it might be stuck -- it will be given a fresh
1958 	 * timeslice in the near future.
1959 	 */
1960 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1961 }
1962 
1963 static bool
1964 timeslice_expired(const struct intel_engine_execlists *el,
1965 		  const struct i915_request *rq)
1966 {
1967 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1968 }
1969 
1970 static int
1971 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1972 {
1973 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1974 		return engine->execlists.queue_priority_hint;
1975 
1976 	return rq_prio(list_next_entry(rq, sched.link));
1977 }
1978 
1979 static inline unsigned long
1980 timeslice(const struct intel_engine_cs *engine)
1981 {
1982 	return READ_ONCE(engine->props.timeslice_duration_ms);
1983 }
1984 
1985 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1986 {
1987 	const struct intel_engine_execlists *execlists = &engine->execlists;
1988 	const struct i915_request *rq = *execlists->active;
1989 
1990 	if (!rq || i915_request_completed(rq))
1991 		return 0;
1992 
1993 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1994 		return 0;
1995 
1996 	return timeslice(engine);
1997 }
1998 
1999 static void set_timeslice(struct intel_engine_cs *engine)
2000 {
2001 	unsigned long duration;
2002 
2003 	if (!intel_engine_has_timeslices(engine))
2004 		return;
2005 
2006 	duration = active_timeslice(engine);
2007 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2008 
2009 	set_timer_ms(&engine->execlists.timer, duration);
2010 }
2011 
2012 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2013 {
2014 	struct intel_engine_execlists *execlists = &engine->execlists;
2015 	unsigned long duration;
2016 
2017 	if (!intel_engine_has_timeslices(engine))
2018 		return;
2019 
2020 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2021 	if (prio == INT_MIN)
2022 		return;
2023 
2024 	if (timer_pending(&execlists->timer))
2025 		return;
2026 
2027 	duration = timeslice(engine);
2028 	ENGINE_TRACE(engine,
2029 		     "start timeslicing, prio:%d, interval:%lu",
2030 		     prio, duration);
2031 
2032 	set_timer_ms(&execlists->timer, duration);
2033 }
2034 
2035 static void record_preemption(struct intel_engine_execlists *execlists)
2036 {
2037 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2038 }
2039 
2040 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2041 					    const struct i915_request *rq)
2042 {
2043 	if (!rq)
2044 		return 0;
2045 
2046 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2047 	if (unlikely(intel_context_is_banned(rq->context)))
2048 		return 1;
2049 
2050 	return READ_ONCE(engine->props.preempt_timeout_ms);
2051 }
2052 
2053 static void set_preempt_timeout(struct intel_engine_cs *engine,
2054 				const struct i915_request *rq)
2055 {
2056 	if (!intel_engine_has_preempt_reset(engine))
2057 		return;
2058 
2059 	set_timer_ms(&engine->execlists.preempt,
2060 		     active_preempt_timeout(engine, rq));
2061 }
2062 
2063 static inline void clear_ports(struct i915_request **ports, int count)
2064 {
2065 	memset_p((void **)ports, NULL, count);
2066 }
2067 
2068 static inline void
2069 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2070 {
2071 	/* A memcpy_p() would be very useful here! */
2072 	while (count--)
2073 		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2074 }
2075 
2076 static void execlists_dequeue(struct intel_engine_cs *engine)
2077 {
2078 	struct intel_engine_execlists * const execlists = &engine->execlists;
2079 	struct i915_request **port = execlists->pending;
2080 	struct i915_request ** const last_port = port + execlists->port_mask;
2081 	struct i915_request * const *active;
2082 	struct i915_request *last;
2083 	struct rb_node *rb;
2084 	bool submit = false;
2085 
2086 	/*
2087 	 * Hardware submission is through 2 ports. Conceptually each port
2088 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2089 	 * static for a context, and unique to each, so we only execute
2090 	 * requests belonging to a single context from each ring. RING_HEAD
2091 	 * is maintained by the CS in the context image, it marks the place
2092 	 * where it got up to last time, and through RING_TAIL we tell the CS
2093 	 * where we want to execute up to this time.
2094 	 *
2095 	 * In this list the requests are in order of execution. Consecutive
2096 	 * requests from the same context are adjacent in the ringbuffer. We
2097 	 * can combine these requests into a single RING_TAIL update:
2098 	 *
2099 	 *              RING_HEAD...req1...req2
2100 	 *                                    ^- RING_TAIL
2101 	 * since to execute req2 the CS must first execute req1.
2102 	 *
2103 	 * Our goal, then, is to point each port at the end of a consecutive
2104 	 * sequence of requests, as that is the optimal (fewest wakeups
2105 	 * and context switches) submission.
2106 	 */
2107 
2108 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2109 		struct virtual_engine *ve =
2110 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2111 		struct i915_request *rq = READ_ONCE(ve->request);
2112 
2113 		if (!rq) { /* lazily cleanup after another engine handled rq */
2114 			rb_erase_cached(rb, &execlists->virtual);
2115 			RB_CLEAR_NODE(rb);
2116 			rb = rb_first_cached(&execlists->virtual);
2117 			continue;
2118 		}
2119 
2120 		if (!virtual_matches(ve, rq, engine)) {
2121 			rb = rb_next(rb);
2122 			continue;
2123 		}
2124 
2125 		break;
2126 	}
2127 
2128 	/*
2129 	 * If the queue is higher priority than the last
2130 	 * request in the currently active context, submit afresh.
2131 	 * We will resubmit again afterwards in case we need to split
2132 	 * the active context to interject the preemption request,
2133 	 * i.e. we will retrigger preemption following the ack in case
2134 	 * of trouble.
2135 	 */
2136 	active = READ_ONCE(execlists->active);
2137 
2138 	/*
2139 	 * In theory we can skip over completed contexts that have not
2140 	 * yet been processed by events (as those events are in flight):
2141 	 *
2142 	 * while ((last = *active) && i915_request_completed(last))
2143 	 *	active++;
2144 	 *
2145 	 * However, the GPU cannot handle this as it will ultimately
2146 	 * find itself trying to jump back into a context it has just
2147 	 * completed and barf.
2148 	 */
2149 
2150 	if ((last = *active)) {
2151 		if (need_preempt(engine, last, rb)) {
2152 			if (i915_request_completed(last)) {
2153 				tasklet_hi_schedule(&execlists->tasklet);
2154 				return;
2155 			}
2156 
2157 			ENGINE_TRACE(engine,
2158 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2159 				     last->fence.context,
2160 				     last->fence.seqno,
2161 				     last->sched.attr.priority,
2162 				     execlists->queue_priority_hint);
2163 			record_preemption(execlists);
2164 
2165 			/*
2166 			 * Don't let the RING_HEAD advance past the breadcrumb
2167 			 * as we unwind (and until we resubmit) so that we do
2168 			 * not accidentally tell it to go backwards.
2169 			 */
2170 			ring_set_paused(engine, 1);
2171 
2172 			/*
2173 			 * Note that we have not stopped the GPU at this point,
2174 			 * so we are unwinding the incomplete requests as they
2175 			 * remain inflight and so by the time we do complete
2176 			 * the preemption, some of the unwound requests may
2177 			 * complete!
2178 			 */
2179 			__unwind_incomplete_requests(engine);
2180 
2181 			last = NULL;
2182 		} else if (need_timeslice(engine, last, rb) &&
2183 			   timeslice_expired(execlists, last)) {
2184 			if (i915_request_completed(last)) {
2185 				tasklet_hi_schedule(&execlists->tasklet);
2186 				return;
2187 			}
2188 
2189 			ENGINE_TRACE(engine,
2190 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2191 				     last->fence.context,
2192 				     last->fence.seqno,
2193 				     last->sched.attr.priority,
2194 				     execlists->queue_priority_hint,
2195 				     yesno(timeslice_yield(execlists, last)));
2196 
2197 			ring_set_paused(engine, 1);
2198 			defer_active(engine);
2199 
2200 			/*
2201 			 * Unlike for preemption, if we rewind and continue
2202 			 * executing the same context as previously active,
2203 			 * the order of execution will remain the same and
2204 			 * the tail will only advance. We do not need to
2205 			 * force a full context restore, as a lite-restore
2206 			 * is sufficient to resample the monotonic TAIL.
2207 			 *
2208 			 * If we switch to any other context, similarly we
2209 			 * will not rewind TAIL of current context, and
2210 			 * normal save/restore will preserve state and allow
2211 			 * us to later continue executing the same request.
2212 			 */
2213 			last = NULL;
2214 		} else {
2215 			/*
2216 			 * Otherwise if we already have a request pending
2217 			 * for execution after the current one, we can
2218 			 * just wait until the next CS event before
2219 			 * queuing more. In either case we will force a
2220 			 * lite-restore preemption event, but if we wait
2221 			 * we hopefully coalesce several updates into a single
2222 			 * submission.
2223 			 */
2224 			if (!list_is_last(&last->sched.link,
2225 					  &engine->active.requests)) {
2226 				/*
2227 				 * Even if ELSP[1] is occupied and not worthy
2228 				 * of timeslices, our queue might be.
2229 				 */
2230 				start_timeslice(engine, queue_prio(execlists));
2231 				return;
2232 			}
2233 		}
2234 	}
2235 
2236 	while (rb) { /* XXX virtual is always taking precedence */
2237 		struct virtual_engine *ve =
2238 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2239 		struct i915_request *rq;
2240 
2241 		spin_lock(&ve->base.active.lock);
2242 
2243 		rq = ve->request;
2244 		if (unlikely(!rq)) { /* lost the race to a sibling */
2245 			spin_unlock(&ve->base.active.lock);
2246 			rb_erase_cached(rb, &execlists->virtual);
2247 			RB_CLEAR_NODE(rb);
2248 			rb = rb_first_cached(&execlists->virtual);
2249 			continue;
2250 		}
2251 
2252 		GEM_BUG_ON(rq != ve->request);
2253 		GEM_BUG_ON(rq->engine != &ve->base);
2254 		GEM_BUG_ON(rq->context != &ve->context);
2255 
2256 		if (rq_prio(rq) >= queue_prio(execlists)) {
2257 			if (!virtual_matches(ve, rq, engine)) {
2258 				spin_unlock(&ve->base.active.lock);
2259 				rb = rb_next(rb);
2260 				continue;
2261 			}
2262 
2263 			if (last && !can_merge_rq(last, rq)) {
2264 				spin_unlock(&ve->base.active.lock);
2265 				start_timeslice(engine, rq_prio(rq));
2266 				return; /* leave this for another sibling */
2267 			}
2268 
2269 			ENGINE_TRACE(engine,
2270 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2271 				     rq->fence.context,
2272 				     rq->fence.seqno,
2273 				     i915_request_completed(rq) ? "!" :
2274 				     i915_request_started(rq) ? "*" :
2275 				     "",
2276 				     yesno(engine != ve->siblings[0]));
2277 
2278 			WRITE_ONCE(ve->request, NULL);
2279 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2280 				   INT_MIN);
2281 			rb_erase_cached(rb, &execlists->virtual);
2282 			RB_CLEAR_NODE(rb);
2283 
2284 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2285 			WRITE_ONCE(rq->engine, engine);
2286 
2287 			if (__i915_request_submit(rq)) {
2288 				/*
2289 				 * Only after we confirm that we will submit
2290 				 * this request (i.e. it has not already
2291 				 * completed), do we want to update the context.
2292 				 *
2293 				 * This serves two purposes. It avoids
2294 				 * unnecessary work if we are resubmitting an
2295 				 * already completed request after timeslicing.
2296 				 * But more importantly, it prevents us altering
2297 				 * ve->siblings[] on an idle context, where
2298 				 * we may be using ve->siblings[] in
2299 				 * virtual_context_enter / virtual_context_exit.
2300 				 */
2301 				virtual_xfer_context(ve, engine);
2302 				GEM_BUG_ON(ve->siblings[0] != engine);
2303 
2304 				submit = true;
2305 				last = rq;
2306 			}
2307 			i915_request_put(rq);
2308 
2309 			/*
2310 			 * Hmm, we have a bunch of virtual engine requests,
2311 			 * but the first one was already completed (thanks
2312 			 * preempt-to-busy!). Keep looking at the veng queue
2313 			 * until we have no more relevant requests (i.e.
2314 			 * the normal submit queue has higher priority).
2315 			 */
2316 			if (!submit) {
2317 				spin_unlock(&ve->base.active.lock);
2318 				rb = rb_first_cached(&execlists->virtual);
2319 				continue;
2320 			}
2321 		}
2322 
2323 		spin_unlock(&ve->base.active.lock);
2324 		break;
2325 	}
2326 
2327 	while ((rb = rb_first_cached(&execlists->queue))) {
2328 		struct i915_priolist *p = to_priolist(rb);
2329 		struct i915_request *rq, *rn;
2330 		int i;
2331 
2332 		priolist_for_each_request_consume(rq, rn, p, i) {
2333 			bool merge = true;
2334 
2335 			/*
2336 			 * Can we combine this request with the current port?
2337 			 * It has to be the same context/ringbuffer and not
2338 			 * have any exceptions (e.g. GVT saying never to
2339 			 * combine contexts).
2340 			 *
2341 			 * If we can combine the requests, we can execute both
2342 			 * by updating the RING_TAIL to point to the end of the
2343 			 * second request, and so we never need to tell the
2344 			 * hardware about the first.
2345 			 */
2346 			if (last && !can_merge_rq(last, rq)) {
2347 				/*
2348 				 * If we are on the second port and cannot
2349 				 * combine this request with the last, then we
2350 				 * are done.
2351 				 */
2352 				if (port == last_port)
2353 					goto done;
2354 
2355 				/*
2356 				 * We must not populate both ELSP[] with the
2357 				 * same LRCA, i.e. we must submit 2 different
2358 				 * contexts if we submit 2 ELSP.
2359 				 */
2360 				if (last->context == rq->context)
2361 					goto done;
2362 
2363 				if (i915_request_has_sentinel(last))
2364 					goto done;
2365 
2366 				/*
2367 				 * If GVT overrides us we only ever submit
2368 				 * port[0], leaving port[1] empty. Note that we
2369 				 * also have to be careful that we don't queue
2370 				 * the same context (even though a different
2371 				 * request) to the second port.
2372 				 */
2373 				if (ctx_single_port_submission(last->context) ||
2374 				    ctx_single_port_submission(rq->context))
2375 					goto done;
2376 
2377 				merge = false;
2378 			}
2379 
2380 			if (__i915_request_submit(rq)) {
2381 				if (!merge) {
2382 					*port = execlists_schedule_in(last, port - execlists->pending);
2383 					port++;
2384 					last = NULL;
2385 				}
2386 
2387 				GEM_BUG_ON(last &&
2388 					   !can_merge_ctx(last->context,
2389 							  rq->context));
2390 				GEM_BUG_ON(last &&
2391 					   i915_seqno_passed(last->fence.seqno,
2392 							     rq->fence.seqno));
2393 
2394 				submit = true;
2395 				last = rq;
2396 			}
2397 		}
2398 
2399 		rb_erase_cached(&p->node, &execlists->queue);
2400 		i915_priolist_free(p);
2401 	}
2402 
2403 done:
2404 	/*
2405 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2406 	 *
2407 	 * We choose the priority hint such that if we add a request of greater
2408 	 * priority than this, we kick the submission tasklet to decide on
2409 	 * the right order of submitting the requests to hardware. We must
2410 	 * also be prepared to reorder requests as they are in-flight on the
2411 	 * HW. We derive the priority hint then as the first "hole" in
2412 	 * the HW submission ports and if there are no available slots,
2413 	 * the priority of the lowest executing request, i.e. last.
2414 	 *
2415 	 * When we do receive a higher priority request ready to run from the
2416 	 * user, see queue_request(), the priority hint is bumped to that
2417 	 * request triggering preemption on the next dequeue (or subsequent
2418 	 * interrupt for secondary ports).
2419 	 */
2420 	execlists->queue_priority_hint = queue_prio(execlists);
2421 
2422 	if (submit) {
2423 		*port = execlists_schedule_in(last, port - execlists->pending);
2424 		execlists->switch_priority_hint =
2425 			switch_prio(engine, *execlists->pending);
2426 
2427 		/*
2428 		 * Skip if we ended up with exactly the same set of requests,
2429 		 * e.g. trying to timeslice a pair of ordered contexts
2430 		 */
2431 		if (!memcmp(active, execlists->pending,
2432 			    (port - execlists->pending + 1) * sizeof(*port))) {
2433 			do
2434 				execlists_schedule_out(fetch_and_zero(port));
2435 			while (port-- != execlists->pending);
2436 
2437 			goto skip_submit;
2438 		}
2439 		clear_ports(port + 1, last_port - port);
2440 
2441 		WRITE_ONCE(execlists->yield, -1);
2442 		set_preempt_timeout(engine, *active);
2443 		execlists_submit_ports(engine);
2444 	} else {
2445 		start_timeslice(engine, execlists->queue_priority_hint);
2446 skip_submit:
2447 		ring_set_paused(engine, 0);
2448 	}
2449 }
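
/*
 * Illustrative sketch, compiled out: the port-filling policy of
 * execlists_dequeue() in miniature. Walk an ordered array of requests,
 * coalesce neighbours that share a context into one port (only RING_TAIL
 * advances for a merge), and stop once every port has been claimed. The
 * helper name and the flat-array input are hypothetical simplifications
 * of the priolist walk above.
 */
#if 0
static int sketch_fill_ports(struct i915_request * const *in, int count,
			     struct i915_request **ports, int nr_ports)
{
	struct i915_request *last = NULL;
	int used = 0, i;

	for (i = 0; i < count; i++) {
		struct i915_request *rq = in[i];

		if (last && last->context != rq->context) {
			if (used == nr_ports)
				break;		/* no port left for a new context */
			ports[used++] = last;	/* close the previous run */
		}
		last = rq;			/* same context: just extend the run */
	}
	if (last && used < nr_ports)
		ports[used++] = last;		/* trailing run takes the final port */

	return used;
}
#endif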
2450 
2451 static void
2452 cancel_port_requests(struct intel_engine_execlists * const execlists)
2453 {
2454 	struct i915_request * const *port;
2455 
2456 	for (port = execlists->pending; *port; port++)
2457 		execlists_schedule_out(*port);
2458 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2459 
2460 	/* Mark the end of active before we overwrite *active */
2461 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2462 		execlists_schedule_out(*port);
2463 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2464 
2465 	smp_wmb(); /* complete the seqlock for execlists_active() */
2466 	WRITE_ONCE(execlists->active, execlists->inflight);
2467 }
2468 
2469 static inline void
2470 invalidate_csb_entries(const u64 *first, const u64 *last)
2471 {
2472 	clflush((void *)first);
2473 	clflush((void *)last);
2474 }
2475 
2476 /*
2477  * Starting with Gen12, the status has a new format:
2478  *
2479  *     bit  0:     switched to new queue
2480  *     bit  1:     reserved
2481  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2482  *                 switch detail is set to "wait on semaphore"
2483  *     bits 3-5:   engine class
2484  *     bits 6-11:  engine instance
2485  *     bits 12-14: reserved
2486  *     bits 15-25: sw context id of the lrc the GT switched to
2487  *     bits 26-31: sw counter of the lrc the GT switched to
2488  *     bits 32-35: context switch detail
2489  *                  - 0: ctx complete
2490  *                  - 1: wait on sync flip
2491  *                  - 2: wait on vblank
2492  *                  - 3: wait on scanline
2493  *                  - 4: wait on semaphore
2494  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2495  *                       WAIT_FOR_EVENT)
2496  *     bit  36:    reserved
2497  *     bits 37-43: wait detail (for switch detail 1 to 4)
2498  *     bits 44-46: reserved
2499  *     bits 47-57: sw context id of the lrc the GT switched away from
2500  *     bits 58-63: sw counter of the lrc the GT switched away from
2501  */
2502 static inline bool gen12_csb_parse(const u64 csb)
2503 {
2504 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
2505 	bool new_queue =
2506 		lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2507 
2508 	/*
2509 	 * The context switch detail is not guaranteed to be 5 when a preemption
2510 	 * occurs, so we can't just check for that. The check below works for
2511 	 * all the cases we care about, including preemptions of WAIT
2512 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2513 	 * would require some extra handling, but we don't support that.
2514 	 */
2515 	if (!ctx_away_valid || new_queue) {
2516 		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
2517 		return true;
2518 	}
2519 
2520 	/*
2521 	 * switch detail = 5 is covered by the case above and we do not expect a
2522 	 * context switch on an unsuccessful wait instruction since we always
2523 	 * use polling mode.
2524 	 */
2525 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
2526 	return false;
2527 }
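
/*
 * Illustrative sketch, compiled out: pulling the fields documented in the
 * layout comment above out of a Gen12 CSB entry with plain shifts and
 * masks. The struct and helper are hypothetical; the driver itself only
 * relies on the GEN12_CSB_CTX_VALID/GEN12_CTX_* macros used in
 * gen12_csb_parse().
 */
#if 0
struct gen12_csb_fields {
	bool new_queue;		/* bit  0: switched to new queue */
	u32 to_ctx_id;		/* bits 15-25: sw context id switched to */
	u32 to_counter;		/* bits 26-31: sw counter switched to */
	u32 switch_detail;	/* bits 32-35: reason for the switch */
	u32 from_ctx_id;	/* bits 47-57: sw context id switched from */
};

static struct gen12_csb_fields gen12_csb_decode(u64 csb)
{
	return (struct gen12_csb_fields) {
		.new_queue	= csb & BIT(0),
		.to_ctx_id	= (lower_32_bits(csb) >> 15) & GENMASK(10, 0),
		.to_counter	= (lower_32_bits(csb) >> 26) & GENMASK(5, 0),
		.switch_detail	= upper_32_bits(csb) & GENMASK(3, 0),
		.from_ctx_id	= (upper_32_bits(csb) >> 15) & GENMASK(10, 0),
	};
}
#endif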
2528 
2529 static inline bool gen8_csb_parse(const u64 csb)
2530 {
2531 	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2532 }
2533 
2534 static noinline u64
2535 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2536 {
2537 	u64 entry;
2538 
2539 	/*
2540 	 * Reading from the HWSP has one particular advantage: we can detect
2541 	 * a stale entry. Since the write into HWSP is broken, we have no reason
2542 	 * to trust the HW at all; the mmio entry may equally be unordered. So
2543 	 * we prefer the path that is self-checking and, as a last resort,
2544 	 * return the mmio value.
2545 	 *
2546 	 * tgl,dg1:HSDES#22011327657
2547 	 */
2548 	preempt_disable();
2549 	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) {
2550 		int idx = csb - engine->execlists.csb_status;
2551 		int status;
2552 
2553 		status = GEN8_EXECLISTS_STATUS_BUF;
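		/*
		 * The status buffer registers are split across two ranges:
		 * the legacy buffer holds entries 0-5, while Gen11 onwards
		 * adds a second buffer for entries 6 and above.
		 */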
2554 		if (idx >= 6) {
2555 			status = GEN11_EXECLISTS_STATUS_BUF2;
2556 			idx -= 6;
2557 		}
2558 		status += sizeof(u64) * idx;
2559 
2560 		entry = intel_uncore_read64(engine->uncore,
2561 					    _MMIO(engine->mmio_base + status));
2562 	}
2563 	preempt_enable();
2564 
2565 	return entry;
2566 }
2567 
2568 static inline u64
2569 csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2570 {
2571 	u64 entry = READ_ONCE(*csb);
2572 
2573 	/*
2574 	 * Unfortunately, the GPU does not always serialise its write
2575 	 * of the CSB entries before its write of the CSB pointer, at least
2576 	 * from the perspective of the CPU, using what is known as a Global
2577 	 * Observation Point. We may read a new CSB tail pointer, but then
2578 	 * read the stale CSB entries, causing us to misinterpret the
2579 	 * context-switch events, and eventually declare the GPU hung.
2580 	 *
2581 	 * icl:HSDES#1806554093
2582 	 * tgl:HSDES#22011248461
2583 	 */
2584 	if (unlikely(entry == -1))
2585 		entry = wa_csb_read(engine, csb);
2586 
2587 	/* Consume this entry so that we can spot its future reuse. */
2588 	WRITE_ONCE(*csb, -1);
2589 
2590 	/* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
2591 	return entry;
2592 }
2593 
2594 static void process_csb(struct intel_engine_cs *engine)
2595 {
2596 	struct intel_engine_execlists * const execlists = &engine->execlists;
2597 	u64 * const buf = execlists->csb_status;
2598 	const u8 num_entries = execlists->csb_size;
2599 	u8 head, tail;
2600 
2601 	/*
2602 	 * As we modify our execlists state tracking we require exclusive
2603 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2604 	 * and we assume that is only inside the reset paths and so serialised.
2605 	 */
2606 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2607 		   !reset_in_progress(execlists));
2608 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2609 
2610 	/*
2611 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2612 	 * When reading from the csb_write mmio register, we have to be
2613 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2614 	 * the low 4 bits. As it happens we know the next 4 bits are always
2615 	 * zero and so we can simply mask off the low u8 of the register
2616 	 * and treat it identically to reading from the HWSP (without having
2617 	 * to use explicit shifting and masking, and probably bifurcating
2618 	 * the code to handle the legacy mmio read).
2619 	 */
2620 	head = execlists->csb_head;
2621 	tail = READ_ONCE(*execlists->csb_write);
2622 	if (unlikely(head == tail))
2623 		return;
2624 
2625 	/*
2626 	 * We will consume all events from HW, or at least pretend to.
2627 	 *
2628 	 * The sequence of events from the HW is deterministic, and derived
2629 	 * from our writes to the ELSP, with a smidgen of variability for
2630 	 * the arrival of the asynchronous requests wrt to the inflight
2631 	 * execution. If the HW sends an event that does not correspond with
2632 	 * the one we are expecting, we have to abandon all hope as we lose
2633 	 * all tracking of what the engine is actually executing. We will
2634 	 * only detect we are out of sequence with the HW when we get an
2635 	 * 'impossible' event because we have already drained our own
2636 	 * preemption/promotion queue. If this occurs, we know that we likely
2637 	 * lost track of execution earlier and must unwind and restart, the
2638 	 * lost track of execution earlier and must unwind and restart; the
2639 	 * simplest way is to stop processing the event queue and force the
2640 	 * engine to reset.
2641 	execlists->csb_head = tail;
2642 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2643 
2644 	/*
2645 	 * Hopefully paired with a wmb() in HW!
2646 	 *
2647 	 * We must complete the read of the write pointer before any reads
2648 	 * from the CSB, so that we do not see stale values. Without an rmb
2649 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2650 	 * we perform the READ_ONCE(*csb_write).
2651 	 */
2652 	rmb();
2653 	do {
2654 		bool promote;
2655 		u64 csb;
2656 
2657 		if (++head == num_entries)
2658 			head = 0;
2659 
2660 		/*
2661 		 * We are flying near dragons again.
2662 		 *
2663 		 * We hold a reference to the request in execlist_port[]
2664 		 * but no more than that. We are operating in softirq
2665 		 * context and so cannot hold any mutex or sleep. That
2666 		 * prevents us from stopping the requests we are processing
2667 		 * in port[] from being retired simultaneously (the
2668 		 * breadcrumb will be complete before we see the
2669 		 * context-switch). As we only hold the reference to the
2670 		 * request, any pointer chasing underneath the request
2671 		 * is subject to a potential use-after-free. Thus we
2672 		 * store all of the bookkeeping within port[] as
2673 		 * required, and avoid using unguarded pointers beneath
2674 		 * request itself. The same applies to the atomic
2675 		 * status notifier.
2676 		 */
2677 
2678 		csb = csb_read(engine, buf + head);
2679 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2680 			     head, upper_32_bits(csb), lower_32_bits(csb));
2681 
2682 		if (INTEL_GEN(engine->i915) >= 12)
2683 			promote = gen12_csb_parse(csb);
2684 		else
2685 			promote = gen8_csb_parse(csb);
2686 		if (promote) {
2687 			struct i915_request * const *old = execlists->active;
2688 
2689 			if (GEM_WARN_ON(!*execlists->pending)) {
2690 				execlists->error_interrupt |= ERROR_CSB;
2691 				break;
2692 			}
2693 
2694 			ring_set_paused(engine, 0);
2695 
2696 			/* Point active to the new ELSP; prevent overwriting */
2697 			WRITE_ONCE(execlists->active, execlists->pending);
2698 			smp_wmb(); /* notify execlists_active() */
2699 
2700 			/* cancel old inflight, prepare for switch */
2701 			trace_ports(execlists, "preempted", old);
2702 			while (*old)
2703 				execlists_schedule_out(*old++);
2704 
2705 			/* switch pending to inflight */
2706 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2707 			copy_ports(execlists->inflight,
2708 				   execlists->pending,
2709 				   execlists_num_ports(execlists));
2710 			smp_wmb(); /* complete the seqlock */
2711 			WRITE_ONCE(execlists->active, execlists->inflight);
2712 
2713 			/* XXX Magic delay for tgl */
2714 			ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2715 
2716 			WRITE_ONCE(execlists->pending[0], NULL);
2717 		} else {
2718 			if (GEM_WARN_ON(!*execlists->active)) {
2719 				execlists->error_interrupt |= ERROR_CSB;
2720 				break;
2721 			}
2722 
2723 			/* port0 completed, advanced to port1 */
2724 			trace_ports(execlists, "completed", execlists->active);
2725 
2726 			/*
2727 			 * We rely on the hardware being strongly
2728 			 * ordered, that the breadcrumb write is
2729 			 * coherent (visible from the CPU) before the
2730 			 * user interrupt is processed. One might assume
2731 			 * that, as the breadcrumb write precedes both the
2732 			 * user interrupt and the CS event for the context
2733 			 * switch, it would therefore also be visible before
2734 			 * the CS event itself...
2735 			 */
2736 			if (GEM_SHOW_DEBUG() &&
2737 			    !i915_request_completed(*execlists->active)) {
2738 				struct i915_request *rq = *execlists->active;
2739 				const u32 *regs __maybe_unused =
2740 					rq->context->lrc_reg_state;
2741 
2742 				ENGINE_TRACE(engine,
2743 					     "context completed before request!\n");
2744 				ENGINE_TRACE(engine,
2745 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2746 					     ENGINE_READ(engine, RING_START),
2747 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2748 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2749 					     ENGINE_READ(engine, RING_CTL),
2750 					     ENGINE_READ(engine, RING_MI_MODE));
2751 				ENGINE_TRACE(engine,
2752 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2753 					     i915_ggtt_offset(rq->ring->vma),
2754 					     rq->head, rq->tail,
2755 					     rq->fence.context,
2756 					     lower_32_bits(rq->fence.seqno),
2757 					     hwsp_seqno(rq));
2758 				ENGINE_TRACE(engine,
2759 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2760 					     regs[CTX_RING_START],
2761 					     regs[CTX_RING_HEAD],
2762 					     regs[CTX_RING_TAIL]);
2763 			}
2764 
2765 			execlists_schedule_out(*execlists->active++);
2766 
2767 			GEM_BUG_ON(execlists->active - execlists->inflight >
2768 				   execlists_num_ports(execlists));
2769 		}
2770 	} while (head != tail);
2771 
2772 	set_timeslice(engine);
2773 
2774 	/*
2775 	 * Gen11 has proven to fail wrt global observation point between
2776 	 * entry and tail update, failing on the ordering and thus
2777 	 * we see an old entry in the context status buffer.
2778 	 *
2779 	 * Forcibly evict the entries ahead of the next gpu csb update,
2780 	 * to increase the odds that we get fresh entries even with
2781 	 * non-working hardware. The cost of doing so comes out mostly in
2782 	 * the wash, as the hardware, working or not, will need to do the
2783 	 * invalidation beforehand anyway.
2784 	 */
2785 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2786 }
2787 
2788 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2789 {
2790 	lockdep_assert_held(&engine->active.lock);
2791 	if (!READ_ONCE(engine->execlists.pending[0])) {
2792 		rcu_read_lock(); /* protect peeking at execlists->active */
2793 		execlists_dequeue(engine);
2794 		rcu_read_unlock();
2795 	}
2796 }
2797 
2798 static void __execlists_hold(struct i915_request *rq)
2799 {
2800 	LIST_HEAD(list);
2801 
2802 	do {
2803 		struct i915_dependency *p;
2804 
2805 		if (i915_request_is_active(rq))
2806 			__i915_request_unsubmit(rq);
2807 
2808 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2809 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2810 		i915_request_set_hold(rq);
2811 		RQ_TRACE(rq, "on hold\n");
2812 
2813 		for_each_waiter(p, rq) {
2814 			struct i915_request *w =
2815 				container_of(p->waiter, typeof(*w), sched);
2816 
2817 			/* Leave semaphores spinning on the other engines */
2818 			if (w->engine != rq->engine)
2819 				continue;
2820 
2821 			if (!i915_request_is_ready(w))
2822 				continue;
2823 
2824 			if (i915_request_completed(w))
2825 				continue;
2826 
2827 			if (i915_request_on_hold(w))
2828 				continue;
2829 
2830 			list_move_tail(&w->sched.link, &list);
2831 		}
2832 
2833 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2834 	} while (rq);
2835 }
2836 
2837 static bool execlists_hold(struct intel_engine_cs *engine,
2838 			   struct i915_request *rq)
2839 {
2840 	spin_lock_irq(&engine->active.lock);
2841 
2842 	if (i915_request_completed(rq)) { /* too late! */
2843 		rq = NULL;
2844 		goto unlock;
2845 	}
2846 
2847 	if (rq->engine != engine) { /* preempted virtual engine */
2848 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2849 
2850 		/*
2851 		 * intel_context_inflight() is only protected by virtue
2852 		 * of process_csb() being called only by the tasklet (or
2853 		 * directly from inside reset while the tasklet is suspended).
2854 		 * Assert that neither of those are allowed to run while we
2855 		 * poke at the request queues.
2856 		 */
2857 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2858 
2859 		/*
2860 		 * An unsubmitted request along a virtual engine will
2861 		 * remain on the active (this) engine until we are able
2862 		 * to process the context switch away (and so mark the
2863 		 * context as no longer in flight). That cannot have happened
2864 		 * yet, otherwise we would not be hanging!
2865 		 */
2866 		spin_lock(&ve->base.active.lock);
2867 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2868 		GEM_BUG_ON(ve->request != rq);
2869 		ve->request = NULL;
2870 		spin_unlock(&ve->base.active.lock);
2871 		i915_request_put(rq);
2872 
2873 		rq->engine = engine;
2874 	}
2875 
2876 	/*
2877 	 * Transfer this request onto the hold queue to prevent it
2878 	 * being resubmitted to HW (and potentially completed) before we have
2879 	 * released it. Since we may have already submitted following
2880 	 * requests, we need to remove those as well.
2881 	 */
2882 	GEM_BUG_ON(i915_request_on_hold(rq));
2883 	GEM_BUG_ON(rq->engine != engine);
2884 	__execlists_hold(rq);
2885 	GEM_BUG_ON(list_empty(&engine->active.hold));
2886 
2887 unlock:
2888 	spin_unlock_irq(&engine->active.lock);
2889 	return rq;
2890 }
2891 
2892 static bool hold_request(const struct i915_request *rq)
2893 {
2894 	struct i915_dependency *p;
2895 	bool result = false;
2896 
2897 	/*
2898 	 * If one of our ancestors is on hold, we must also be on hold,
2899 	 * otherwise we will bypass it and execute before it.
2900 	 */
2901 	rcu_read_lock();
2902 	for_each_signaler(p, rq) {
2903 		const struct i915_request *s =
2904 			container_of(p->signaler, typeof(*s), sched);
2905 
2906 		if (s->engine != rq->engine)
2907 			continue;
2908 
2909 		result = i915_request_on_hold(s);
2910 		if (result)
2911 			break;
2912 	}
2913 	rcu_read_unlock();
2914 
2915 	return result;
2916 }
2917 
2918 static void __execlists_unhold(struct i915_request *rq)
2919 {
2920 	LIST_HEAD(list);
2921 
2922 	do {
2923 		struct i915_dependency *p;
2924 
2925 		RQ_TRACE(rq, "hold release\n");
2926 
2927 		GEM_BUG_ON(!i915_request_on_hold(rq));
2928 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2929 
2930 		i915_request_clear_hold(rq);
2931 		list_move_tail(&rq->sched.link,
2932 			       i915_sched_lookup_priolist(rq->engine,
2933 							  rq_prio(rq)));
2934 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2935 
2936 		/* Also release any children on this engine that are ready */
2937 		for_each_waiter(p, rq) {
2938 			struct i915_request *w =
2939 				container_of(p->waiter, typeof(*w), sched);
2940 
2941 			/* Propagate any change in error status */
2942 			if (rq->fence.error)
2943 				i915_request_set_error_once(w, rq->fence.error);
2944 
2945 			if (w->engine != rq->engine)
2946 				continue;
2947 
2948 			if (!i915_request_on_hold(w))
2949 				continue;
2950 
2951 			/* Check that no other parents are also on hold */
2952 			if (hold_request(w))
2953 				continue;
2954 
2955 			list_move_tail(&w->sched.link, &list);
2956 		}
2957 
2958 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2959 	} while (rq);
2960 }
2961 
2962 static void execlists_unhold(struct intel_engine_cs *engine,
2963 			     struct i915_request *rq)
2964 {
2965 	spin_lock_irq(&engine->active.lock);
2966 
2967 	/*
2968 	 * Move this request back to the priority queue, and all of its
2969 	 * children and grandchildren that were suspended along with it.
2970 	 */
2971 	__execlists_unhold(rq);
2972 
2973 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2974 		engine->execlists.queue_priority_hint = rq_prio(rq);
2975 		tasklet_hi_schedule(&engine->execlists.tasklet);
2976 	}
2977 
2978 	spin_unlock_irq(&engine->active.lock);
2979 }
2980 
2981 struct execlists_capture {
2982 	struct work_struct work;
2983 	struct i915_request *rq;
2984 	struct i915_gpu_coredump *error;
2985 };
2986 
2987 static void execlists_capture_work(struct work_struct *work)
2988 {
2989 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2990 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2991 	struct intel_engine_cs *engine = cap->rq->engine;
2992 	struct intel_gt_coredump *gt = cap->error->gt;
2993 	struct intel_engine_capture_vma *vma;
2994 
2995 	/* Compress all the objects attached to the request, slow! */
2996 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2997 	if (vma) {
2998 		struct i915_vma_compress *compress =
2999 			i915_vma_capture_prepare(gt);
3000 
3001 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
3002 		i915_vma_capture_finish(gt, compress);
3003 	}
3004 
3005 	gt->simulated = gt->engine->simulated;
3006 	cap->error->simulated = gt->simulated;
3007 
3008 	/* Publish the error state, and announce it to the world */
3009 	i915_error_state_store(cap->error);
3010 	i915_gpu_coredump_put(cap->error);
3011 
3012 	/* Return this request and all that depend upon it for signaling */
3013 	execlists_unhold(engine, cap->rq);
3014 	i915_request_put(cap->rq);
3015 
3016 	kfree(cap);
3017 }
3018 
3019 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
3020 {
3021 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
3022 	struct execlists_capture *cap;
3023 
3024 	cap = kmalloc(sizeof(*cap), gfp);
3025 	if (!cap)
3026 		return NULL;
3027 
3028 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
3029 	if (!cap->error)
3030 		goto err_cap;
3031 
3032 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
3033 	if (!cap->error->gt)
3034 		goto err_gpu;
3035 
3036 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
3037 	if (!cap->error->gt->engine)
3038 		goto err_gt;
3039 
3040 	cap->error->gt->engine->hung = true;
3041 
3042 	return cap;
3043 
3044 err_gt:
3045 	kfree(cap->error->gt);
3046 err_gpu:
3047 	kfree(cap->error);
3048 err_cap:
3049 	kfree(cap);
3050 	return NULL;
3051 }
3052 
3053 static struct i915_request *
3054 active_context(struct intel_engine_cs *engine, u32 ccid)
3055 {
3056 	const struct intel_engine_execlists * const el = &engine->execlists;
3057 	struct i915_request * const *port, *rq;
3058 
3059 	/*
3060 	 * Use the most recent result from process_csb(), but just in case
3061 	 * we trigger an error (via interrupt) before the first CS event has
3062 	 * been written, peek at the next submission.
3063 	 */
3064 
3065 	for (port = el->active; (rq = *port); port++) {
3066 		if (rq->context->lrc.ccid == ccid) {
3067 			ENGINE_TRACE(engine,
3068 				     "ccid found at active:%zd\n",
3069 				     port - el->active);
3070 			return rq;
3071 		}
3072 	}
3073 
3074 	for (port = el->pending; (rq = *port); port++) {
3075 		if (rq->context->lrc.ccid == ccid) {
3076 			ENGINE_TRACE(engine,
3077 				     "ccid found at pending:%zd\n",
3078 				     port - el->pending);
3079 			return rq;
3080 		}
3081 	}
3082 
3083 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3084 	return NULL;
3085 }
3086 
3087 static u32 active_ccid(struct intel_engine_cs *engine)
3088 {
3089 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3090 }
3091 
3092 static void execlists_capture(struct intel_engine_cs *engine)
3093 {
3094 	struct execlists_capture *cap;
3095 
3096 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3097 		return;
3098 
3099 	/*
3100 	 * We need to _quickly_ capture the engine state before we reset.
3101 	 * We are inside an atomic section (softirq) here and we are delaying
3102 	 * the forced preemption event.
3103 	 */
3104 	cap = capture_regs(engine);
3105 	if (!cap)
3106 		return;
3107 
3108 	spin_lock_irq(&engine->active.lock);
3109 	cap->rq = active_context(engine, active_ccid(engine));
3110 	if (cap->rq) {
3111 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3112 		cap->rq = i915_request_get_rcu(cap->rq);
3113 	}
3114 	spin_unlock_irq(&engine->active.lock);
3115 	if (!cap->rq)
3116 		goto err_free;
3117 
3118 	/*
3119 	 * Remove the request from the execlists queue, and take ownership
3120 	 * of the request. We pass it to our worker who will _slowly_ compress
3121 	 * all the pages the _user_ requested for debugging their batch, after
3122 	 * which we return it to the queue for signaling.
3123 	 *
3124 	 * By removing them from the execlists queue, we also prevent the
3125 	 * requests from being processed by __unwind_incomplete_requests()
3126 	 * during the intel_engine_reset(), and so they will *not* be replayed
3127 	 * afterwards.
3128 	 *
3129 	 * Note that because we have not yet reset the engine at this point,
3130 	 * it is possible that the request we have identified as being
3131 	 * guilty did in fact complete, and we will then hit an arbitration
3132 	 * point allowing the outstanding preemption to succeed. The likelihood
3133 	 * of that is very low (as capturing of the engine registers should be
3134 	 * fast enough to run inside an irq-off atomic section!), so we will
3135 	 * simply hold that request accountable for being non-preemptible
3136 	 * long enough to force the reset.
3137 	 */
3138 	if (!execlists_hold(engine, cap->rq))
3139 		goto err_rq;
3140 
3141 	INIT_WORK(&cap->work, execlists_capture_work);
3142 	schedule_work(&cap->work);
3143 	return;
3144 
3145 err_rq:
3146 	i915_request_put(cap->rq);
3147 err_free:
3148 	i915_gpu_coredump_put(cap->error);
3149 	kfree(cap);
3150 }
3151 
3152 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3153 {
3154 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3155 	unsigned long *lock = &engine->gt->reset.flags;
3156 
3157 	if (!intel_has_reset_engine(engine->gt))
3158 		return;
3159 
3160 	if (test_and_set_bit(bit, lock))
3161 		return;
3162 
3163 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3164 
3165 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3166 	tasklet_disable_nosync(&engine->execlists.tasklet);
3167 
3168 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3169 	execlists_capture(engine);
3170 	intel_engine_reset(engine, msg);
3171 
3172 	tasklet_enable(&engine->execlists.tasklet);
3173 	clear_and_wake_up_bit(bit, lock);
3174 }
3175 
3176 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3177 {
3178 	const struct timer_list *t = &engine->execlists.preempt;
3179 
3180 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3181 		return false;
3182 
3183 	if (!timer_expired(t))
3184 		return false;
3185 
3186 	return READ_ONCE(engine->execlists.pending[0]);
3187 }
3188 
3189 /*
3190  * Check the unread Context Status Buffers and manage the submission of new
3191  * contexts to the ELSP accordingly.
3192  */
3193 static void execlists_submission_tasklet(unsigned long data)
3194 {
3195 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3196 	bool timeout = preempt_timeout(engine);
3197 
3198 	process_csb(engine);
3199 
3200 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3201 		const char *msg;
3202 
3203 		/* Generate the error message in priority order wrt the user! */
3204 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3205 			msg = "CS error"; /* thrown by a user payload */
3206 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3207 			msg = "invalid CSB event";
3208 		else
3209 			msg = "internal error";
3210 
3211 		engine->execlists.error_interrupt = 0;
3212 		execlists_reset(engine, msg);
3213 	}
3214 
3215 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3216 		unsigned long flags;
3217 
3218 		spin_lock_irqsave(&engine->active.lock, flags);
3219 		__execlists_submission_tasklet(engine);
3220 		spin_unlock_irqrestore(&engine->active.lock, flags);
3221 
3222 		/* Recheck after serialising with direct-submission */
3223 		if (unlikely(timeout && preempt_timeout(engine)))
3224 			execlists_reset(engine, "preemption time out");
3225 	}
3226 }
3227 
3228 static void __execlists_kick(struct intel_engine_execlists *execlists)
3229 {
3230 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3231 	tasklet_hi_schedule(&execlists->tasklet);
3232 }
3233 
3234 #define execlists_kick(t, member) \
3235 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3236 
3237 static void execlists_timeslice(struct timer_list *timer)
3238 {
3239 	execlists_kick(timer, timer);
3240 }
3241 
3242 static void execlists_preempt(struct timer_list *timer)
3243 {
3244 	execlists_kick(timer, preempt);
3245 }
3246 
3247 static void queue_request(struct intel_engine_cs *engine,
3248 			  struct i915_request *rq)
3249 {
3250 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3251 	list_add_tail(&rq->sched.link,
3252 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3253 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3254 }
3255 
3256 static void __submit_queue_imm(struct intel_engine_cs *engine)
3257 {
3258 	struct intel_engine_execlists * const execlists = &engine->execlists;
3259 
3260 	if (reset_in_progress(execlists))
3261 		return; /* defer until we restart the engine following reset */
3262 
3263 	__execlists_submission_tasklet(engine);
3264 }
3265 
3266 static void submit_queue(struct intel_engine_cs *engine,
3267 			 const struct i915_request *rq)
3268 {
3269 	struct intel_engine_execlists *execlists = &engine->execlists;
3270 
3271 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3272 		return;
3273 
3274 	execlists->queue_priority_hint = rq_prio(rq);
3275 	__submit_queue_imm(engine);
3276 }
3277 
3278 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3279 			     const struct i915_request *rq)
3280 {
3281 	GEM_BUG_ON(i915_request_on_hold(rq));
3282 	return !list_empty(&engine->active.hold) && hold_request(rq);
3283 }
3284 
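/*
 * Opportunistically drain the CSB before queueing a new request: if the
 * tasklet is idle and no reset is in progress, consuming any pending events
 * here may clear execlists->pending[] and so allow direct submission.
 */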
3285 static void flush_csb(struct intel_engine_cs *engine)
3286 {
3287 	struct intel_engine_execlists *el = &engine->execlists;
3288 
3289 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3290 		if (!reset_in_progress(el))
3291 			process_csb(engine);
3292 		tasklet_unlock(&el->tasklet);
3293 	}
3294 }
3295 
3296 static void execlists_submit_request(struct i915_request *request)
3297 {
3298 	struct intel_engine_cs *engine = request->engine;
3299 	unsigned long flags;
3300 
3301 	/* Hopefully we clear execlists->pending[] to let us through */
3302 	flush_csb(engine);
3303 
3304 	/* Will be called from irq-context when using foreign fences. */
3305 	spin_lock_irqsave(&engine->active.lock, flags);
3306 
3307 	if (unlikely(ancestor_on_hold(engine, request))) {
3308 		RQ_TRACE(request, "ancestor on hold\n");
3309 		list_add_tail(&request->sched.link, &engine->active.hold);
3310 		i915_request_set_hold(request);
3311 	} else {
3312 		queue_request(engine, request);
3313 
3314 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3315 		GEM_BUG_ON(list_empty(&request->sched.link));
3316 
3317 		submit_queue(engine, request);
3318 	}
3319 
3320 	spin_unlock_irqrestore(&engine->active.lock, flags);
3321 }
3322 
3323 static void __execlists_context_fini(struct intel_context *ce)
3324 {
3325 	intel_ring_put(ce->ring);
3326 	i915_vma_put(ce->state);
3327 }
3328 
3329 static void execlists_context_destroy(struct kref *kref)
3330 {
3331 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3332 
3333 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3334 	GEM_BUG_ON(intel_context_is_pinned(ce));
3335 
3336 	if (ce->state)
3337 		__execlists_context_fini(ce);
3338 
3339 	intel_context_fini(ce);
3340 	intel_context_free(ce);
3341 }
3342 
3343 static void
3344 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3345 {
3346 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3347 		return;
3348 
3349 	vaddr += engine->context_size;
3350 
3351 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3352 }
3353 
3354 static void
3355 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3356 {
3357 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3358 		return;
3359 
3360 	vaddr += engine->context_size;
3361 
3362 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3363 		drm_err_once(&engine->i915->drm,
3364 			     "%s context redzone overwritten!\n",
3365 			     engine->name);
3366 }
3367 
3368 static void execlists_context_unpin(struct intel_context *ce)
3369 {
3370 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3371 		      ce->engine);
3372 }
3373 
3374 static void execlists_context_post_unpin(struct intel_context *ce)
3375 {
3376 	i915_gem_object_unpin_map(ce->state->obj);
3377 }
3378 
3379 static u32 *
3380 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3381 {
3382 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3383 		MI_SRM_LRM_GLOBAL_GTT |
3384 		MI_LRI_LRM_CS_MMIO;
3385 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3386 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3387 		CTX_TIMESTAMP * sizeof(u32);
3388 	*cs++ = 0;
3389 
3390 	*cs++ = MI_LOAD_REGISTER_REG |
3391 		MI_LRR_SOURCE_CS_MMIO |
3392 		MI_LRI_LRM_CS_MMIO;
3393 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3394 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3395 
3396 	*cs++ = MI_LOAD_REGISTER_REG |
3397 		MI_LRR_SOURCE_CS_MMIO |
3398 		MI_LRI_LRM_CS_MMIO;
3399 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3400 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3401 
3402 	return cs;
3403 }
3404 
3405 static u32 *
3406 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3407 {
3408 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3409 
3410 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3411 		MI_SRM_LRM_GLOBAL_GTT |
3412 		MI_LRI_LRM_CS_MMIO;
3413 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3414 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3415 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3416 	*cs++ = 0;
3417 
3418 	return cs;
3419 }
3420 
3421 static u32 *
3422 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3423 {
3424 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3425 
3426 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3427 		MI_SRM_LRM_GLOBAL_GTT |
3428 		MI_LRI_LRM_CS_MMIO;
3429 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3430 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3431 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3432 	*cs++ = 0;
3433 
3434 	*cs++ = MI_LOAD_REGISTER_REG |
3435 		MI_LRR_SOURCE_CS_MMIO |
3436 		MI_LRI_LRM_CS_MMIO;
3437 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3438 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3439 
3440 	return cs;
3441 }
3442 
3443 static u32 *
3444 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3445 {
3446 	cs = gen12_emit_timestamp_wa(ce, cs);
3447 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3448 	cs = gen12_emit_restore_scratch(ce, cs);
3449 
3450 	return cs;
3451 }
3452 
3453 static u32 *
3454 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3455 {
3456 	cs = gen12_emit_timestamp_wa(ce, cs);
3457 	cs = gen12_emit_restore_scratch(ce, cs);
3458 
3459 	return cs;
3460 }
3461 
3462 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3463 {
3464 	return PAGE_SIZE * ce->wa_bb_page;
3465 }
3466 
3467 static u32 *context_indirect_bb(const struct intel_context *ce)
3468 {
3469 	void *ptr;
3470 
3471 	GEM_BUG_ON(!ce->wa_bb_page);
3472 
3473 	ptr = ce->lrc_reg_state;
3474 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3475 	ptr += context_wa_bb_offset(ce);
3476 
3477 	return ptr;
3478 }
3479 
3480 static void
3481 setup_indirect_ctx_bb(const struct intel_context *ce,
3482 		      const struct intel_engine_cs *engine,
3483 		      u32 *(*emit)(const struct intel_context *, u32 *))
3484 {
3485 	u32 * const start = context_indirect_bb(ce);
3486 	u32 *cs;
3487 
3488 	cs = emit(ce, start);
3489 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3490 	while ((unsigned long)cs % CACHELINE_BYTES)
3491 		*cs++ = MI_NOOP;
3492 
3493 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3494 				    i915_ggtt_offset(ce->state) +
3495 				    context_wa_bb_offset(ce),
3496 				    (cs - start) * sizeof(*cs));
3497 }
3498 
3499 static void
3500 __execlists_update_reg_state(const struct intel_context *ce,
3501 			     const struct intel_engine_cs *engine,
3502 			     u32 head)
3503 {
3504 	struct intel_ring *ring = ce->ring;
3505 	u32 *regs = ce->lrc_reg_state;
3506 
3507 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3508 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3509 
3510 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3511 	regs[CTX_RING_HEAD] = head;
3512 	regs[CTX_RING_TAIL] = ring->tail;
3513 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3514 
3515 	/* RPCS */
3516 	if (engine->class == RENDER_CLASS) {
3517 		regs[CTX_R_PWR_CLK_STATE] =
3518 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3519 
3520 		i915_oa_init_reg_state(ce, engine);
3521 	}
3522 
3523 	if (ce->wa_bb_page) {
3524 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3525 
3526 		fn = gen12_emit_indirect_ctx_xcs;
3527 		if (ce->engine->class == RENDER_CLASS)
3528 			fn = gen12_emit_indirect_ctx_rcs;
3529 
3530 		/* Mutually exclusive wrt the global indirect bb */
3531 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3532 		setup_indirect_ctx_bb(ce, engine, fn);
3533 	}
3534 }
3535 
3536 static int
3537 execlists_context_pre_pin(struct intel_context *ce,
3538 			  struct i915_gem_ww_ctx *ww, void **vaddr)
3539 {
3540 	GEM_BUG_ON(!ce->state);
3541 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3542 
3543 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
3544 					i915_coherent_map_type(ce->engine->i915) |
3545 					I915_MAP_OVERRIDE);
3546 
3547 	return PTR_ERR_OR_ZERO(*vaddr);
3548 }
3549 
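/*
 * (Re)build the context descriptor and register state on pin; the
 * CTX_DESC_FORCE_RESTORE flag asks the HW to reload the full image on
 * the next submission instead of reusing any cached context state.
 */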
3550 static int
3551 __execlists_context_pin(struct intel_context *ce,
3552 			struct intel_engine_cs *engine,
3553 			void *vaddr)
3554 {
3555 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3556 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3557 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3558 
3559 	return 0;
3560 }
3561 
3562 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3563 {
3564 	return __execlists_context_pin(ce, ce->engine, vaddr);
3565 }
3566 
3567 static int execlists_context_alloc(struct intel_context *ce)
3568 {
3569 	return __execlists_context_alloc(ce, ce->engine);
3570 }
3571 
3572 static void execlists_context_reset(struct intel_context *ce)
3573 {
3574 	CE_TRACE(ce, "reset\n");
3575 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3576 
3577 	intel_ring_reset(ce->ring, ce->ring->emit);
3578 
3579 	/* Scrub away the garbage */
3580 	execlists_init_reg_state(ce->lrc_reg_state,
3581 				 ce, ce->engine, ce->ring, true);
3582 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3583 
3584 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3585 }
3586 
3587 static const struct intel_context_ops execlists_context_ops = {
3588 	.alloc = execlists_context_alloc,
3589 
3590 	.pre_pin = execlists_context_pre_pin,
3591 	.pin = execlists_context_pin,
3592 	.unpin = execlists_context_unpin,
3593 	.post_unpin = execlists_context_post_unpin,
3594 
3595 	.enter = intel_context_enter_engine,
3596 	.exit = intel_context_exit_engine,
3597 
3598 	.reset = execlists_context_reset,
3599 	.destroy = execlists_context_destroy,
3600 };
3601 
3602 static u32 hwsp_offset(const struct i915_request *rq)
3603 {
3604 	const struct intel_timeline_cacheline *cl;
3605 
3606 	/* Before the request is executed, the timeline/cacheline is fixed */
3607 
3608 	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3609 	if (cl)
3610 		return cl->ggtt_offset;
3611 
3612 	return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3613 }
3614 
3615 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3616 {
3617 	u32 *cs;
3618 
3619 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3620 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3621 		return 0;
3622 
3623 	cs = intel_ring_begin(rq, 6);
3624 	if (IS_ERR(cs))
3625 		return PTR_ERR(cs);
3626 
3627 	/*
3628 	 * Check if we have been preempted before we even get started.
3629 	 *
3630 	 * After this point i915_request_started() reports true, even if
3631 	 * we get preempted and so are no longer running.
3632 	 */
3633 	*cs++ = MI_ARB_CHECK;
3634 	*cs++ = MI_NOOP;
3635 
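	/* Write seqno-1 to the timeline HWSP: the request is now "started" */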
3636 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3637 	*cs++ = hwsp_offset(rq);
3638 	*cs++ = 0;
3639 	*cs++ = rq->fence.seqno - 1;
3640 
3641 	intel_ring_advance(rq, cs);
3642 
3643 	/* Record the updated position of the request's payload */
3644 	rq->infix = intel_ring_offset(rq, cs);
3645 
3646 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3647 
3648 	return 0;
3649 }
3650 
3651 static int emit_pdps(struct i915_request *rq)
3652 {
3653 	const struct intel_engine_cs * const engine = rq->engine;
3654 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3655 	int err, i;
3656 	u32 *cs;
3657 
3658 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3659 
3660 	/*
3661 	 * Beware ye of the dragons, this sequence is magic!
3662 	 *
3663 	 * Small changes to this sequence can cause anything from
3664 	 * GPU hangs to forcewake errors and machine lockups!
3665 	 */
3666 
3667 	/* Flush any residual operations from the context load */
3668 	err = engine->emit_flush(rq, EMIT_FLUSH);
3669 	if (err)
3670 		return err;
3671 
3672 	/* Magic required to prevent forcewake errors! */
3673 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3674 	if (err)
3675 		return err;
3676 
3677 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3678 	if (IS_ERR(cs))
3679 		return PTR_ERR(cs);
3680 
3681 	/* Ensure the LRI have landed before we invalidate & continue */
3682 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3683 	for (i = GEN8_3LVL_PDPES; i--; ) {
3684 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3685 		u32 base = engine->mmio_base;
3686 
3687 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3688 		*cs++ = upper_32_bits(pd_daddr);
3689 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3690 		*cs++ = lower_32_bits(pd_daddr);
3691 	}
3692 	*cs++ = MI_NOOP;
3693 
3694 	intel_ring_advance(rq, cs);
3695 
3696 	return 0;
3697 }
3698 
3699 static int execlists_request_alloc(struct i915_request *request)
3700 {
3701 	int ret;
3702 
3703 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3704 
3705 	/*
3706 	 * Flush enough space to reduce the likelihood of waiting after
3707 	 * we start building the request - in which case we will just
3708 	 * have to repeat work.
3709 	 */
3710 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3711 
3712 	/*
3713 	 * Note that after this point, we have committed to using
3714 	 * this request as it is being used to both track the
3715 	 * state of engine initialisation and liveness of the
3716 	 * golden renderstate above. Think twice before you try
3717 	 * to cancel/unwind this request now.
3718 	 */
3719 
3720 	if (!i915_vm_is_4lvl(request->context->vm)) {
3721 		ret = emit_pdps(request);
3722 		if (ret)
3723 			return ret;
3724 	}
3725 
3726 	/* Unconditionally invalidate GPU caches and TLBs. */
3727 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3728 	if (ret)
3729 		return ret;
3730 
3731 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3732 	return 0;
3733 }
3734 
3735 /*
3736  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3737  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3738  * but there is a slight complication as this is applied in a WA batch where the
3739  * values are only initialized once, so we cannot take the register value at the
3740  * beginning and reuse it further; hence we save its value to memory, upload a
3741  * constant value with bit21 set and then restore it back with the saved value.
3742  * To simplify the WA, a constant value is formed by using the default value
3743  * of this register. This shouldn't be a problem because we are only modifying
3744  * it for a short period and this batch is non-preemptible. We could of course
3745  * use additional instructions that read the actual value of the register
3746  * at that time and set our bit of interest, but that makes the WA more complicated.
3747  *
3748  * This WA is also required for Gen9 so extracting as a function avoids
3749  * code duplication.
3750  */
3751 static u32 *
3752 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3753 {
3754 	/* NB no one else is allowed to scribble over scratch + 256! */
3755 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3756 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3757 	*batch++ = intel_gt_scratch_offset(engine->gt,
3758 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3759 	*batch++ = 0;
3760 
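	/* Upload the register's default value with the flush-coherent-lines bit set */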
3761 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3762 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3763 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3764 
3765 	batch = gen8_emit_pipe_control(batch,
3766 				       PIPE_CONTROL_CS_STALL |
3767 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3768 				       0);
3769 
3770 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3771 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3772 	*batch++ = intel_gt_scratch_offset(engine->gt,
3773 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3774 	*batch++ = 0;
3775 
3776 	return batch;
3777 }
3778 
3779 /*
3780  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3781  * initialized at the beginning and shared across all contexts, but this field
3782  * helps us to have multiple batches at different offsets and select them based
3783  * on some criteria. At the moment this batch always starts at the beginning of
3784  * the page and at this point we don't have multiple wa_ctx batch buffers.
3785  *
3786  * The number of WAs applied is not known at the beginning; we use this field
3787  * to return the number of DWORDS written.
3788  *
3789  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
3790  * so it adds NOOPs as padding to make it cacheline aligned.
3791  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
3792  * together make a complete batch buffer.
3793  */
3794 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3795 {
3796 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3797 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3798 
3799 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3800 	if (IS_BROADWELL(engine->i915))
3801 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3802 
3803 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3804 	/* Actual scratch location is at 128 bytes offset */
3805 	batch = gen8_emit_pipe_control(batch,
3806 				       PIPE_CONTROL_FLUSH_L3 |
3807 				       PIPE_CONTROL_STORE_DATA_INDEX |
3808 				       PIPE_CONTROL_CS_STALL |
3809 				       PIPE_CONTROL_QW_WRITE,
3810 				       LRC_PPHWSP_SCRATCH_ADDR);
3811 
3812 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3813 
3814 	/* Pad to end of cacheline */
3815 	while ((unsigned long)batch % CACHELINE_BYTES)
3816 		*batch++ = MI_NOOP;
3817 
3818 	/*
3819 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3820 	 * execution depends on the length specified in terms of cache lines
3821 	 * in the register CTX_RCS_INDIRECT_CTX
3822 	 */
3823 
3824 	return batch;
3825 }
3826 
3827 struct lri {
3828 	i915_reg_t reg;
3829 	u32 value;
3830 };
3831 
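/*
 * Emit a single MI_LOAD_REGISTER_IMM packet programming @count (reg, value)
 * pairs (the LRI header can encode at most 63), followed by a trailing NOOP
 * so the emission stays an even number of dwords.
 */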
3832 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3833 {
3834 	GEM_BUG_ON(!count || count > 63);
3835 
3836 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3837 	do {
3838 		*batch++ = i915_mmio_reg_offset(lri->reg);
3839 		*batch++ = lri->value;
3840 	} while (lri++, --count);
3841 	*batch++ = MI_NOOP;
3842 
3843 	return batch;
3844 }
3845 
3846 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3847 {
3848 	static const struct lri lri[] = {
3849 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3850 		{
3851 			COMMON_SLICE_CHICKEN2,
3852 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3853 				       0),
3854 		},
3855 
3856 		/* BSpec: 11391 */
3857 		{
3858 			FF_SLICE_CHICKEN,
3859 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3860 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3861 		},
3862 
3863 		/* BSpec: 11299 */
3864 		{
3865 			_3D_CHICKEN3,
3866 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3867 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3868 		}
3869 	};
3870 
3871 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3872 
3873 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3874 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3875 
3876 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3877 	batch = gen8_emit_pipe_control(batch,
3878 				       PIPE_CONTROL_FLUSH_L3 |
3879 				       PIPE_CONTROL_STORE_DATA_INDEX |
3880 				       PIPE_CONTROL_CS_STALL |
3881 				       PIPE_CONTROL_QW_WRITE,
3882 				       LRC_PPHWSP_SCRATCH_ADDR);
3883 
3884 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3885 
3886 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3887 	if (HAS_POOLED_EU(engine->i915)) {
3888 		/*
3889 		 * EU pool configuration is set up along with the golden
3890 		 * context during context initialization. This value depends
3891 		 * on the device type (2x6 or 3x6) and needs to be updated
3892 		 * based on which subslice is disabled, especially for 2x6
3893 		 * devices. However, it is safe to load the default
3894 		 * configuration of a 3x6 device instead of masking off the
3895 		 * corresponding bits, because the HW ignores bits of a
3896 		 * disabled subslice and drops down to the appropriate config.
3897 		 * Please see render_state_setup() in i915_gem_render_state.c
3898 		 * for possible configurations; to avoid duplication they are
3899 		 * not shown here again.
3900 		 */
3901 		*batch++ = GEN9_MEDIA_POOL_STATE;
3902 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3903 		*batch++ = 0x00777000;
3904 		*batch++ = 0;
3905 		*batch++ = 0;
3906 		*batch++ = 0;
3907 	}
3908 
3909 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3910 
3911 	/* Pad to end of cacheline */
3912 	while ((unsigned long)batch % CACHELINE_BYTES)
3913 		*batch++ = MI_NOOP;
3914 
3915 	return batch;
3916 }
3917 
3918 static u32 *
3919 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3920 {
3921 	int i;
3922 
3923 	/*
3924 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3925 	 *
3926 	 * Ensure the engine is idle prior to programming a
3927 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3928 	 */
3929 	batch = gen8_emit_pipe_control(batch,
3930 				       PIPE_CONTROL_CS_STALL,
3931 				       0);
3932 	/*
3933 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3934 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3935 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3936 	 * confusing. Since gen8_emit_pipe_control() already advances the
3937 	 * batch by 6 dwords, we advance the other 10 here, completing a
3938 	 * cacheline. It's not clear if the workaround requires this padding
3939 	 * before other commands, or if it's just the regular padding we would
3940 	 * already have for the workaround bb, so leave it here for now.
3941 	 */
3942 	for (i = 0; i < 10; i++)
3943 		*batch++ = MI_NOOP;
3944 
3945 	/* Pad to end of cacheline */
3946 	while ((unsigned long)batch % CACHELINE_BYTES)
3947 		*batch++ = MI_NOOP;
3948 
3949 	return batch;
3950 }
3951 
3952 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3953 
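/*
 * Allocate a single shmem page for the engine's workaround batches and pin
 * it high in the global GTT; it stays pinned until lrc_destroy_wa_ctx() on
 * engine release.
 */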
3954 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3955 {
3956 	struct drm_i915_gem_object *obj;
3957 	struct i915_vma *vma;
3958 	int err;
3959 
3960 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3961 	if (IS_ERR(obj))
3962 		return PTR_ERR(obj);
3963 
3964 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3965 	if (IS_ERR(vma)) {
3966 		err = PTR_ERR(vma);
3967 		goto err;
3968 	}
3969 
3970 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3971 	if (err)
3972 		goto err;
3973 
3974 	engine->wa_ctx.vma = vma;
3975 	return 0;
3976 
3977 err:
3978 	i915_gem_object_put(obj);
3979 	return err;
3980 }
3981 
3982 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3983 {
3984 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3985 }
3986 
3987 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3988 
3989 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3990 {
3991 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3992 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3993 					    &wa_ctx->per_ctx };
3994 	wa_bb_func_t wa_bb_fn[2];
3995 	void *batch, *batch_ptr;
3996 	unsigned int i;
3997 	int ret;
3998 
3999 	if (engine->class != RENDER_CLASS)
4000 		return 0;
4001 
4002 	switch (INTEL_GEN(engine->i915)) {
4003 	case 12:
4004 	case 11:
4005 		return 0;
4006 	case 10:
4007 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
4008 		wa_bb_fn[1] = NULL;
4009 		break;
4010 	case 9:
4011 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
4012 		wa_bb_fn[1] = NULL;
4013 		break;
4014 	case 8:
4015 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
4016 		wa_bb_fn[1] = NULL;
4017 		break;
4018 	default:
4019 		MISSING_CASE(INTEL_GEN(engine->i915));
4020 		return 0;
4021 	}
4022 
4023 	ret = lrc_setup_wa_ctx(engine);
4024 	if (ret) {
4025 		drm_dbg(&engine->i915->drm,
4026 			"Failed to setup context WA page: %d\n", ret);
4027 		return ret;
4028 	}
4029 
4030 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
4031 
4032 	/*
4033 	 * Emit the two workaround batch buffers, recording the offset from the
4034 	 * start of the workaround batch buffer object for each and their
4035 	 * respective sizes.
4036 	 */
4037 	batch_ptr = batch;
4038 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
4039 		wa_bb[i]->offset = batch_ptr - batch;
4040 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
4041 						  CACHELINE_BYTES))) {
4042 			ret = -EINVAL;
4043 			break;
4044 		}
4045 		if (wa_bb_fn[i])
4046 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4047 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4048 	}
4049 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4050 
4051 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4052 	__i915_gem_object_release_map(wa_ctx->vma->obj);
4053 	if (ret)
4054 		lrc_destroy_wa_ctx(engine);
4055 
4056 	return ret;
4057 }
4058 
4059 static void reset_csb_pointers(struct intel_engine_cs *engine)
4060 {
4061 	struct intel_engine_execlists * const execlists = &engine->execlists;
4062 	const unsigned int reset_value = execlists->csb_size - 1;
4063 
4064 	ring_set_paused(engine, 0);
4065 
4066 	/*
4067 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4068 	 * Bludgeon them with an mmio update to be sure.
4069 	 */
4070 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4071 		     0xffff << 16 | reset_value << 8 | reset_value);
4072 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4073 
4074 	/*
4075 	 * After a reset, the HW starts writing into CSB entry [0]. We
4076 	 * therefore have to set our HEAD pointer back one entry so that
4077 	 * the *first* entry we check is entry 0. To complicate this further,
4078 	 * as we don't wait for the first interrupt after reset, we have to
4079 	 * fake the HW write to point back to the last entry so that our
4080 	 * inline comparison of our cached head position against the last HW
4081 	 * write works even before the first interrupt.
4082 	 */
4083 	execlists->csb_head = reset_value;
4084 	WRITE_ONCE(*execlists->csb_write, reset_value);
4085 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4086 
4087 	/* Check that the GPU does indeed update the CSB entries! */
4088 	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4089 	invalidate_csb_entries(&execlists->csb_status[0],
4090 			       &execlists->csb_status[reset_value]);
4091 
4092 	/* Once more for luck and our trusty paranoia */
4093 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4094 		     0xffff << 16 | reset_value << 8 | reset_value);
4095 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4096 
4097 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4098 }
4099 
4100 static void execlists_sanitize(struct intel_engine_cs *engine)
4101 {
4102 	GEM_BUG_ON(execlists_active(&engine->execlists));
4103 
4104 	/*
4105 	 * Poison residual state on resume, in case the suspend didn't!
4106 	 *
4107 	 * We have to assume that across suspend/resume (or other loss
4108 	 * of control) that the contents of our pinned buffers have been
4109 	 * lost, replaced by garbage. Since this doesn't always happen,
4110 	 * let's poison such state so that we more quickly spot when
4111 	 * we falsely assume it has been preserved.
4112 	 */
4113 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4114 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4115 
4116 	reset_csb_pointers(engine);
4117 
4118 	/*
4119 	 * The kernel_context HWSP is stored in the status_page. As above,
4120 	 * that may be lost on resume/initialisation, and so we need to
4121 	 * reset the value in the HWSP.
4122 	 */
4123 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4124 
4125 	/* And scrub the dirty cachelines for the HWSP */
4126 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4127 }
4128 
4129 static void enable_error_interrupt(struct intel_engine_cs *engine)
4130 {
4131 	u32 status;
4132 
4133 	engine->execlists.error_interrupt = 0;
4134 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4135 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4136 
4137 	status = ENGINE_READ(engine, RING_ESR);
4138 	if (unlikely(status)) {
4139 		drm_err(&engine->i915->drm,
4140 			"engine '%s' resumed still in error: %08x\n",
4141 			engine->name, status);
4142 		__intel_gt_reset(engine->gt, engine->mask);
4143 	}
4144 
4145 	/*
4146 	 * On current gen8+, we have 2 signals to play with
4147 	 *
4148 	 * - I915_ERROR_INSTRUCTION (bit 0)
4149 	 *
4150 	 *    Generate an error if the command parser encounters an invalid
4151 	 *    instruction
4152 	 *
4153 	 *    This is a fatal error.
4154 	 *
4155 	 * - CP_PRIV (bit 2)
4156 	 *
4157 	 *    Generate an error on privilege violation (where the CP replaces
4158 	 *    the instruction with a no-op). This also fires for writes into
4159 	 *    read-only scratch pages.
4160 	 *
4161 	 *    This is a non-fatal error, parsing continues.
4162 	 *
4163 	 * * there are a few others defined for odd HW that we do not use
4164 	 *
4165 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4166 	 * error (as the HW is validating and suppressing the mistakes), we
4167 	 * only unmask the instruction error bit.
4168 	 */
4169 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4170 }
4171 
4172 static void enable_execlists(struct intel_engine_cs *engine)
4173 {
4174 	u32 mode;
4175 
4176 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4177 
4178 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4179 
4180 	if (INTEL_GEN(engine->i915) >= 11)
4181 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4182 	else
4183 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4184 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4185 
4186 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4187 
4188 	ENGINE_WRITE_FW(engine,
4189 			RING_HWS_PGA,
4190 			i915_ggtt_offset(engine->status_page.vma));
4191 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4192 
4193 	enable_error_interrupt(engine);
4194 
4195 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4196 }
4197 
4198 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4199 {
4200 	bool unexpected = false;
4201 
4202 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4203 		drm_dbg(&engine->i915->drm,
4204 			"STOP_RING still set in RING_MI_MODE\n");
4205 		unexpected = true;
4206 	}
4207 
4208 	return unexpected;
4209 }
4210 
4211 static int execlists_resume(struct intel_engine_cs *engine)
4212 {
4213 	intel_mocs_init_engine(engine);
4214 
4215 	intel_breadcrumbs_reset(engine->breadcrumbs);
4216 
4217 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4218 		struct drm_printer p = drm_debug_printer(__func__);
4219 
4220 		intel_engine_dump(engine, &p, NULL);
4221 	}
4222 
4223 	enable_execlists(engine);
4224 
4225 	return 0;
4226 }
4227 
4228 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4229 {
4230 	struct intel_engine_execlists * const execlists = &engine->execlists;
4231 	unsigned long flags;
4232 
4233 	ENGINE_TRACE(engine, "depth<-%d\n",
4234 		     atomic_read(&execlists->tasklet.count));
4235 
4236 	/*
4237 	 * Prevent request submission to the hardware until we have
4238 	 * completed the reset in i915_gem_reset_finish(). If a request
4239 	 * is completed by one engine, it may then queue a request
4240 	 * to a second via its execlists->tasklet *just* as we are
4241 	 * calling engine->resume() and also writing the ELSP.
4242 	 * Turning off the execlists->tasklet until the reset is over
4243 	 * prevents the race.
4244 	 */
4245 	__tasklet_disable_sync_once(&execlists->tasklet);
4246 	GEM_BUG_ON(!reset_in_progress(execlists));
4247 
4248 	/* And flush any current direct submission. */
4249 	spin_lock_irqsave(&engine->active.lock, flags);
4250 	spin_unlock_irqrestore(&engine->active.lock, flags);
4251 
4252 	/*
4253 	 * We stop the engines, otherwise we might get a failed reset and a
4254 	 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
4255 	 * from a system hang if a batchbuffer is progressing when
4256 	 * the reset is issued, regardless of the READY_TO_RESET ack.
4257 	 * Thus we assume it is best to stop the engines on all gens
4258 	 * where we have a gpu reset.
4259 	 *
4260 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4261 	 *
4262 	 * FIXME: Wa for more modern gens needs to be validated
4263 	 */
4264 	ring_set_paused(engine, 1);
4265 	intel_engine_stop_cs(engine);
4266 
4267 	engine->execlists.reset_ccid = active_ccid(engine);
4268 }
4269 
4270 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4271 {
4272 	int x;
4273 
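	/*
	 * RING_MI_MODE is a masked register: set the STOP_RING bit in the
	 * high (mask) word so the write takes effect, while clearing the
	 * value bit itself to let the ring run again after the reset.
	 */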
4274 	x = lrc_ring_mi_mode(engine);
4275 	if (x != -1) {
4276 		regs[x + 1] &= ~STOP_RING;
4277 		regs[x + 1] |= STOP_RING << 16;
4278 	}
4279 }
4280 
4281 static void __execlists_reset_reg_state(const struct intel_context *ce,
4282 					const struct intel_engine_cs *engine)
4283 {
4284 	u32 *regs = ce->lrc_reg_state;
4285 
4286 	__reset_stop_ring(regs, engine);
4287 }
4288 
4289 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4290 {
4291 	struct intel_engine_execlists * const execlists = &engine->execlists;
4292 	struct intel_context *ce;
4293 	struct i915_request *rq;
4294 	u32 head;
4295 
4296 	mb(); /* paranoia: read the CSB pointers from after the reset */
4297 	clflush(execlists->csb_write);
4298 	mb();
4299 
4300 	process_csb(engine); /* drain preemption events */
4301 
4302 	/* Following the reset, we need to reload the CSB read/write pointers */
4303 	reset_csb_pointers(engine);
4304 
4305 	/*
4306 	 * Save the currently executing context, even if we completed
4307 	 * its request; it was still running at the time of the
4308 	 * reset and will have been clobbered.
4309 	 */
4310 	rq = active_context(engine, engine->execlists.reset_ccid);
4311 	if (!rq)
4312 		goto unwind;
4313 
4314 	ce = rq->context;
4315 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4316 
4317 	if (i915_request_completed(rq)) {
4318 		/* Idle context; tidy up the ring so we can restart afresh */
4319 		head = intel_ring_wrap(ce->ring, rq->tail);
4320 		goto out_replay;
4321 	}
4322 
4323 	/* We still have requests in-flight; the engine should be active */
4324 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4325 
4326 	/* Context has requests still in-flight; it should not be idle! */
4327 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4328 
4329 	rq = active_request(ce->timeline, rq);
4330 	head = intel_ring_wrap(ce->ring, rq->head);
4331 	GEM_BUG_ON(head == ce->ring->tail);
4332 
4333 	/*
4334 	 * If this request hasn't started yet, e.g. it is waiting on a
4335 	 * semaphore, we need to avoid skipping the request or else we
4336 	 * break the signaling chain. However, if the context is corrupt
4337 	 * the request will not restart and we will be stuck with a wedged
4338 	 * device. It is quite often the case that if we issue a reset
4339 	 * while the GPU is loading the context image, that the context
4340 	 * image becomes corrupt.
4341 	 *
4342 	 * Otherwise, if we have not started yet, the request should replay
4343 	 * perfectly and we do not need to flag the result as being erroneous.
4344 	 */
4345 	if (!i915_request_started(rq))
4346 		goto out_replay;
4347 
4348 	/*
4349 	 * If the request was innocent, we leave the request in the ELSP
4350 	 * and will try to replay it on restarting. The context image may
4351 	 * have been corrupted by the reset, in which case we may have
4352 	 * to service a new GPU hang, but more likely we can continue on
4353 	 * without impact.
4354 	 *
4355 	 * If the request was guilty, we presume the context is corrupt
4356 	 * and have to at least restore the RING register in the context
4357 	 * image back to the expected values to skip over the guilty request.
4358 	 */
4359 	__i915_request_reset(rq, stalled);
4360 
4361 	/*
4362 	 * We want a simple context + ring to execute the breadcrumb update.
4363 	 * We cannot rely on the context being intact across the GPU hang,
4364 	 * so clear it and rebuild just what we need for the breadcrumb.
4365 	 * All pending requests for this context will be zapped, and any
4366 	 * future request will be after userspace has had the opportunity
4367 	 * to recreate its own state.
4368 	 */
4369 out_replay:
4370 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4371 		     head, ce->ring->tail);
4372 	__execlists_reset_reg_state(ce, engine);
4373 	__execlists_update_reg_state(ce, engine, head);
4374 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4375 
4376 unwind:
4377 	/* Push back any incomplete requests for replay after the reset. */
4378 	cancel_port_requests(execlists);
4379 	__unwind_incomplete_requests(engine);
4380 }
4381 
4382 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4383 {
4384 	unsigned long flags;
4385 
4386 	ENGINE_TRACE(engine, "\n");
4387 
4388 	spin_lock_irqsave(&engine->active.lock, flags);
4389 
4390 	__execlists_reset(engine, stalled);
4391 
4392 	spin_unlock_irqrestore(&engine->active.lock, flags);
4393 }
4394 
4395 static void nop_submission_tasklet(unsigned long data)
4396 {
4397 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4398 
4399 	/* The driver is wedged; don't process any more events. */
4400 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4401 }
4402 
4403 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4404 {
4405 	struct intel_engine_execlists * const execlists = &engine->execlists;
4406 	struct i915_request *rq, *rn;
4407 	struct rb_node *rb;
4408 	unsigned long flags;
4409 
4410 	ENGINE_TRACE(engine, "\n");
4411 
4412 	/*
4413 	 * Before we call engine->cancel_requests(), we should have exclusive
4414 	 * access to the submission state. This is arranged for us by the
4415 	 * caller disabling the interrupt generation, the tasklet and other
4416 	 * threads that may then access the same state, giving us a free hand
4417 	 * to reset state. However, we still need to let lockdep be aware that
4418 	 * we know this state may be accessed in hardirq context, so we
4419 	 * disable the irq around this manipulation and we want to keep
4420 	 * the spinlock focused on its duties and not accidentally conflate
4421 	 * coverage to the submission's irq state. (Similarly, although we
4422 	 * shouldn't need to disable irq around the manipulation of the
4423 	 * submission's irq state, we also wish to remind ourselves that
4424 	 * it is irq state.)
4425 	 */
4426 	spin_lock_irqsave(&engine->active.lock, flags);
4427 
4428 	__execlists_reset(engine, true);
4429 
4430 	/* Mark all executing requests as skipped. */
4431 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4432 		mark_eio(rq);
4433 	intel_engine_signal_breadcrumbs(engine);
4434 
4435 	/* Flush the queued requests to the timeline list (for retiring). */
4436 	while ((rb = rb_first_cached(&execlists->queue))) {
4437 		struct i915_priolist *p = to_priolist(rb);
4438 		int i;
4439 
4440 		priolist_for_each_request_consume(rq, rn, p, i) {
4441 			mark_eio(rq);
4442 			__i915_request_submit(rq);
4443 		}
4444 
4445 		rb_erase_cached(&p->node, &execlists->queue);
4446 		i915_priolist_free(p);
4447 	}
4448 
4449 	/* On-hold requests will be flushed to timeline upon their release */
4450 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4451 		mark_eio(rq);
4452 
4453 	/* Cancel all attached virtual engines */
4454 	while ((rb = rb_first_cached(&execlists->virtual))) {
4455 		struct virtual_engine *ve =
4456 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4457 
4458 		rb_erase_cached(rb, &execlists->virtual);
4459 		RB_CLEAR_NODE(rb);
4460 
4461 		spin_lock(&ve->base.active.lock);
4462 		rq = fetch_and_zero(&ve->request);
4463 		if (rq) {
4464 			mark_eio(rq);
4465 
4466 			rq->engine = engine;
4467 			__i915_request_submit(rq);
4468 			i915_request_put(rq);
4469 
4470 			ve->base.execlists.queue_priority_hint = INT_MIN;
4471 		}
4472 		spin_unlock(&ve->base.active.lock);
4473 	}
4474 
4475 	/* Remaining _unready_ requests will be nop'ed when submitted */
4476 
4477 	execlists->queue_priority_hint = INT_MIN;
4478 	execlists->queue = RB_ROOT_CACHED;
4479 
4480 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4481 	execlists->tasklet.func = nop_submission_tasklet;
4482 
4483 	spin_unlock_irqrestore(&engine->active.lock, flags);
4484 }
4485 
4486 static void execlists_reset_finish(struct intel_engine_cs *engine)
4487 {
4488 	struct intel_engine_execlists * const execlists = &engine->execlists;
4489 
4490 	/*
4491 	 * After a GPU reset, we may have requests to replay. Do so now while
4492 	 * we still have the forcewake to be sure that the GPU is not allowed
4493 	 * to sleep before we restart and reload a context.
4494 	 */
4495 	GEM_BUG_ON(!reset_in_progress(execlists));
4496 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4497 		execlists->tasklet.func(execlists->tasklet.data);
4498 
4499 	if (__tasklet_enable(&execlists->tasklet))
4500 		/* And kick in case we missed a new request submission. */
4501 		tasklet_hi_schedule(&execlists->tasklet);
4502 	ENGINE_TRACE(engine, "depth->%d\n",
4503 		     atomic_read(&execlists->tasklet.count));
4504 }
4505 
4506 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4507 				    u64 offset, u32 len,
4508 				    const unsigned int flags)
4509 {
4510 	u32 *cs;
4511 
4512 	cs = intel_ring_begin(rq, 4);
4513 	if (IS_ERR(cs))
4514 		return PTR_ERR(cs);
4515 
4516 	/*
4517 	 * WaDisableCtxRestoreArbitration:bdw,chv
4518 	 *
4519 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4520 	 * particular on all the gens that do not need the w/a at all!); if we
4521 	 * took care to make sure that on every switch into this context
4522 	 * (both ordinary and for preemption) arbitration was enabled,
4523 	 * we would be fine.  However, for gen8 there is another w/a that
4524 	 * requires us to not preempt inside GPGPU execution, so we keep
4525 	 * arbitration disabled for gen8 batches. Arbitration will be
4526 	 * re-enabled before we close the request
4527 	 * (engine->emit_fini_breadcrumb).
4528 	 */
4529 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4530 
4531 	/* FIXME(BDW+): Address space and security selectors. */
4532 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4533 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4534 	*cs++ = lower_32_bits(offset);
4535 	*cs++ = upper_32_bits(offset);
4536 
4537 	intel_ring_advance(rq, cs);
4538 
4539 	return 0;
4540 }
4541 
4542 static int gen8_emit_bb_start(struct i915_request *rq,
4543 			      u64 offset, u32 len,
4544 			      const unsigned int flags)
4545 {
4546 	u32 *cs;
4547 
4548 	cs = intel_ring_begin(rq, 6);
4549 	if (IS_ERR(cs))
4550 		return PTR_ERR(cs);
4551 
4552 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4553 
4554 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4555 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4556 	*cs++ = lower_32_bits(offset);
4557 	*cs++ = upper_32_bits(offset);
4558 
4559 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4560 	*cs++ = MI_NOOP;
4561 
4562 	intel_ring_advance(rq, cs);
4563 
4564 	return 0;
4565 }
4566 
4567 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4568 {
4569 	ENGINE_WRITE(engine, RING_IMR,
4570 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4571 	ENGINE_POSTING_READ(engine, RING_IMR);
4572 }
4573 
4574 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4575 {
4576 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4577 }
4578 
4579 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4580 {
4581 	u32 cmd, *cs;
4582 
4583 	cs = intel_ring_begin(request, 4);
4584 	if (IS_ERR(cs))
4585 		return PTR_ERR(cs);
4586 
4587 	cmd = MI_FLUSH_DW + 1;
4588 
4589 	/* We always require a command barrier so that subsequent
4590 	 * commands, such as breadcrumb interrupts, are strictly ordered
4591 	 * wrt the contents of the write cache being flushed to memory
4592 	 * (and thus being coherent from the CPU).
4593 	 */
4594 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4595 
4596 	if (mode & EMIT_INVALIDATE) {
4597 		cmd |= MI_INVALIDATE_TLB;
4598 		if (request->engine->class == VIDEO_DECODE_CLASS)
4599 			cmd |= MI_INVALIDATE_BSD;
4600 	}
4601 
4602 	*cs++ = cmd;
4603 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4604 	*cs++ = 0; /* upper addr */
4605 	*cs++ = 0; /* value */
4606 	intel_ring_advance(request, cs);
4607 
4608 	return 0;
4609 }
4610 
4611 static int gen8_emit_flush_render(struct i915_request *request,
4612 				  u32 mode)
4613 {
4614 	bool vf_flush_wa = false, dc_flush_wa = false;
4615 	u32 *cs, flags = 0;
4616 	int len;
4617 
4618 	flags |= PIPE_CONTROL_CS_STALL;
4619 
4620 	if (mode & EMIT_FLUSH) {
4621 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4622 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4623 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4624 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4625 	}
4626 
4627 	if (mode & EMIT_INVALIDATE) {
4628 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4629 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4630 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4631 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4632 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4633 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4634 		flags |= PIPE_CONTROL_QW_WRITE;
4635 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4636 
4637 		/*
4638 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4639 		 * pipe control.
4640 		 */
4641 		if (IS_GEN(request->engine->i915, 9))
4642 			vf_flush_wa = true;
4643 
4644 		/* WaForGAMHang:kbl */
4645 		if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4646 			dc_flush_wa = true;
4647 	}
4648 
4649 	len = 6;
4650 
4651 	if (vf_flush_wa)
4652 		len += 6;
4653 
4654 	if (dc_flush_wa)
4655 		len += 12;
4656 
4657 	cs = intel_ring_begin(request, len);
4658 	if (IS_ERR(cs))
4659 		return PTR_ERR(cs);
4660 
4661 	if (vf_flush_wa)
4662 		cs = gen8_emit_pipe_control(cs, 0, 0);
4663 
4664 	if (dc_flush_wa)
4665 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4666 					    0);
4667 
4668 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4669 
4670 	if (dc_flush_wa)
4671 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4672 
4673 	intel_ring_advance(request, cs);
4674 
4675 	return 0;
4676 }
4677 
4678 static int gen11_emit_flush_render(struct i915_request *request,
4679 				   u32 mode)
4680 {
4681 	if (mode & EMIT_FLUSH) {
4682 		u32 *cs;
4683 		u32 flags = 0;
4684 
4685 		flags |= PIPE_CONTROL_CS_STALL;
4686 
4687 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4688 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4689 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4690 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4691 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4692 		flags |= PIPE_CONTROL_QW_WRITE;
4693 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4694 
4695 		cs = intel_ring_begin(request, 6);
4696 		if (IS_ERR(cs))
4697 			return PTR_ERR(cs);
4698 
4699 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4700 		intel_ring_advance(request, cs);
4701 	}
4702 
4703 	if (mode & EMIT_INVALIDATE) {
4704 		u32 *cs;
4705 		u32 flags = 0;
4706 
4707 		flags |= PIPE_CONTROL_CS_STALL;
4708 
4709 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4710 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4711 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4712 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4713 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4714 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4715 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4716 		flags |= PIPE_CONTROL_QW_WRITE;
4717 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4718 
4719 		cs = intel_ring_begin(request, 6);
4720 		if (IS_ERR(cs))
4721 			return PTR_ERR(cs);
4722 
4723 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4724 		intel_ring_advance(request, cs);
4725 	}
4726 
4727 	return 0;
4728 }
4729 
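/*
 * On gen12 the MI_ARB_CHECK encoding doubles as the pre-parser control:
 * bit 8 is the write-enable mask for the pre-fetch disable state carried
 * in bit 0.
 */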
4730 static u32 preparser_disable(bool state)
4731 {
4732 	return MI_ARB_CHECK | 1 << 8 | state;
4733 }
4734 
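/*
 * Each VD/VE instance has its own AUX table invalidation register; pick
 * the one matching this engine (hsdes: 1809175790).
 */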
4735 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4736 {
4737 	static const i915_reg_t vd[] = {
4738 		GEN12_VD0_AUX_NV,
4739 		GEN12_VD1_AUX_NV,
4740 		GEN12_VD2_AUX_NV,
4741 		GEN12_VD3_AUX_NV,
4742 	};
4743 
4744 	static const i915_reg_t ve[] = {
4745 		GEN12_VE0_AUX_NV,
4746 		GEN12_VE1_AUX_NV,
4747 	};
4748 
4749 	if (engine->class == VIDEO_DECODE_CLASS)
4750 		return vd[engine->instance];
4751 
4752 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4753 		return ve[engine->instance];
4754 
4755 	GEM_BUG_ON("unknown aux_inv_reg\n");
4756 
4757 	return INVALID_MMIO_REG;
4758 }
4759 
4760 static u32 *
4761 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4762 {
4763 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4764 	*cs++ = i915_mmio_reg_offset(inv_reg);
4765 	*cs++ = AUX_INV;
4766 	*cs++ = MI_NOOP;
4767 
4768 	return cs;
4769 }
4770 
4771 static int gen12_emit_flush_render(struct i915_request *request,
4772 				   u32 mode)
4773 {
4774 	if (mode & EMIT_FLUSH) {
4775 		u32 flags = 0;
4776 		u32 *cs;
4777 
4778 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4779 		flags |= PIPE_CONTROL_FLUSH_L3;
4780 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4781 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4782 		/* Wa_1409600907:tgl */
4783 		flags |= PIPE_CONTROL_DEPTH_STALL;
4784 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4785 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4786 
4787 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4788 		flags |= PIPE_CONTROL_QW_WRITE;
4789 
4790 		flags |= PIPE_CONTROL_CS_STALL;
4791 
4792 		cs = intel_ring_begin(request, 6);
4793 		if (IS_ERR(cs))
4794 			return PTR_ERR(cs);
4795 
4796 		cs = gen12_emit_pipe_control(cs,
4797 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4798 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4799 		intel_ring_advance(request, cs);
4800 	}
4801 
4802 	if (mode & EMIT_INVALIDATE) {
4803 		u32 flags = 0;
4804 		u32 *cs;
4805 
4806 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4807 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4808 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4809 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4810 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4811 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4812 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4813 
4814 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4815 		flags |= PIPE_CONTROL_QW_WRITE;
4816 
4817 		flags |= PIPE_CONTROL_CS_STALL;
4818 
4819 		cs = intel_ring_begin(request, 8 + 4);
4820 		if (IS_ERR(cs))
4821 			return PTR_ERR(cs);
4822 
4823 		/*
4824 		 * Prevent the pre-parser from skipping past the TLB
4825 		 * invalidate and loading a stale page for the batch
4826 		 * buffer / request payload.
4827 		 */
4828 		*cs++ = preparser_disable(true);
4829 
4830 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4831 
4832 		/* hsdes: 1809175790 */
4833 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4834 
4835 		*cs++ = preparser_disable(false);
4836 		intel_ring_advance(request, cs);
4837 	}
4838 
4839 	return 0;
4840 }
4841 
4842 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4843 {
4844 	intel_engine_mask_t aux_inv = 0;
4845 	u32 cmd, *cs;
4846 
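	/*
	 * Ring-space accounting: 4 dwords for MI_FLUSH_DW itself, 2 for the
	 * pre-parser disable/enable pair around an invalidate, and for AUX
	 * invalidation an LRI header plus trailing NOOP plus one
	 * (reg, AUX_INV) pair per engine covered.
	 */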
4847 	cmd = 4;
4848 	if (mode & EMIT_INVALIDATE)
4849 		cmd += 2;
4850 	if (mode & EMIT_INVALIDATE)
4851 		aux_inv = request->engine->mask & ~BIT(BCS0);
4852 	if (aux_inv)
4853 		cmd += 2 * hweight8(aux_inv) + 2;
4854 
4855 	cs = intel_ring_begin(request, cmd);
4856 	if (IS_ERR(cs))
4857 		return PTR_ERR(cs);
4858 
4859 	if (mode & EMIT_INVALIDATE)
4860 		*cs++ = preparser_disable(true);
4861 
4862 	cmd = MI_FLUSH_DW + 1;
4863 
4864 	/* We always require a command barrier so that subsequent
4865 	 * commands, such as breadcrumb interrupts, are strictly ordered
4866 	 * wrt the contents of the write cache being flushed to memory
4867 	 * (and thus being coherent from the CPU).
4868 	 */
4869 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4870 
4871 	if (mode & EMIT_INVALIDATE) {
4872 		cmd |= MI_INVALIDATE_TLB;
4873 		if (request->engine->class == VIDEO_DECODE_CLASS)
4874 			cmd |= MI_INVALIDATE_BSD;
4875 	}
4876 
4877 	*cs++ = cmd;
4878 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4879 	*cs++ = 0; /* upper addr */
4880 	*cs++ = 0; /* value */
4881 
4882 	if (aux_inv) { /* hsdes: 1809175790 */
4883 		struct intel_engine_cs *engine;
4884 		unsigned int tmp;
4885 
4886 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4887 		for_each_engine_masked(engine, request->engine->gt,
4888 				       aux_inv, tmp) {
4889 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4890 			*cs++ = AUX_INV;
4891 		}
4892 		*cs++ = MI_NOOP;
4893 	}
4894 
4895 	if (mode & EMIT_INVALIDATE)
4896 		*cs++ = preparser_disable(false);
4897 
4898 	intel_ring_advance(request, cs);
4899 
4900 	return 0;
4901 }
4902 
4903 static void assert_request_valid(struct i915_request *rq)
4904 {
4905 	struct intel_ring *ring __maybe_unused = rq->ring;
4906 
4907 	/* Can we unwind this request without appearing to go forwards? */
4908 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4909 }
4910 
4911 /*
4912  * Reserve space for 2 NOOPs at the end of each request to be
4913  * used as a workaround for not being allowed to do lite
4914  * restore with HEAD==TAIL (WaIdleLiteRestore).
4915  */
4916 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4917 {
4918 	/* Ensure there's always at least one preemption point per-request. */
4919 	*cs++ = MI_ARB_CHECK;
4920 	*cs++ = MI_NOOP;
4921 	request->wa_tail = intel_ring_offset(request, cs);
4922 
4923 	/* Check that entire request is less than half the ring */
4924 	assert_request_valid(request);
4925 
4926 	return cs;
4927 }
4928 
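/*
 * Busy-wait on the per-engine preempt semaphore in the HWSP: while the
 * driver holds it non-zero (see ring_set_paused()), the CS spins here at
 * the end of a request instead of running on, which is what lets us
 * implement preempt-to-busy.
 */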
4929 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4930 {
4931 	*cs++ = MI_SEMAPHORE_WAIT |
4932 		MI_SEMAPHORE_GLOBAL_GTT |
4933 		MI_SEMAPHORE_POLL |
4934 		MI_SEMAPHORE_SAD_EQ_SDD;
4935 	*cs++ = 0;
4936 	*cs++ = intel_hws_preempt_address(request->engine);
4937 	*cs++ = 0;
4938 
4939 	return cs;
4940 }
4941 
4942 static __always_inline u32*
4943 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4944 {
4945 	*cs++ = MI_USER_INTERRUPT;
4946 
4947 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4948 	if (intel_engine_has_semaphores(request->engine))
4949 		cs = emit_preempt_busywait(request, cs);
4950 
4951 	request->tail = intel_ring_offset(request, cs);
4952 	assert_ring_tail_valid(request->ring, request->tail);
4953 
4954 	return gen8_emit_wa_tail(request, cs);
4955 }
4956 
4957 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4958 {
4959 	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4960 }
4961 
4962 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4963 {
4964 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4965 }
4966 
4967 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4968 {
4969 	cs = gen8_emit_pipe_control(cs,
4970 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4971 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4972 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4973 				    0);
4974 
4975 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4976 	cs = gen8_emit_ggtt_write_rcs(cs,
4977 				      request->fence.seqno,
4978 				      hwsp_offset(request),
4979 				      PIPE_CONTROL_FLUSH_ENABLE |
4980 				      PIPE_CONTROL_CS_STALL);
4981 
4982 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4983 }
4984 
4985 static u32 *
4986 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4987 {
4988 	cs = gen8_emit_ggtt_write_rcs(cs,
4989 				      request->fence.seqno,
4990 				      hwsp_offset(request),
4991 				      PIPE_CONTROL_CS_STALL |
4992 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4993 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4994 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4995 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4996 				      PIPE_CONTROL_FLUSH_ENABLE);
4997 
4998 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4999 }
5000 
5001 /*
5002  * Note that the CS instruction pre-parser will not stall on the breadcrumb
5003  * flush and will continue pre-fetching the instructions after it before the
5004  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
5005  * BB_START/END instructions, so, even though we might pre-fetch the preamble
5006  * of the next request before the memory has been flushed, we're guaranteed that
5007  * we won't access the batch itself too early.
5008  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
5009  * so, if the current request is modifying an instruction in the next request on
5010  * the same intel_context, we might pre-fetch and then execute the pre-update
5011  * instruction. To avoid this, the users of self-modifying code should either
5012  * disable the parser around the code emitting the memory writes, via a new flag
5013  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
5014  * the in-kernel use-cases we've opted to use a separate context, see
5015  * reloc_gpu() as an example.
5016  * All the above applies only to the instructions themselves. Non-inline data
5017  * used by the instructions is not pre-fetched.
5018  */
5019 
5020 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
5021 {
5022 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
5023 		MI_SEMAPHORE_GLOBAL_GTT |
5024 		MI_SEMAPHORE_POLL |
5025 		MI_SEMAPHORE_SAD_EQ_SDD;
5026 	*cs++ = 0;
5027 	*cs++ = intel_hws_preempt_address(request->engine);
5028 	*cs++ = 0;
5029 	*cs++ = 0;
5030 	*cs++ = MI_NOOP;
5031 
5032 	return cs;
5033 }
5034 
5035 static __always_inline u32*
5036 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
5037 {
5038 	*cs++ = MI_USER_INTERRUPT;
5039 
5040 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
5041 	if (intel_engine_has_semaphores(request->engine))
5042 		cs = gen12_emit_preempt_busywait(request, cs);
5043 
5044 	request->tail = intel_ring_offset(request, cs);
5045 	assert_ring_tail_valid(request->ring, request->tail);
5046 
5047 	return gen8_emit_wa_tail(request, cs);
5048 }
5049 
5050 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5051 {
5052 	/* XXX Stalling flush before seqno write; post-sync not */
5053 	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5054 	return gen12_emit_fini_breadcrumb_tail(rq, cs);
5055 }
5056 
5057 static u32 *
5058 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5059 {
5060 	cs = gen12_emit_ggtt_write_rcs(cs,
5061 				       request->fence.seqno,
5062 				       hwsp_offset(request),
5063 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5064 				       PIPE_CONTROL_CS_STALL |
5065 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
5066 				       PIPE_CONTROL_FLUSH_L3 |
5067 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5068 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5069 				       /* Wa_1409600907:tgl */
5070 				       PIPE_CONTROL_DEPTH_STALL |
5071 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
5072 				       PIPE_CONTROL_FLUSH_ENABLE);
5073 
5074 	return gen12_emit_fini_breadcrumb_tail(request, cs);
5075 }
5076 
5077 static void execlists_park(struct intel_engine_cs *engine)
5078 {
5079 	cancel_timer(&engine->execlists.timer);
5080 	cancel_timer(&engine->execlists.preempt);
5081 }
5082 
5083 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5084 {
5085 	engine->submit_request = execlists_submit_request;
5086 	engine->schedule = i915_schedule;
5087 	engine->execlists.tasklet.func = execlists_submission_tasklet;
5088 
5089 	engine->reset.prepare = execlists_reset_prepare;
5090 	engine->reset.rewind = execlists_reset_rewind;
5091 	engine->reset.cancel = execlists_reset_cancel;
5092 	engine->reset.finish = execlists_reset_finish;
5093 
5094 	engine->park = execlists_park;
5095 	engine->unpark = NULL;
5096 
5097 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5098 	if (!intel_vgpu_active(engine->i915)) {
5099 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5100 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5101 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5102 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5103 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5104 		}
5105 	}
5106 
5107 	if (INTEL_GEN(engine->i915) >= 12)
5108 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5109 
5110 	if (intel_engine_has_preemption(engine))
5111 		engine->emit_bb_start = gen8_emit_bb_start;
5112 	else
5113 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5114 }
5115 
5116 static void execlists_shutdown(struct intel_engine_cs *engine)
5117 {
5118 	/* Synchronise with residual timers and any softirq they raise */
5119 	del_timer_sync(&engine->execlists.timer);
5120 	del_timer_sync(&engine->execlists.preempt);
5121 	tasklet_kill(&engine->execlists.tasklet);
5122 }
5123 
5124 static void execlists_release(struct intel_engine_cs *engine)
5125 {
5126 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5127 
5128 	execlists_shutdown(engine);
5129 
5130 	intel_engine_cleanup_common(engine);
5131 	lrc_destroy_wa_ctx(engine);
5132 }
5133 
5134 static void
5135 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5136 {
5137 	/* Default vfuncs which can be overridden by each engine. */
5138 
5139 	engine->resume = execlists_resume;
5140 
5141 	engine->cops = &execlists_context_ops;
5142 	engine->request_alloc = execlists_request_alloc;
5143 
5144 	engine->emit_flush = gen8_emit_flush;
5145 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5146 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5147 	if (INTEL_GEN(engine->i915) >= 12) {
5148 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5149 		engine->emit_flush = gen12_emit_flush;
5150 	}
5151 	engine->set_default_submission = intel_execlists_set_default_submission;
5152 
5153 	if (INTEL_GEN(engine->i915) < 11) {
5154 		engine->irq_enable = gen8_logical_ring_enable_irq;
5155 		engine->irq_disable = gen8_logical_ring_disable_irq;
5156 	} else {
5157 		/*
5158 		 * TODO: On Gen11 interrupt masks need to be clear
5159 		 * to allow C6 entry. Keep interrupts enabled
5160 		 * and take the hit of generating extra interrupts
5161 		 * until a more refined solution exists.
5162 		 */
5163 	}
5164 }
5165 
5166 static inline void
5167 logical_ring_default_irqs(struct intel_engine_cs *engine)
5168 {
5169 	unsigned int shift = 0;
5170 
5171 	if (INTEL_GEN(engine->i915) < 11) {
5172 		const u8 irq_shifts[] = {
5173 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5174 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5175 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5176 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5177 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5178 		};
5179 
5180 		shift = irq_shifts[engine->id];
5181 	}
5182 
5183 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5184 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5185 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5186 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5187 }
5188 
5189 static void rcs_submission_override(struct intel_engine_cs *engine)
5190 {
5191 	switch (INTEL_GEN(engine->i915)) {
5192 	case 12:
5193 		engine->emit_flush = gen12_emit_flush_render;
5194 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5195 		break;
5196 	case 11:
5197 		engine->emit_flush = gen11_emit_flush_render;
5198 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5199 		break;
5200 	default:
5201 		engine->emit_flush = gen8_emit_flush_render;
5202 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5203 		break;
5204 	}
5205 }
5206 
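/*
 * One-time setup of an engine for execlists submission: install the
 * submission tasklet and timers, the default vfuncs and irq masks, the
 * workaround batch buffers, and cache the ELSQ/ELSP and CSB locations.
 */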
5207 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5208 {
5209 	struct intel_engine_execlists * const execlists = &engine->execlists;
5210 	struct drm_i915_private *i915 = engine->i915;
5211 	struct intel_uncore *uncore = engine->uncore;
5212 	u32 base = engine->mmio_base;
5213 
5214 	tasklet_init(&engine->execlists.tasklet,
5215 		     execlists_submission_tasklet, (unsigned long)engine);
5216 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5217 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5218 
5219 	logical_ring_default_vfuncs(engine);
5220 	logical_ring_default_irqs(engine);
5221 
5222 	if (engine->class == RENDER_CLASS)
5223 		rcs_submission_override(engine);
5224 
5225 	if (intel_init_workaround_bb(engine))
5226 		/*
5227 		 * We continue even if we fail to initialize the WA batch
5228 		 * because we only expect rare glitches, nothing
5229 		 * critical that would prevent us from using the GPU.
5230 		 */
5231 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5232 
5233 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5234 		execlists->submit_reg = uncore->regs +
5235 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5236 		execlists->ctrl_reg = uncore->regs +
5237 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5238 	} else {
5239 		execlists->submit_reg = uncore->regs +
5240 			i915_mmio_reg_offset(RING_ELSP(base));
5241 	}
5242 
5243 	execlists->csb_status =
5244 		(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5245 
5246 	execlists->csb_write =
5247 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5248 
5249 	if (INTEL_GEN(i915) < 11)
5250 		execlists->csb_size = GEN8_CSB_ENTRIES;
5251 	else
5252 		execlists->csb_size = GEN11_CSB_ENTRIES;
5253 
5254 	if (INTEL_GEN(engine->i915) >= 11) {
5255 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5256 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5257 	}
5258 
5259 	/* Finally, take ownership and responsibility for cleanup! */
5260 	engine->sanitize = execlists_sanitize;
5261 	engine->release = execlists_release;
5262 
5263 	return 0;
5264 }
5265 
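/*
 * CTX_CONTEXT_CONTROL is a masked register: the high 16 bits select
 * which of the low 16 bits the write actually updates, which is why
 * _MASKED_BIT_ENABLE/_MASKED_BIT_DISABLE are used below.
 */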
5266 static void init_common_reg_state(u32 * const regs,
5267 				  const struct intel_engine_cs *engine,
5268 				  const struct intel_ring *ring,
5269 				  bool inhibit)
5270 {
5271 	u32 ctl;
5272 
5273 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5274 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5275 	if (inhibit)
5276 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5277 	if (INTEL_GEN(engine->i915) < 11)
5278 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5279 					   CTX_CTRL_RS_CTX_ENABLE);
5280 	regs[CTX_CONTEXT_CONTROL] = ctl;
5281 
5282 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5283 	regs[CTX_TIMESTAMP] = 0;
5284 }
5285 
5286 static void init_wa_bb_reg_state(u32 * const regs,
5287 				 const struct intel_engine_cs *engine)
5288 {
5289 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5290 
5291 	if (wa_ctx->per_ctx.size) {
5292 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5293 
5294 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5295 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5296 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5297 	}
5298 
5299 	if (wa_ctx->indirect_ctx.size) {
5300 		lrc_ring_setup_indirect_ctx(regs, engine,
5301 					    i915_ggtt_offset(wa_ctx->vma) +
5302 					    wa_ctx->indirect_ctx.offset,
5303 					    wa_ctx->indirect_ctx.size);
5304 	}
5305 }
5306 
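/*
 * Point the context image at its page tables: a 4-level PPGTT only
 * needs the PML4 address in PDP0, whereas a 3-level PPGTT programs all
 * four PDP entries.
 */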
5307 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5308 {
5309 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5310 		/* 64b PPGTT (48bit canonical)
5311 		 * PDP0_DESCRIPTOR contains the base address of the PML4 and
5312 		 * the other PDP descriptors are ignored.
5313 		 */
5314 		ASSIGN_CTX_PML4(ppgtt, regs);
5315 	} else {
5316 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5317 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5318 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5319 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5320 	}
5321 }
5322 
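/*
 * A full-ppgtt context carries its own vm; with an aliasing-ppgtt setup
 * the context uses the GGTT, so return the PPGTT aliased behind it.
 */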
5323 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5324 {
5325 	if (i915_is_ggtt(vm))
5326 		return i915_vm_to_ggtt(vm)->alias;
5327 	else
5328 		return i915_vm_to_ppgtt(vm);
5329 }
5330 
5331 static void execlists_init_reg_state(u32 *regs,
5332 				     const struct intel_context *ce,
5333 				     const struct intel_engine_cs *engine,
5334 				     const struct intel_ring *ring,
5335 				     bool inhibit)
5336 {
5337 	/*
5338 	 * A context is actually a big batch buffer with several
5339 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5340 	 * values we are setting here are only for the first context restore:
5341 	 * on a subsequent save, the GPU will recreate this batch buffer with new
5342 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5343 	 * we are not initializing here).
5344 	 *
5345 	 * Must keep consistent with virtual_update_register_offsets().
5346 	 */
5347 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5348 
5349 	init_common_reg_state(regs, engine, ring, inhibit);
5350 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5351 
5352 	init_wa_bb_reg_state(regs, engine);
5353 
5354 	__reset_stop_ring(regs, engine);
5355 }
5356 
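/*
 * Fill in the freshly allocated context image: copy the engine's golden
 * default state when available, then clear the ppHWSP and (re)write the
 * register state page that the HW reads on the first restore.
 */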
5357 static int
5358 populate_lr_context(struct intel_context *ce,
5359 		    struct drm_i915_gem_object *ctx_obj,
5360 		    struct intel_engine_cs *engine,
5361 		    struct intel_ring *ring)
5362 {
5363 	bool inhibit = true;
5364 	void *vaddr;
5365 
5366 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5367 	if (IS_ERR(vaddr)) {
5368 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5369 		return PTR_ERR(vaddr);
5370 	}
5371 
5372 	set_redzone(vaddr, engine);
5373 
5374 	if (engine->default_state) {
5375 		shmem_read(engine->default_state, 0,
5376 			   vaddr, engine->context_size);
5377 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5378 		inhibit = false;
5379 	}
5380 
5381 	/* Clear the ppHWSP (inc. per-context counters) */
5382 	memset(vaddr, 0, PAGE_SIZE);
5383 
5384 	/*
5385 	 * The second page of the context object contains some registers which
5386 	 * must be set up prior to the first execution.
5387 	 */
5388 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5389 				 ce, engine, ring, inhibit);
5390 
5391 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5392 	i915_gem_object_unpin_map(ctx_obj);
5393 	return 0;
5394 }
5395 
5396 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5397 {
5398 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5399 
5400 	return intel_timeline_create_from_engine(ce->engine,
5401 						 page_unmask_bits(tl));
5402 }
5403 
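/*
 * Allocate the backing storage for a logical ring context: the state
 * object (plus a debug redzone page and, on Gen12, a per-context wa_bb
 * page), a timeline for its breadcrumbs and the ringbuffer itself.
 */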
5404 static int __execlists_context_alloc(struct intel_context *ce,
5405 				     struct intel_engine_cs *engine)
5406 {
5407 	struct drm_i915_gem_object *ctx_obj;
5408 	struct intel_ring *ring;
5409 	struct i915_vma *vma;
5410 	u32 context_size;
5411 	int ret;
5412 
5413 	GEM_BUG_ON(ce->state);
5414 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5415 
5416 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5417 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5418 
5419 	if (INTEL_GEN(engine->i915) == 12) {
5420 		ce->wa_bb_page = context_size / PAGE_SIZE;
5421 		context_size += PAGE_SIZE;
5422 	}
5423 
5424 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5425 	if (IS_ERR(ctx_obj))
5426 		return PTR_ERR(ctx_obj);
5427 
5428 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5429 	if (IS_ERR(vma)) {
5430 		ret = PTR_ERR(vma);
5431 		goto error_deref_obj;
5432 	}
5433 
5434 	if (!page_mask_bits(ce->timeline)) {
5435 		struct intel_timeline *tl;
5436 
5437 		/*
5438 		 * Use the static global HWSP for the kernel context, and
5439 		 * a dynamically allocated cacheline for everyone else.
5440 		 */
5441 		if (unlikely(ce->timeline))
5442 			tl = pinned_timeline(ce);
5443 		else
5444 			tl = intel_timeline_create(engine->gt);
5445 		if (IS_ERR(tl)) {
5446 			ret = PTR_ERR(tl);
5447 			goto error_deref_obj;
5448 		}
5449 
5450 		ce->timeline = tl;
5451 	}
5452 
5453 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5454 	if (IS_ERR(ring)) {
5455 		ret = PTR_ERR(ring);
5456 		goto error_deref_obj;
5457 	}
5458 
5459 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5460 	if (ret) {
5461 		drm_dbg(&engine->i915->drm,
5462 			"Failed to populate LRC: %d\n", ret);
5463 		goto error_ring_free;
5464 	}
5465 
5466 	ce->ring = ring;
5467 	ce->state = vma;
5468 
5469 	return 0;
5470 
5471 error_ring_free:
5472 	intel_ring_put(ring);
5473 error_deref_obj:
5474 	i915_gem_object_put(ctx_obj);
5475 	return ret;
5476 }
5477 
5478 static struct list_head *virtual_queue(struct virtual_engine *ve)
5479 {
5480 	return &ve->base.execlists.default_priolist.requests[0];
5481 }
5482 
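/*
 * Tear down a virtual engine once the last reference is dropped,
 * detaching its node from each sibling's tree of virtual requests
 * under that sibling's lock.
 */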
5483 static void virtual_context_destroy(struct kref *kref)
5484 {
5485 	struct virtual_engine *ve =
5486 		container_of(kref, typeof(*ve), context.ref);
5487 	unsigned int n;
5488 
5489 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5490 	GEM_BUG_ON(ve->request);
5491 	GEM_BUG_ON(ve->context.inflight);
5492 
5493 	for (n = 0; n < ve->num_siblings; n++) {
5494 		struct intel_engine_cs *sibling = ve->siblings[n];
5495 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5496 		unsigned long flags;
5497 
5498 		if (RB_EMPTY_NODE(node))
5499 			continue;
5500 
5501 		spin_lock_irqsave(&sibling->active.lock, flags);
5502 
5503 		/* Detachment is lazily performed in the execlists tasklet */
5504 		if (!RB_EMPTY_NODE(node))
5505 			rb_erase_cached(node, &sibling->execlists.virtual);
5506 
5507 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5508 	}
5509 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5510 
5511 	if (ve->context.state)
5512 		__execlists_context_fini(&ve->context);
5513 	intel_context_fini(&ve->context);
5514 
5515 	intel_breadcrumbs_free(ve->base.breadcrumbs);
5516 	intel_engine_free_request_pool(&ve->base);
5517 
5518 	kfree(ve->bonds);
5519 	kfree(ve);
5520 }
5521 
5522 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5523 {
5524 	int swp;
5525 
5526 	/*
5527 	 * Pick a random sibling on starting to help spread the load around.
5528 	 *
5529 	 * New contexts are typically created with exactly the same order
5530 	 * of siblings, and often started in batches. Due to the way we iterate
5531 	 * the array of siblings when submitting requests, siblings[0] is
5532 	 * prioritised for dequeuing. If we make sure that siblings[0] is fairly
5533 	 * randomised across the system, we also help spread the load because
5534 	 * the first engine we inspect is different each time.
5535 	 *
5536 	 * NB This does not force us to execute on this engine, it will just
5537 	 * typically be the first we inspect for submission.
5538 	 */
5539 	swp = prandom_u32_max(ve->num_siblings);
5540 	if (swp)
5541 		swap(ve->siblings[swp], ve->siblings[0]);
5542 }
5543 
5544 static int virtual_context_alloc(struct intel_context *ce)
5545 {
5546 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5547 
5548 	return __execlists_context_alloc(ce, ve->siblings[0]);
5549 }
5550 
5551 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5552 {
5553 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5554 
5555 	/* Note: we must use a real engine class for setting up reg state */
5556 	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5557 }
5558 
5559 static void virtual_context_enter(struct intel_context *ce)
5560 {
5561 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5562 	unsigned int n;
5563 
5564 	for (n = 0; n < ve->num_siblings; n++)
5565 		intel_engine_pm_get(ve->siblings[n]);
5566 
5567 	intel_timeline_enter(ce->timeline);
5568 }
5569 
5570 static void virtual_context_exit(struct intel_context *ce)
5571 {
5572 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5573 	unsigned int n;
5574 
5575 	intel_timeline_exit(ce->timeline);
5576 
5577 	for (n = 0; n < ve->num_siblings; n++)
5578 		intel_engine_pm_put(ve->siblings[n]);
5579 }
5580 
5581 static const struct intel_context_ops virtual_context_ops = {
5582 	.alloc = virtual_context_alloc,
5583 
5584 	.pre_pin = execlists_context_pre_pin,
5585 	.pin = virtual_context_pin,
5586 	.unpin = execlists_context_unpin,
5587 	.post_unpin = execlists_context_post_unpin,
5588 
5589 	.enter = virtual_context_enter,
5590 	.exit = virtual_context_exit,
5591 
5592 	.destroy = virtual_context_destroy,
5593 };
5594 
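/*
 * Report which physical engines the pending virtual request may run on.
 * An empty execution mask is an invalid selection; mark the request as
 * failed and fall back to the first sibling so it can still be retired.
 */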
5595 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5596 {
5597 	struct i915_request *rq;
5598 	intel_engine_mask_t mask;
5599 
5600 	rq = READ_ONCE(ve->request);
5601 	if (!rq)
5602 		return 0;
5603 
5604 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5605 	mask = rq->execution_mask;
5606 	if (unlikely(!mask)) {
5607 		/* Invalid selection, submit to a random engine in error */
5608 		i915_request_set_error_once(rq, -ENODEV);
5609 		mask = ve->siblings[0]->mask;
5610 	}
5611 
5612 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5613 		     rq->fence.context, rq->fence.seqno,
5614 		     mask, ve->base.execlists.queue_priority_hint);
5615 
5616 	return mask;
5617 }
5618 
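/*
 * Distribute the pending virtual request: insert this virtual engine
 * into each eligible sibling's priority-sorted tree and kick the
 * sibling's tasklet if we would now be its highest-priority work.
 */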
5619 static void virtual_submission_tasklet(unsigned long data)
5620 {
5621 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5622 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5623 	intel_engine_mask_t mask;
5624 	unsigned int n;
5625 
5626 	rcu_read_lock();
5627 	mask = virtual_submission_mask(ve);
5628 	rcu_read_unlock();
5629 	if (unlikely(!mask))
5630 		return;
5631 
5632 	local_irq_disable();
5633 	for (n = 0; n < ve->num_siblings; n++) {
5634 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5635 		struct ve_node * const node = &ve->nodes[sibling->id];
5636 		struct rb_node **parent, *rb;
5637 		bool first;
5638 
5639 		if (!READ_ONCE(ve->request))
5640 			break; /* already handled by a sibling's tasklet */
5641 
5642 		if (unlikely(!(mask & sibling->mask))) {
5643 			if (!RB_EMPTY_NODE(&node->rb)) {
5644 				spin_lock(&sibling->active.lock);
5645 				rb_erase_cached(&node->rb,
5646 						&sibling->execlists.virtual);
5647 				RB_CLEAR_NODE(&node->rb);
5648 				spin_unlock(&sibling->active.lock);
5649 			}
5650 			continue;
5651 		}
5652 
5653 		spin_lock(&sibling->active.lock);
5654 
5655 		if (!RB_EMPTY_NODE(&node->rb)) {
5656 			/*
5657 			 * Cheat and avoid rebalancing the tree if we can
5658 			 * reuse this node in situ.
5659 			 */
5660 			first = rb_first_cached(&sibling->execlists.virtual) ==
5661 				&node->rb;
5662 			if (prio == node->prio || (prio > node->prio && first))
5663 				goto submit_engine;
5664 
5665 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5666 		}
5667 
5668 		rb = NULL;
5669 		first = true;
5670 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5671 		while (*parent) {
5672 			struct ve_node *other;
5673 
5674 			rb = *parent;
5675 			other = rb_entry(rb, typeof(*other), rb);
5676 			if (prio > other->prio) {
5677 				parent = &rb->rb_left;
5678 			} else {
5679 				parent = &rb->rb_right;
5680 				first = false;
5681 			}
5682 		}
5683 
5684 		rb_link_node(&node->rb, rb, parent);
5685 		rb_insert_color_cached(&node->rb,
5686 				       &sibling->execlists.virtual,
5687 				       first);
5688 
5689 submit_engine:
5690 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5691 		node->prio = prio;
5692 		if (first && prio > sibling->execlists.queue_priority_hint)
5693 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5694 
5695 		spin_unlock(&sibling->active.lock);
5696 	}
5697 	local_irq_enable();
5698 }
5699 
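/*
 * A virtual engine holds at most one request in flight towards the
 * siblings; completed leftovers from preempt-to-busy are flushed here
 * before the new request is queued and the tasklet scheduled.
 */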
5700 static void virtual_submit_request(struct i915_request *rq)
5701 {
5702 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5703 	struct i915_request *old;
5704 	unsigned long flags;
5705 
5706 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5707 		     rq->fence.context,
5708 		     rq->fence.seqno);
5709 
5710 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5711 
5712 	spin_lock_irqsave(&ve->base.active.lock, flags);
5713 
5714 	old = ve->request;
5715 	if (old) { /* background completion event from preempt-to-busy */
5716 		GEM_BUG_ON(!i915_request_completed(old));
5717 		__i915_request_submit(old);
5718 		i915_request_put(old);
5719 	}
5720 
5721 	if (i915_request_completed(rq)) {
5722 		__i915_request_submit(rq);
5723 
5724 		ve->base.execlists.queue_priority_hint = INT_MIN;
5725 		ve->request = NULL;
5726 	} else {
5727 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5728 		ve->request = i915_request_get(rq);
5729 
5730 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5731 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5732 
5733 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5734 	}
5735 
5736 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5737 }
5738 
5739 static struct ve_bond *
5740 virtual_find_bond(struct virtual_engine *ve,
5741 		  const struct intel_engine_cs *master)
5742 {
5743 	int i;
5744 
5745 	for (i = 0; i < ve->num_bonds; i++) {
5746 		if (ve->bonds[i].master == master)
5747 			return &ve->bonds[i];
5748 	}
5749 
5750 	return NULL;
5751 }
5752 
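/*
 * Bonding hook: when the paired (master) request is ready to execute,
 * restrict the bonded request to the bond's siblings, excluding the
 * master's own engine, and in turn keep the master off those engines.
 */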
5753 static void
5754 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5755 {
5756 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5757 	intel_engine_mask_t allowed, exec;
5758 	struct ve_bond *bond;
5759 
5760 	allowed = ~to_request(signal)->engine->mask;
5761 
5762 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5763 	if (bond)
5764 		allowed &= bond->sibling_mask;
5765 
5766 	/* Restrict the bonded request to run on only the available engines */
5767 	exec = READ_ONCE(rq->execution_mask);
5768 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5769 		;
5770 
5771 	/* Prevent the master from being re-run on the bonded engines */
5772 	to_request(signal)->execution_mask &= ~allowed;
5773 }
5774 
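/*
 * Create a context on a virtual engine that load-balances across the
 * given physical siblings. A single sibling degenerates to an ordinary
 * context on that engine; all siblings must share an engine class.
 */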
5775 struct intel_context *
5776 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5777 			       unsigned int count)
5778 {
5779 	struct virtual_engine *ve;
5780 	unsigned int n;
5781 	int err;
5782 
5783 	if (count == 0)
5784 		return ERR_PTR(-EINVAL);
5785 
5786 	if (count == 1)
5787 		return intel_context_create(siblings[0]);
5788 
5789 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5790 	if (!ve)
5791 		return ERR_PTR(-ENOMEM);
5792 
5793 	ve->base.i915 = siblings[0]->i915;
5794 	ve->base.gt = siblings[0]->gt;
5795 	ve->base.uncore = siblings[0]->uncore;
5796 	ve->base.id = -1;
5797 
5798 	ve->base.class = OTHER_CLASS;
5799 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5800 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5801 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5802 
5803 	/*
5804 	 * The decision on whether to submit a request using semaphores
5805 	 * depends on the saturated state of the engine. We only compute
5806 	 * this during HW submission of the request, and we need this
5807 	 * state to be globally applied to all requests being submitted
5808 	 * to this engine. Virtual engines encompass more than one physical
5809 	 * engine and so we cannot accurately tell in advance if one of those
5810 	 * engines is already saturated and so cannot afford to use a semaphore
5811 	 * and be pessimized in priority for doing so -- if we are the only
5812 	 * context using semaphores after all other clients have stopped, we
5813 	 * will be starved on the saturated system. Such a global switch for
5814 	 * semaphores is less than ideal, but alas is the current compromise.
5815 	 */
5816 	ve->base.saturated = ALL_ENGINES;
5817 
5818 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5819 
5820 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5821 	intel_engine_init_execlists(&ve->base);
5822 
5823 	ve->base.cops = &virtual_context_ops;
5824 	ve->base.request_alloc = execlists_request_alloc;
5825 
5826 	ve->base.schedule = i915_schedule;
5827 	ve->base.submit_request = virtual_submit_request;
5828 	ve->base.bond_execute = virtual_bond_execute;
5829 
5830 	INIT_LIST_HEAD(virtual_queue(ve));
5831 	ve->base.execlists.queue_priority_hint = INT_MIN;
5832 	tasklet_init(&ve->base.execlists.tasklet,
5833 		     virtual_submission_tasklet,
5834 		     (unsigned long)ve);
5835 
5836 	intel_context_init(&ve->context, &ve->base);
5837 
5838 	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5839 	if (!ve->base.breadcrumbs) {
5840 		err = -ENOMEM;
5841 		goto err_put;
5842 	}
5843 
5844 	for (n = 0; n < count; n++) {
5845 		struct intel_engine_cs *sibling = siblings[n];
5846 
5847 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5848 		if (sibling->mask & ve->base.mask) {
5849 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5850 				  sibling->name);
5851 			err = -EINVAL;
5852 			goto err_put;
5853 		}
5854 
5855 		/*
5856 		 * The virtual engine implementation is tightly coupled to
5857 		 * the execlists backend -- we push requests directly
5858 		 * into a tree inside each physical engine. We could support
5859 		 * layering if we handle cloning of the requests and
5860 		 * submitting a copy into each backend.
5861 		 */
5862 		if (sibling->execlists.tasklet.func !=
5863 		    execlists_submission_tasklet) {
5864 			err = -ENODEV;
5865 			goto err_put;
5866 		}
5867 
5868 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5869 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5870 
5871 		ve->siblings[ve->num_siblings++] = sibling;
5872 		ve->base.mask |= sibling->mask;
5873 
5874 		/*
5875 		 * All physical engines must be compatible for their emission
5876 		 * functions (as we build the instructions during request
5877 		 * construction and do not alter them before submission
5878 		 * on the physical engine). We use the engine class as a guide
5879 		 * here, although that could be refined.
5880 		 */
5881 		if (ve->base.class != OTHER_CLASS) {
5882 			if (ve->base.class != sibling->class) {
5883 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5884 					  sibling->class, ve->base.class);
5885 				err = -EINVAL;
5886 				goto err_put;
5887 			}
5888 			continue;
5889 		}
5890 
5891 		ve->base.class = sibling->class;
5892 		ve->base.uabi_class = sibling->uabi_class;
5893 		snprintf(ve->base.name, sizeof(ve->base.name),
5894 			 "v%dx%d", ve->base.class, count);
5895 		ve->base.context_size = sibling->context_size;
5896 
5897 		ve->base.emit_bb_start = sibling->emit_bb_start;
5898 		ve->base.emit_flush = sibling->emit_flush;
5899 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5900 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5901 		ve->base.emit_fini_breadcrumb_dw =
5902 			sibling->emit_fini_breadcrumb_dw;
5903 
5904 		ve->base.flags = sibling->flags;
5905 	}
5906 
5907 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5908 
5909 	virtual_engine_initial_hint(ve);
5910 	return &ve->context;
5911 
5912 err_put:
5913 	intel_context_put(&ve->context);
5914 	return ERR_PTR(err);
5915 }
5916 
5917 struct intel_context *
5918 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5919 {
5920 	struct virtual_engine *se = to_virtual_engine(src);
5921 	struct intel_context *dst;
5922 
5923 	dst = intel_execlists_create_virtual(se->siblings,
5924 					     se->num_siblings);
5925 	if (IS_ERR(dst))
5926 		return dst;
5927 
5928 	if (se->num_bonds) {
5929 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5930 
5931 		de->bonds = kmemdup(se->bonds,
5932 				    sizeof(*se->bonds) * se->num_bonds,
5933 				    GFP_KERNEL);
5934 		if (!de->bonds) {
5935 			intel_context_put(dst);
5936 			return ERR_PTR(-ENOMEM);
5937 		}
5938 
5939 		de->num_bonds = se->num_bonds;
5940 	}
5941 
5942 	return dst;
5943 }
5944 
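/*
 * Record a bond: requests on this virtual engine that are bonded to
 * @master may subsequently only execute on the listed @sibling(s).
 */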
5945 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5946 				     const struct intel_engine_cs *master,
5947 				     const struct intel_engine_cs *sibling)
5948 {
5949 	struct virtual_engine *ve = to_virtual_engine(engine);
5950 	struct ve_bond *bond;
5951 	int n;
5952 
5953 	/* Sanity check the sibling is part of the virtual engine */
5954 	for (n = 0; n < ve->num_siblings; n++)
5955 		if (sibling == ve->siblings[n])
5956 			break;
5957 	if (n == ve->num_siblings)
5958 		return -EINVAL;
5959 
5960 	bond = virtual_find_bond(ve, master);
5961 	if (bond) {
5962 		bond->sibling_mask |= sibling->mask;
5963 		return 0;
5964 	}
5965 
5966 	bond = krealloc(ve->bonds,
5967 			sizeof(*bond) * (ve->num_bonds + 1),
5968 			GFP_KERNEL);
5969 	if (!bond)
5970 		return -ENOMEM;
5971 
5972 	bond[ve->num_bonds].master = master;
5973 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5974 
5975 	ve->bonds = bond;
5976 	ve->num_bonds++;
5977 
5978 	return 0;
5979 }
5980 
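/*
 * Pretty-print up to @max requests from each of the engine's executing
 * (E), queued (Q) and virtual (V) lists for the engine dump.
 */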
5981 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5982 				   struct drm_printer *m,
5983 				   void (*show_request)(struct drm_printer *m,
5984 							struct i915_request *rq,
5985 							const char *prefix),
5986 				   unsigned int max)
5987 {
5988 	const struct intel_engine_execlists *execlists = &engine->execlists;
5989 	struct i915_request *rq, *last;
5990 	unsigned long flags;
5991 	unsigned int count;
5992 	struct rb_node *rb;
5993 
5994 	spin_lock_irqsave(&engine->active.lock, flags);
5995 
5996 	last = NULL;
5997 	count = 0;
5998 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5999 		if (count++ < max - 1)
6000 			show_request(m, rq, "\t\tE ");
6001 		else
6002 			last = rq;
6003 	}
6004 	if (last) {
6005 		if (count > max) {
6006 			drm_printf(m,
6007 				   "\t\t...skipping %d executing requests...\n",
6008 				   count - max);
6009 		}
6010 		show_request(m, last, "\t\tE ");
6011 	}
6012 
6013 	if (execlists->switch_priority_hint != INT_MIN)
6014 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
6015 			   READ_ONCE(execlists->switch_priority_hint));
6016 	if (execlists->queue_priority_hint != INT_MIN)
6017 		drm_printf(m, "\t\tQueue priority hint: %d\n",
6018 			   READ_ONCE(execlists->queue_priority_hint));
6019 
6020 	last = NULL;
6021 	count = 0;
6022 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6023 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6024 		int i;
6025 
6026 		priolist_for_each_request(rq, p, i) {
6027 			if (count++ < max - 1)
6028 				show_request(m, rq, "\t\tQ ");
6029 			else
6030 				last = rq;
6031 		}
6032 	}
6033 	if (last) {
6034 		if (count > max) {
6035 			drm_printf(m,
6036 				   "\t\t...skipping %d queued requests...\n",
6037 				   count - max);
6038 		}
6039 		show_request(m, last, "\t\tQ ");
6040 	}
6041 
6042 	last = NULL;
6043 	count = 0;
6044 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6045 		struct virtual_engine *ve =
6046 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6047 		struct i915_request *rq = READ_ONCE(ve->request);
6048 
6049 		if (rq) {
6050 			if (count++ < max - 1)
6051 				show_request(m, rq, "\t\tV ");
6052 			else
6053 				last = rq;
6054 		}
6055 	}
6056 	if (last) {
6057 		if (count > max) {
6058 			drm_printf(m,
6059 				   "\t\t...skipping %d virtual requests...\n",
6060 				   count - max);
6061 		}
6062 		show_request(m, last, "\t\tV ");
6063 	}
6064 
6065 	spin_unlock_irqrestore(&engine->active.lock, flags);
6066 }
6067 
6068 void intel_lr_context_reset(struct intel_engine_cs *engine,
6069 			    struct intel_context *ce,
6070 			    u32 head,
6071 			    bool scrub)
6072 {
6073 	GEM_BUG_ON(!intel_context_is_pinned(ce));
6074 
6075 	/*
6076 	 * We want a simple context + ring to execute the breadcrumb update.
6077 	 * We cannot rely on the context being intact across a GPU hang,
6078 	 * so clear it and rebuild just what we need for the breadcrumb.
6079 	 * All pending requests for this context will be zapped, and any
6080 	 * future request will be after userspace has had the opportunity
6081 	 * to recreate its own state.
6082 	 */
6083 	if (scrub)
6084 		restore_default_state(ce, engine);
6085 
6086 	/* Rerun the request; its payload has been neutered (if guilty). */
6087 	__execlists_update_reg_state(ce, engine, head);
6088 }
6089 
6090 bool
6091 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6092 {
6093 	return engine->set_default_submission ==
6094 	       intel_execlists_set_default_submission;
6095 }
6096 
6097 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6098 #include "selftest_lrc.c"
6099 #endif
6100