xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 59c94b9d)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
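 * A rough sketch of the pairing rule described above (illustration only,
 * not driver code)::
 *
 *	elsp[0] = pop(queue);
 *	while (peek(queue) && peek(queue)->context == elsp[0]->context)
 *		elsp[0] = pop(queue); /* the later tail covers the earlier work */
 *	elsp[1] = pop(queue); /* NULL if the queue is now empty */
 *
 * so that the same context never occupies both ports of a single submission.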
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 
181 struct virtual_engine {
182 	struct intel_engine_cs base;
183 	struct intel_context context;
184 
185 	/*
186 	 * We allow only a single request through the virtual engine at a time
187 	 * (each request in the timeline waits for the completion fence of
188 	 * the previous before being submitted). By restricting ourselves to
189 	 * only submitting a single request, each request is placed on to a
190 	 * physical to maximise load spreading (by virtue of the late greedy
191 	 * scheduling -- each real engine takes the next available request
192 	 * upon idling).
193 	 */
194 	struct i915_request *request;
195 
196 	/*
197 	 * We keep an rbtree of available virtual engines inside each physical
198 	 * engine, sorted by priority. Here we preallocate the nodes we need
199 	 * for the virtual engine, indexed by physical_engine->id.
200 	 */
201 	struct ve_node {
202 		struct rb_node rb;
203 		int prio;
204 	} nodes[I915_NUM_ENGINES];
205 
206 	/*
207 	 * Keep track of bonded pairs -- restrictions upon our selection
208 	 * of physical engines any particular request may be submitted to.
209 	 * If we receive a submit-fence from a master engine, we will only
210 	 * use one of sibling_mask physical engines.
211 	 */
212 	struct ve_bond {
213 		const struct intel_engine_cs *master;
214 		intel_engine_mask_t sibling_mask;
215 	} *bonds;
216 	unsigned int num_bonds;
217 
218 	/* And finally, which physical engines this virtual engine maps onto. */
219 	unsigned int num_siblings;
220 	struct intel_engine_cs *siblings[];
221 };
222 
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 	return container_of(engine, struct virtual_engine, base);
227 }
228 
229 static int __execlists_context_alloc(struct intel_context *ce,
230 				     struct intel_engine_cs *engine);
231 
232 static void execlists_init_reg_state(u32 *reg_state,
233 				     const struct intel_context *ce,
234 				     const struct intel_engine_cs *engine,
235 				     const struct intel_ring *ring,
236 				     bool close);
237 static void
238 __execlists_update_reg_state(const struct intel_context *ce,
239 			     const struct intel_engine_cs *engine,
240 			     u32 head);
241 
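/*
 * The lrc_ring_*() helpers below return the dword index of a register within
 * the logical ring context image for this engine/gen, or -1 if the register
 * is not part of that layout. Following the MI_LOAD_REGISTER_IMM pairing,
 * the MMIO offset lives at regs[x] and its value at regs[x + 1], which is
 * why callers such as execlists_check_context() and
 * lrc_ring_setup_indirect_ctx() poke at [x + 1].
 */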
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
243 {
244 	if (INTEL_GEN(engine->i915) >= 12)
245 		return 0x60;
246 	else if (INTEL_GEN(engine->i915) >= 9)
247 		return 0x54;
248 	else if (engine->class == RENDER_CLASS)
249 		return 0x58;
250 	else
251 		return -1;
252 }
253 
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
255 {
256 	if (INTEL_GEN(engine->i915) >= 12)
257 		return 0x74;
258 	else if (INTEL_GEN(engine->i915) >= 9)
259 		return 0x68;
260 	else if (engine->class == RENDER_CLASS)
261 		return 0xd8;
262 	else
263 		return -1;
264 }
265 
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
267 {
268 	if (INTEL_GEN(engine->i915) >= 12)
269 		return 0x12;
270 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 		return 0x18;
272 	else
273 		return -1;
274 }
275 
276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
277 {
278 	int x;
279 
280 	x = lrc_ring_wa_bb_per_ctx(engine);
281 	if (x < 0)
282 		return x;
283 
284 	return x + 2;
285 }
286 
287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
288 {
289 	int x;
290 
291 	x = lrc_ring_indirect_ptr(engine);
292 	if (x < 0)
293 		return x;
294 
295 	return x + 2;
296 }
297 
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
299 {
300 	if (engine->class != RENDER_CLASS)
301 		return -1;
302 
303 	if (INTEL_GEN(engine->i915) >= 12)
304 		return 0xb6;
305 	else if (INTEL_GEN(engine->i915) >= 11)
306 		return 0xaa;
307 	else
308 		return -1;
309 }
310 
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
313 {
314 	switch (INTEL_GEN(engine->i915)) {
315 	default:
316 		MISSING_CASE(INTEL_GEN(engine->i915));
317 		fallthrough;
318 	case 12:
319 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 	case 11:
321 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 	case 10:
323 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 	case 9:
325 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 	case 8:
327 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 	}
329 }
330 
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 			    const struct intel_engine_cs *engine,
334 			    u32 ctx_bb_ggtt_addr,
335 			    u32 size)
336 {
337 	GEM_BUG_ON(!size);
338 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 	regs[lrc_ring_indirect_ptr(engine) + 1] =
341 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
342 
343 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 	regs[lrc_ring_indirect_offset(engine) + 1] =
345 		lrc_ring_indirect_offset_default(engine) << 6;
346 }
347 
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
349 {
350 	/*
351 	 * We can use either ppHWSP[16] which is recorded before the context
352 	 * switch (and so excludes the cost of context switches) or use the
353 	 * value from the context image itself, which is saved/restored earlier
354 	 * and so includes the cost of the save.
355 	 */
356 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
357 }
358 
359 static void mark_eio(struct i915_request *rq)
360 {
361 	if (i915_request_completed(rq))
362 		return;
363 
364 	GEM_BUG_ON(i915_request_signaled(rq));
365 
366 	i915_request_set_error_once(rq, -EIO);
367 	i915_request_mark_complete(rq);
368 }
369 
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
372 {
373 	struct i915_request *active = rq;
374 
375 	rcu_read_lock();
376 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 		if (i915_request_completed(rq))
378 			break;
379 
380 		active = rq;
381 	}
382 	rcu_read_unlock();
383 
384 	return active;
385 }
386 
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
388 {
389 	return (i915_ggtt_offset(engine->status_page.vma) +
390 		I915_GEM_HWS_PREEMPT_ADDR);
391 }
392 
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
395 {
396 	/*
397 	 * We inspect HWS_PREEMPT with a semaphore inside
398 	 * engine->emit_fini_breadcrumb. If the dword is true,
399 	 * the ring is paused as the semaphore will busywait
400 	 * until the dword is false.
401 	 */
402 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 	if (state)
404 		wmb();
405 }
406 
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
408 {
409 	return rb_entry(rb, struct i915_priolist, node);
410 }
411 
412 static inline int rq_prio(const struct i915_request *rq)
413 {
414 	return READ_ONCE(rq->sched.attr.priority);
415 }
416 
417 static int effective_prio(const struct i915_request *rq)
418 {
419 	int prio = rq_prio(rq);
420 
421 	/*
422 	 * If this request is special and must not be interrupted at any
423 	 * cost, so be it. Note we are only checking the most recent request
424 	 * in the context and so may be masking an earlier vip request. It
425 	 * is hoped that under the conditions where nopreempt is used, this
426 	 * will not matter (i.e. all requests to that context will be
427 	 * nopreempt for as long as desired).
428 	 */
429 	if (i915_request_has_nopreempt(rq))
430 		prio = I915_PRIORITY_UNPREEMPTABLE;
431 
432 	return prio;
433 }
434 
435 static int queue_prio(const struct intel_engine_execlists *execlists)
436 {
437 	struct i915_priolist *p;
438 	struct rb_node *rb;
439 
440 	rb = rb_first_cached(&execlists->queue);
441 	if (!rb)
442 		return INT_MIN;
443 
444 	/*
445 	 * As the priolist[] sub-levels are inverted, with the highest priority in [0],
446 	 * we have to flip the index value to recover the priority.
447 	 */
448 	p = to_priolist(rb);
449 	if (!I915_USER_PRIORITY_SHIFT)
450 		return p->priority;
451 
452 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
453 }
454 
455 static inline bool need_preempt(const struct intel_engine_cs *engine,
456 				const struct i915_request *rq,
457 				struct rb_node *rb)
458 {
459 	int last_prio;
460 
461 	if (!intel_engine_has_semaphores(engine))
462 		return false;
463 
464 	/*
465 	 * Check if the current priority hint merits a preemption attempt.
466 	 *
467 	 * We record the highest value priority we saw during rescheduling
468 	 * prior to this dequeue, therefore we know that if it is strictly
469 	 * less than the current tail of ELSP[0], we do not need to force
470 	 * a preempt-to-idle cycle.
471 	 *
472 	 * However, the priority hint is a mere hint that we may need to
473 	 * preempt. If that hint is stale or we may be trying to preempt
474 	 * ourselves, ignore the request.
475 	 *
476 	 * More naturally we would write
477 	 *      prio >= max(0, last);
478 	 * except that we wish to prevent triggering preemption at the same
479 	 * priority level: the task that is running should remain running
480 	 * to preserve FIFO ordering of dependencies.
481 	 */
482 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
483 	if (engine->execlists.queue_priority_hint <= last_prio)
484 		return false;
485 
486 	/*
487 	 * Check against the first request in ELSP[1]; it will, thanks to the
488 	 * power of PI, be the highest priority of that context.
489 	 */
490 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
491 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
492 		return true;
493 
494 	if (rb) {
495 		struct virtual_engine *ve =
496 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
497 		bool preempt = false;
498 
499 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
500 			struct i915_request *next;
501 
502 			rcu_read_lock();
503 			next = READ_ONCE(ve->request);
504 			if (next)
505 				preempt = rq_prio(next) > last_prio;
506 			rcu_read_unlock();
507 		}
508 
509 		if (preempt)
510 			return preempt;
511 	}
512 
513 	/*
514 	 * If the inflight context did not trigger the preemption, then maybe
515 	 * it was the set of queued requests? Pick the highest priority in
516 	 * the queue (the first active priolist) and see if it deserves to be
517 	 * running instead of ELSP[0].
518 	 *
519 	 * The highest priority request in the queue can not be either
520 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
521 	 * context, its priority would not exceed ELSP[0] aka last_prio.
522 	 */
523 	return queue_prio(&engine->execlists) > last_prio;
524 }
525 
526 __maybe_unused static inline bool
527 assert_priority_queue(const struct i915_request *prev,
528 		      const struct i915_request *next)
529 {
530 	/*
531 	 * Without preemption, the prev may refer to the still active element
532 	 * which we refuse to let go.
533 	 *
534 	 * Even with preemption, there are times when we think it is better not
535 	 * to preempt and leave an ostensibly lower priority request in flight.
536 	 */
537 	if (i915_request_is_active(prev))
538 		return true;
539 
540 	return rq_prio(prev) >= rq_prio(next);
541 }
542 
543 /*
544  * The context descriptor encodes various attributes of a context,
545  * including its GTT address and some flags. Because it's fairly
546  * expensive to calculate, we'll just do it once and cache the result,
547  * which remains valid until the context is unpinned.
548  *
549  * This is what a descriptor looks like, from LSB to MSB::
550  *
551  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
552  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
553  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
554  *      bits 53-54:    mbz, reserved for use by hardware
555  *      bits 55-63:    group ID, currently unused and set to 0
556  *
557  * Starting from Gen11, the upper dword of the descriptor has a new format:
558  *
559  *      bits 32-36:    reserved
560  *      bits 37-47:    SW context ID
561  *      bits 48-53:    engine instance
562  *      bit 54:        mbz, reserved for use by hardware
563  *      bits 55-60:    SW counter
564  *      bits 61-63:    engine class
565  *
566  * engine info, SW context ID and SW counter need to form a unique number
567  * (Context ID) per lrc.
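 *
 * Note that lrc_descriptor() below computes only the lower dword (addressing
 * mode, validity flags and the LRCA); the per-submission context ID is
 * tracked separately in ce->lrc.ccid and assigned at schedule-in time.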
568  */
569 static u32
570 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
571 {
572 	u32 desc;
573 
574 	desc = INTEL_LEGACY_32B_CONTEXT;
575 	if (i915_vm_is_4lvl(ce->vm))
576 		desc = INTEL_LEGACY_64B_CONTEXT;
577 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
578 
579 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
580 	if (IS_GEN(engine->i915, 8))
581 		desc |= GEN8_CTX_L3LLC_COHERENT;
582 
583 	return i915_ggtt_offset(ce->state) | desc;
584 }
585 
586 static inline unsigned int dword_in_page(void *addr)
587 {
588 	return offset_in_page(addr) / sizeof(u32);
589 }
590 
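/*
 * set_offsets() expands a compact, byte-encoded description of the register
 * state layout (built with the NOP/LRI/REG/REG16/END macros that follow the
 * prototype) into the context image:
 *
 *  - a byte with BIT(7) set skips that many dwords (MI_NOOPing them when
 *    @clear is set);
 *  - otherwise the low 6 bits give an MI_LOAD_REGISTER_IMM count and the top
 *    2 bits its flags (e.g. POSTED), followed by that many register offsets
 *    (in dwords from the engine's mmio_base) encoded as 7-bits-per-byte
 *    varints;
 *  - a zero byte terminates the stream, with END() recording the total state
 *    size (in dwords) used to pad out the image when clearing.
 */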
591 static void set_offsets(u32 *regs,
592 			const u8 *data,
593 			const struct intel_engine_cs *engine,
594 			bool clear)
595 #define NOP(x) (BIT(7) | (x))
596 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
597 #define POSTED BIT(0)
598 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
599 #define REG16(x) \
600 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
601 	(((x) >> 2) & 0x7f)
602 #define END(total_state_size) 0, (total_state_size)
603 {
604 	const u32 base = engine->mmio_base;
605 
606 	while (*data) {
607 		u8 count, flags;
608 
609 		if (*data & BIT(7)) { /* skip */
610 			count = *data++ & ~BIT(7);
611 			if (clear)
612 				memset32(regs, MI_NOOP, count);
613 			regs += count;
614 			continue;
615 		}
616 
617 		count = *data & 0x3f;
618 		flags = *data >> 6;
619 		data++;
620 
621 		*regs = MI_LOAD_REGISTER_IMM(count);
622 		if (flags & POSTED)
623 			*regs |= MI_LRI_FORCE_POSTED;
624 		if (INTEL_GEN(engine->i915) >= 11)
625 			*regs |= MI_LRI_LRM_CS_MMIO;
626 		regs++;
627 
628 		GEM_BUG_ON(!count);
629 		do {
630 			u32 offset = 0;
631 			u8 v;
632 
633 			do {
634 				v = *data++;
635 				offset <<= 7;
636 				offset |= v & ~BIT(7);
637 			} while (v & BIT(7));
638 
639 			regs[0] = base + (offset << 2);
640 			if (clear)
641 				regs[1] = 0;
642 			regs += 2;
643 		} while (--count);
644 	}
645 
646 	if (clear) {
647 		u8 count = *++data;
648 
649 		/* Clear past the tail for HW access */
650 		GEM_BUG_ON(dword_in_page(regs) > count);
651 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
652 
653 		/* Close the batch; used mainly by live_lrc_layout() */
654 		*regs = MI_BATCH_BUFFER_END;
655 		if (INTEL_GEN(engine->i915) >= 10)
656 			*regs |= BIT(0);
657 	}
658 }
659 
660 static const u8 gen8_xcs_offsets[] = {
661 	NOP(1),
662 	LRI(11, 0),
663 	REG16(0x244),
664 	REG(0x034),
665 	REG(0x030),
666 	REG(0x038),
667 	REG(0x03c),
668 	REG(0x168),
669 	REG(0x140),
670 	REG(0x110),
671 	REG(0x11c),
672 	REG(0x114),
673 	REG(0x118),
674 
675 	NOP(9),
676 	LRI(9, 0),
677 	REG16(0x3a8),
678 	REG16(0x28c),
679 	REG16(0x288),
680 	REG16(0x284),
681 	REG16(0x280),
682 	REG16(0x27c),
683 	REG16(0x278),
684 	REG16(0x274),
685 	REG16(0x270),
686 
687 	NOP(13),
688 	LRI(2, 0),
689 	REG16(0x200),
690 	REG(0x028),
691 
692 	END(80)
693 };
694 
695 static const u8 gen9_xcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, POSTED),
727 	REG16(0x200),
728 
729 	NOP(13),
730 	LRI(44, POSTED),
731 	REG(0x028),
732 	REG(0x09c),
733 	REG(0x0c0),
734 	REG(0x178),
735 	REG(0x17c),
736 	REG16(0x358),
737 	REG(0x170),
738 	REG(0x150),
739 	REG(0x154),
740 	REG(0x158),
741 	REG16(0x41c),
742 	REG16(0x600),
743 	REG16(0x604),
744 	REG16(0x608),
745 	REG16(0x60c),
746 	REG16(0x610),
747 	REG16(0x614),
748 	REG16(0x618),
749 	REG16(0x61c),
750 	REG16(0x620),
751 	REG16(0x624),
752 	REG16(0x628),
753 	REG16(0x62c),
754 	REG16(0x630),
755 	REG16(0x634),
756 	REG16(0x638),
757 	REG16(0x63c),
758 	REG16(0x640),
759 	REG16(0x644),
760 	REG16(0x648),
761 	REG16(0x64c),
762 	REG16(0x650),
763 	REG16(0x654),
764 	REG16(0x658),
765 	REG16(0x65c),
766 	REG16(0x660),
767 	REG16(0x664),
768 	REG16(0x668),
769 	REG16(0x66c),
770 	REG16(0x670),
771 	REG16(0x674),
772 	REG16(0x678),
773 	REG16(0x67c),
774 	REG(0x068),
775 
776 	END(176)
777 };
778 
779 static const u8 gen12_xcs_offsets[] = {
780 	NOP(1),
781 	LRI(13, POSTED),
782 	REG16(0x244),
783 	REG(0x034),
784 	REG(0x030),
785 	REG(0x038),
786 	REG(0x03c),
787 	REG(0x168),
788 	REG(0x140),
789 	REG(0x110),
790 	REG(0x1c0),
791 	REG(0x1c4),
792 	REG(0x1c8),
793 	REG(0x180),
794 	REG16(0x2b4),
795 
796 	NOP(5),
797 	LRI(9, POSTED),
798 	REG16(0x3a8),
799 	REG16(0x28c),
800 	REG16(0x288),
801 	REG16(0x284),
802 	REG16(0x280),
803 	REG16(0x27c),
804 	REG16(0x278),
805 	REG16(0x274),
806 	REG16(0x270),
807 
808 	END(80)
809 };
810 
811 static const u8 gen8_rcs_offsets[] = {
812 	NOP(1),
813 	LRI(14, POSTED),
814 	REG16(0x244),
815 	REG(0x034),
816 	REG(0x030),
817 	REG(0x038),
818 	REG(0x03c),
819 	REG(0x168),
820 	REG(0x140),
821 	REG(0x110),
822 	REG(0x11c),
823 	REG(0x114),
824 	REG(0x118),
825 	REG(0x1c0),
826 	REG(0x1c4),
827 	REG(0x1c8),
828 
829 	NOP(3),
830 	LRI(9, POSTED),
831 	REG16(0x3a8),
832 	REG16(0x28c),
833 	REG16(0x288),
834 	REG16(0x284),
835 	REG16(0x280),
836 	REG16(0x27c),
837 	REG16(0x278),
838 	REG16(0x274),
839 	REG16(0x270),
840 
841 	NOP(13),
842 	LRI(1, 0),
843 	REG(0x0c8),
844 
845 	END(80)
846 };
847 
848 static const u8 gen9_rcs_offsets[] = {
849 	NOP(1),
850 	LRI(14, POSTED),
851 	REG16(0x244),
852 	REG(0x34),
853 	REG(0x30),
854 	REG(0x38),
855 	REG(0x3c),
856 	REG(0x168),
857 	REG(0x140),
858 	REG(0x110),
859 	REG(0x11c),
860 	REG(0x114),
861 	REG(0x118),
862 	REG(0x1c0),
863 	REG(0x1c4),
864 	REG(0x1c8),
865 
866 	NOP(3),
867 	LRI(9, POSTED),
868 	REG16(0x3a8),
869 	REG16(0x28c),
870 	REG16(0x288),
871 	REG16(0x284),
872 	REG16(0x280),
873 	REG16(0x27c),
874 	REG16(0x278),
875 	REG16(0x274),
876 	REG16(0x270),
877 
878 	NOP(13),
879 	LRI(1, 0),
880 	REG(0xc8),
881 
882 	NOP(13),
883 	LRI(44, POSTED),
884 	REG(0x28),
885 	REG(0x9c),
886 	REG(0xc0),
887 	REG(0x178),
888 	REG(0x17c),
889 	REG16(0x358),
890 	REG(0x170),
891 	REG(0x150),
892 	REG(0x154),
893 	REG(0x158),
894 	REG16(0x41c),
895 	REG16(0x600),
896 	REG16(0x604),
897 	REG16(0x608),
898 	REG16(0x60c),
899 	REG16(0x610),
900 	REG16(0x614),
901 	REG16(0x618),
902 	REG16(0x61c),
903 	REG16(0x620),
904 	REG16(0x624),
905 	REG16(0x628),
906 	REG16(0x62c),
907 	REG16(0x630),
908 	REG16(0x634),
909 	REG16(0x638),
910 	REG16(0x63c),
911 	REG16(0x640),
912 	REG16(0x644),
913 	REG16(0x648),
914 	REG16(0x64c),
915 	REG16(0x650),
916 	REG16(0x654),
917 	REG16(0x658),
918 	REG16(0x65c),
919 	REG16(0x660),
920 	REG16(0x664),
921 	REG16(0x668),
922 	REG16(0x66c),
923 	REG16(0x670),
924 	REG16(0x674),
925 	REG16(0x678),
926 	REG16(0x67c),
927 	REG(0x68),
928 
929 	END(176)
930 };
931 
932 static const u8 gen11_rcs_offsets[] = {
933 	NOP(1),
934 	LRI(15, POSTED),
935 	REG16(0x244),
936 	REG(0x034),
937 	REG(0x030),
938 	REG(0x038),
939 	REG(0x03c),
940 	REG(0x168),
941 	REG(0x140),
942 	REG(0x110),
943 	REG(0x11c),
944 	REG(0x114),
945 	REG(0x118),
946 	REG(0x1c0),
947 	REG(0x1c4),
948 	REG(0x1c8),
949 	REG(0x180),
950 
951 	NOP(1),
952 	LRI(9, POSTED),
953 	REG16(0x3a8),
954 	REG16(0x28c),
955 	REG16(0x288),
956 	REG16(0x284),
957 	REG16(0x280),
958 	REG16(0x27c),
959 	REG16(0x278),
960 	REG16(0x274),
961 	REG16(0x270),
962 
963 	LRI(1, POSTED),
964 	REG(0x1b0),
965 
966 	NOP(10),
967 	LRI(1, 0),
968 	REG(0x0c8),
969 
970 	END(80)
971 };
972 
973 static const u8 gen12_rcs_offsets[] = {
974 	NOP(1),
975 	LRI(13, POSTED),
976 	REG16(0x244),
977 	REG(0x034),
978 	REG(0x030),
979 	REG(0x038),
980 	REG(0x03c),
981 	REG(0x168),
982 	REG(0x140),
983 	REG(0x110),
984 	REG(0x1c0),
985 	REG(0x1c4),
986 	REG(0x1c8),
987 	REG(0x180),
988 	REG16(0x2b4),
989 
990 	NOP(5),
991 	LRI(9, POSTED),
992 	REG16(0x3a8),
993 	REG16(0x28c),
994 	REG16(0x288),
995 	REG16(0x284),
996 	REG16(0x280),
997 	REG16(0x27c),
998 	REG16(0x278),
999 	REG16(0x274),
1000 	REG16(0x270),
1001 
1002 	LRI(3, POSTED),
1003 	REG(0x1b0),
1004 	REG16(0x5a8),
1005 	REG16(0x5ac),
1006 
1007 	NOP(6),
1008 	LRI(1, 0),
1009 	REG(0x0c8),
1010 	NOP(3 + 9 + 1),
1011 
1012 	LRI(51, POSTED),
1013 	REG16(0x588),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG(0x028),
1020 	REG(0x09c),
1021 	REG(0x0c0),
1022 	REG(0x178),
1023 	REG(0x17c),
1024 	REG16(0x358),
1025 	REG(0x170),
1026 	REG(0x150),
1027 	REG(0x154),
1028 	REG(0x158),
1029 	REG16(0x41c),
1030 	REG16(0x600),
1031 	REG16(0x604),
1032 	REG16(0x608),
1033 	REG16(0x60c),
1034 	REG16(0x610),
1035 	REG16(0x614),
1036 	REG16(0x618),
1037 	REG16(0x61c),
1038 	REG16(0x620),
1039 	REG16(0x624),
1040 	REG16(0x628),
1041 	REG16(0x62c),
1042 	REG16(0x630),
1043 	REG16(0x634),
1044 	REG16(0x638),
1045 	REG16(0x63c),
1046 	REG16(0x640),
1047 	REG16(0x644),
1048 	REG16(0x648),
1049 	REG16(0x64c),
1050 	REG16(0x650),
1051 	REG16(0x654),
1052 	REG16(0x658),
1053 	REG16(0x65c),
1054 	REG16(0x660),
1055 	REG16(0x664),
1056 	REG16(0x668),
1057 	REG16(0x66c),
1058 	REG16(0x670),
1059 	REG16(0x674),
1060 	REG16(0x678),
1061 	REG16(0x67c),
1062 	REG(0x068),
1063 	REG(0x084),
1064 	NOP(1),
1065 
1066 	END(192)
1067 };
1068 
1069 #undef END
1070 #undef REG16
1071 #undef REG
1072 #undef LRI
1073 #undef NOP
1074 
1075 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1076 {
1077 	/*
1078 	 * The gen12+ lists only have the registers we program in the basic
1079 	 * default state. We rely on the context image using relative
1080 	 * addressing to automatic fixup the register state between the
1081 	 * physical engines for virtual engine.
1082 	 */
1083 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1084 		   !intel_engine_has_relative_mmio(engine));
1085 
1086 	if (engine->class == RENDER_CLASS) {
1087 		if (INTEL_GEN(engine->i915) >= 12)
1088 			return gen12_rcs_offsets;
1089 		else if (INTEL_GEN(engine->i915) >= 11)
1090 			return gen11_rcs_offsets;
1091 		else if (INTEL_GEN(engine->i915) >= 9)
1092 			return gen9_rcs_offsets;
1093 		else
1094 			return gen8_rcs_offsets;
1095 	} else {
1096 		if (INTEL_GEN(engine->i915) >= 12)
1097 			return gen12_xcs_offsets;
1098 		else if (INTEL_GEN(engine->i915) >= 9)
1099 			return gen9_xcs_offsets;
1100 		else
1101 			return gen8_xcs_offsets;
1102 	}
1103 }
1104 
1105 static struct i915_request *
1106 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1107 {
1108 	struct i915_request *rq, *rn, *active = NULL;
1109 	struct list_head *uninitialized_var(pl);
1110 	int prio = I915_PRIORITY_INVALID;
1111 
1112 	lockdep_assert_held(&engine->active.lock);
1113 
1114 	list_for_each_entry_safe_reverse(rq, rn,
1115 					 &engine->active.requests,
1116 					 sched.link) {
1117 		if (i915_request_completed(rq))
1118 			continue; /* XXX */
1119 
1120 		__i915_request_unsubmit(rq);
1121 
1122 		/*
1123 		 * Push the request back into the queue for later resubmission.
1124 		 * If this request is not native to this physical engine (i.e.
1125 		 * it came from a virtual source), push it back onto the virtual
1126 		 * engine so that it can be moved across onto another physical
1127 		 * engine as load dictates.
1128 		 */
1129 		if (likely(rq->execution_mask == engine->mask)) {
1130 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1131 			if (rq_prio(rq) != prio) {
1132 				prio = rq_prio(rq);
1133 				pl = i915_sched_lookup_priolist(engine, prio);
1134 			}
1135 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1136 
1137 			list_move(&rq->sched.link, pl);
1138 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1139 
1140 			/* Check in case we roll back so far that we wrap [size/2] */
1141 			if (intel_ring_direction(rq->ring,
1142 						 intel_ring_wrap(rq->ring,
1143 								 rq->tail),
1144 						 rq->ring->tail) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			/*
1152 			 * Decouple the virtual breadcrumb before moving it
1153 			 * back to the virtual engine -- we don't want the
1154 			 * request to complete in the background and try
1155 			 * and cancel the breadcrumb on the virtual engine
1156 			 * (instead of the old engine where it is linked)!
1157 			 */
1158 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1159 				     &rq->fence.flags)) {
1160 				spin_lock_nested(&rq->lock,
1161 						 SINGLE_DEPTH_NESTING);
1162 				i915_request_cancel_breadcrumb(rq);
1163 				spin_unlock(&rq->lock);
1164 			}
1165 			WRITE_ONCE(rq->engine, owner);
1166 			owner->submit_request(rq);
1167 			active = NULL;
1168 		}
1169 	}
1170 
1171 	return active;
1172 }
1173 
1174 struct i915_request *
1175 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1176 {
1177 	struct intel_engine_cs *engine =
1178 		container_of(execlists, typeof(*engine), execlists);
1179 
1180 	return __unwind_incomplete_requests(engine);
1181 }
1182 
1183 static inline void
1184 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1185 {
1186 	/*
1187 	 * This is currently only used when GVT-g is enabled. When GVT-g is
1188 	 * disabled, the compiler should eliminate this function as dead code.
1189 	 */
1190 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1191 		return;
1192 
1193 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1194 				   status, rq);
1195 }
1196 
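/*
 * Engine busyness accounting: stats.active counts the contexts currently on
 * the HW. The first schedule-in records the start timestamp and the last
 * schedule-out folds the elapsed time into stats.total, both under the stats
 * seqlock so that readers sample a consistent (total, start) pair.
 */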
1197 static void intel_engine_context_in(struct intel_engine_cs *engine)
1198 {
1199 	unsigned long flags;
1200 
1201 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1202 		return;
1203 
1204 	write_seqlock_irqsave(&engine->stats.lock, flags);
1205 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1206 		engine->stats.start = ktime_get();
1207 		atomic_inc(&engine->stats.active);
1208 	}
1209 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1210 }
1211 
1212 static void intel_engine_context_out(struct intel_engine_cs *engine)
1213 {
1214 	unsigned long flags;
1215 
1216 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1217 
1218 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1219 		return;
1220 
1221 	write_seqlock_irqsave(&engine->stats.lock, flags);
1222 	if (atomic_dec_and_test(&engine->stats.active)) {
1223 		engine->stats.total =
1224 			ktime_add(engine->stats.total,
1225 				  ktime_sub(ktime_get(), engine->stats.start));
1226 	}
1227 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1228 }
1229 
1230 static void
1231 execlists_check_context(const struct intel_context *ce,
1232 			const struct intel_engine_cs *engine)
1233 {
1234 	const struct intel_ring *ring = ce->ring;
1235 	u32 *regs = ce->lrc_reg_state;
1236 	bool valid = true;
1237 	int x;
1238 
1239 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1240 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1241 		       engine->name,
1242 		       regs[CTX_RING_START],
1243 		       i915_ggtt_offset(ring->vma));
1244 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1245 		valid = false;
1246 	}
1247 
1248 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1249 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1250 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1251 		       engine->name,
1252 		       regs[CTX_RING_CTL],
1253 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1254 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1255 		valid = false;
1256 	}
1257 
1258 	x = lrc_ring_mi_mode(engine);
1259 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1260 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1261 		       engine->name, regs[x + 1]);
1262 		regs[x + 1] &= ~STOP_RING;
1263 		regs[x + 1] |= STOP_RING << 16;
1264 		valid = false;
1265 	}
1266 
1267 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1268 }
1269 
1270 static void restore_default_state(struct intel_context *ce,
1271 				  struct intel_engine_cs *engine)
1272 {
1273 	u32 *regs;
1274 
1275 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1276 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1277 
1278 	ce->runtime.last = intel_context_get_runtime(ce);
1279 }
1280 
1281 static void reset_active(struct i915_request *rq,
1282 			 struct intel_engine_cs *engine)
1283 {
1284 	struct intel_context * const ce = rq->context;
1285 	u32 head;
1286 
1287 	/*
1288 	 * The executing context has been cancelled. We want to prevent
1289 	 * further execution along this context and propagate the error on
1290 	 * to anything depending on its results.
1291 	 *
1292 	 * In __i915_request_submit(), we apply the -EIO and remove the
1293 	 * requests' payloads for any banned requests. But first, we must
1294 	 * rewind the context back to the start of the incomplete request so
1295 	 * that we do not jump back into the middle of the batch.
1296 	 *
1297 	 * We preserve the breadcrumbs and semaphores of the incomplete
1298 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1299 	 * remain correctly ordered. And we defer to __i915_request_submit()
1300 	 * so that all asynchronous waits are correctly handled.
1301 	 */
1302 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1303 		     rq->fence.context, rq->fence.seqno);
1304 
1305 	/* On resubmission of the active request, payload will be scrubbed */
1306 	if (i915_request_completed(rq))
1307 		head = rq->tail;
1308 	else
1309 		head = active_request(ce->timeline, rq)->head;
1310 	head = intel_ring_wrap(ce->ring, head);
1311 
1312 	/* Scrub the context image to prevent replaying the previous batch */
1313 	restore_default_state(ce, engine);
1314 	__execlists_update_reg_state(ce, engine, head);
1315 
1316 	/* We've switched away, so this should be a no-op, but intent matters */
1317 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1318 }
1319 
1320 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1321 {
1322 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1323 	ce->runtime.num_underflow += dt < 0;
1324 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1325 #endif
1326 }
1327 
1328 static void intel_context_update_runtime(struct intel_context *ce)
1329 {
1330 	u32 old;
1331 	s32 dt;
1332 
1333 	if (intel_context_is_barrier(ce))
1334 		return;
1335 
1336 	old = ce->runtime.last;
1337 	ce->runtime.last = intel_context_get_runtime(ce);
1338 	dt = ce->runtime.last - old;
1339 
1340 	if (unlikely(dt <= 0)) {
1341 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1342 			 old, ce->runtime.last, dt);
1343 		st_update_runtime_underflow(ce, dt);
1344 		return;
1345 	}
1346 
1347 	ewma_runtime_add(&ce->runtime.avg, dt);
1348 	ce->runtime.total += dt;
1349 }
1350 
1351 static inline struct intel_engine_cs *
1352 __execlists_schedule_in(struct i915_request *rq)
1353 {
1354 	struct intel_engine_cs * const engine = rq->engine;
1355 	struct intel_context * const ce = rq->context;
1356 
1357 	intel_context_get(ce);
1358 
1359 	if (unlikely(intel_context_is_banned(ce)))
1360 		reset_active(rq, engine);
1361 
1362 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1363 		execlists_check_context(ce, engine);
1364 
1365 	if (ce->tag) {
1366 		/* Use a fixed tag for OA and friends */
1367 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1368 		ce->lrc.ccid = ce->tag;
1369 	} else {
1370 		/* We don't need a strict matching tag, just different values */
1371 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1372 
1373 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1374 		clear_bit(tag - 1, &engine->context_tag);
1375 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1376 
1377 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1378 	}
1379 
1380 	ce->lrc.ccid |= engine->execlists.ccid;
1381 
1382 	__intel_gt_pm_get(engine->gt);
1383 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1384 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1385 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1386 	intel_engine_context_in(engine);
1387 
1388 	return engine;
1389 }
1390 
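/*
 * ce->inflight tracks which physical engine the context is currently
 * submitted on, with an extra submission count packed into the low pointer
 * bits (see ptr_inc()/ptr_unmask_bits()); execlists_schedule_out() drops
 * that count again and only performs the real schedule-out once it reaches
 * zero.
 */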
1391 static inline struct i915_request *
1392 execlists_schedule_in(struct i915_request *rq, int idx)
1393 {
1394 	struct intel_context * const ce = rq->context;
1395 	struct intel_engine_cs *old;
1396 
1397 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1398 	trace_i915_request_in(rq, idx);
1399 
1400 	old = READ_ONCE(ce->inflight);
1401 	do {
1402 		if (!old) {
1403 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1404 			break;
1405 		}
1406 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1407 
1408 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1409 	return i915_request_get(rq);
1410 }
1411 
1412 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1413 {
1414 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1415 	struct i915_request *next = READ_ONCE(ve->request);
1416 
1417 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1418 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1419 }
1420 
1421 static inline void
1422 __execlists_schedule_out(struct i915_request *rq,
1423 			 struct intel_engine_cs * const engine,
1424 			 unsigned int ccid)
1425 {
1426 	struct intel_context * const ce = rq->context;
1427 
1428 	/*
1429 	 * NB process_csb() is not under the engine->active.lock and hence
1430 	 * schedule_out can race with schedule_in meaning that we should
1431 	 * refrain from doing non-trivial work here.
1432 	 */
1433 
1434 	/*
1435 	 * If we have just completed this context, the engine may now be
1436 	 * idle and we want to re-enter powersaving.
1437 	 */
1438 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1439 	    i915_request_completed(rq))
1440 		intel_engine_add_retire(engine, ce->timeline);
1441 
1442 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1443 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1444 	if (ccid < BITS_PER_LONG) {
1445 		GEM_BUG_ON(ccid == 0);
1446 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1447 		set_bit(ccid - 1, &engine->context_tag);
1448 	}
1449 
1450 	intel_context_update_runtime(ce);
1451 	intel_engine_context_out(engine);
1452 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1453 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1454 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1455 	intel_gt_pm_put_async(engine->gt);
1456 
1457 	/*
1458 	 * If this is part of a virtual engine, its next request may
1459 	 * have been blocked waiting for access to the active context.
1460 	 * We have to kick all the siblings again in case we need to
1461 	 * switch (e.g. the next request is not runnable on this
1462 	 * engine). Hopefully, we will already have submitted the next
1463 	 * request before the tasklet runs and do not need to rebuild
1464 	 * each virtual tree and kick everyone again.
1465 	 */
1466 	if (ce->engine != engine)
1467 		kick_siblings(rq, ce);
1468 
1469 	intel_context_put(ce);
1470 }
1471 
1472 static inline void
1473 execlists_schedule_out(struct i915_request *rq)
1474 {
1475 	struct intel_context * const ce = rq->context;
1476 	struct intel_engine_cs *cur, *old;
1477 	u32 ccid;
1478 
1479 	trace_i915_request_out(rq);
1480 
1481 	ccid = rq->context->lrc.ccid;
1482 	old = READ_ONCE(ce->inflight);
1483 	do
1484 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1485 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1486 	if (!cur)
1487 		__execlists_schedule_out(rq, old, ccid);
1488 
1489 	i915_request_put(rq);
1490 }
1491 
1492 static u64 execlists_update_context(struct i915_request *rq)
1493 {
1494 	struct intel_context *ce = rq->context;
1495 	u64 desc = ce->lrc.desc;
1496 	u32 tail, prev;
1497 
1498 	/*
1499 	 * WaIdleLiteRestore:bdw,skl
1500 	 *
1501 	 * We should never submit the context with the same RING_TAIL twice
1502 	 * just in case we submit an empty ring, which confuses the HW.
1503 	 *
1504 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1505 	 * the normal request to be able to always advance the RING_TAIL on
1506 	 * subsequent resubmissions (for lite restore). Should that fail us,
1507 	 * and we try and submit the same tail again, force the context
1508 	 * reload.
1509 	 *
1510 	 * If we need to return to a preempted context, we need to skip the
1511 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1512 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1513 	 * an earlier request.
1514 	 */
1515 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1516 	prev = rq->ring->tail;
1517 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1518 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1519 		desc |= CTX_DESC_FORCE_RESTORE;
1520 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1521 	rq->tail = rq->wa_tail;
1522 
1523 	/*
1524 	 * Make sure the context image is complete before we submit it to HW.
1525 	 *
1526 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1527 	 * an uncached write such as our mmio register access, the empirical
1528 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1529 	 * may not be visible to the HW prior to the completion of the UC
1530 	 * register write and that we may begin execution from the context
1531 	 * before its image is complete leading to invalid PD chasing.
1532 	 */
1533 	wmb();
1534 
1535 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1536 	return desc;
1537 }
1538 
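/*
 * On gen11+ (execlists->ctrl_reg set) the descriptors are staged in the
 * ExecLists Submission Queue and only take effect once EL_CTRL_LOAD is
 * written in execlists_submit_ports(); on older gens each 64b descriptor is
 * written directly to the ELSP, upper dword first, with the submission
 * taking effect once all ports have been written.
 */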
1539 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1540 {
1541 	if (execlists->ctrl_reg) {
1542 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1543 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1544 	} else {
1545 		writel(upper_32_bits(desc), execlists->submit_reg);
1546 		writel(lower_32_bits(desc), execlists->submit_reg);
1547 	}
1548 }
1549 
1550 static __maybe_unused char *
1551 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1552 {
1553 	if (!rq)
1554 		return "";
1555 
1556 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1557 		 prefix,
1558 		 rq->context->lrc.ccid,
1559 		 rq->fence.context, rq->fence.seqno,
1560 		 i915_request_completed(rq) ? "!" :
1561 		 i915_request_started(rq) ? "*" :
1562 		 "",
1563 		 rq_prio(rq));
1564 
1565 	return buf;
1566 }
1567 
1568 static __maybe_unused void
1569 trace_ports(const struct intel_engine_execlists *execlists,
1570 	    const char *msg,
1571 	    struct i915_request * const *ports)
1572 {
1573 	const struct intel_engine_cs *engine =
1574 		container_of(execlists, typeof(*engine), execlists);
1575 	char __maybe_unused p0[40], p1[40];
1576 
1577 	if (!ports[0])
1578 		return;
1579 
1580 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1581 		     dump_port(p0, sizeof(p0), "", ports[0]),
1582 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1583 }
1584 
1585 static inline bool
1586 reset_in_progress(const struct intel_engine_execlists *execlists)
1587 {
1588 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1589 }
1590 
1591 static __maybe_unused bool
1592 assert_pending_valid(const struct intel_engine_execlists *execlists,
1593 		     const char *msg)
1594 {
1595 	struct intel_engine_cs *engine =
1596 		container_of(execlists, typeof(*engine), execlists);
1597 	struct i915_request * const *port, *rq;
1598 	struct intel_context *ce = NULL;
1599 	bool sentinel = false;
1600 	u32 ccid = -1;
1601 
1602 	trace_ports(execlists, msg, execlists->pending);
1603 
1604 	/* We may be messing around with the lists during reset, lalala */
1605 	if (reset_in_progress(execlists))
1606 		return true;
1607 
1608 	if (!execlists->pending[0]) {
1609 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1610 			      engine->name);
1611 		return false;
1612 	}
1613 
1614 	if (execlists->pending[execlists_num_ports(execlists)]) {
1615 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1616 			      engine->name, execlists_num_ports(execlists));
1617 		return false;
1618 	}
1619 
1620 	for (port = execlists->pending; (rq = *port); port++) {
1621 		unsigned long flags;
1622 		bool ok = true;
1623 
1624 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1625 		GEM_BUG_ON(!i915_request_is_active(rq));
1626 
1627 		if (ce == rq->context) {
1628 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1629 				      engine->name,
1630 				      ce->timeline->fence_context,
1631 				      port - execlists->pending);
1632 			return false;
1633 		}
1634 		ce = rq->context;
1635 
1636 		if (ccid == ce->lrc.ccid) {
1637 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1638 				      engine->name,
1639 				      ccid, ce->timeline->fence_context,
1640 				      port - execlists->pending);
1641 			return false;
1642 		}
1643 		ccid = ce->lrc.ccid;
1644 
1645 		/*
1646 		 * Sentinels are supposed to be the last request so they flush
1647 		 * the current execution off the HW. Check that they are the only
1648 		 * request in the pending submission.
1649 		 */
1650 		if (sentinel) {
1651 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1652 				      engine->name,
1653 				      ce->timeline->fence_context,
1654 				      port - execlists->pending);
1655 			return false;
1656 		}
1657 		sentinel = i915_request_has_sentinel(rq);
1658 
1659 		/* Hold tightly onto the lock to prevent concurrent retires! */
1660 		if (!spin_trylock_irqsave(&rq->lock, flags))
1661 			continue;
1662 
1663 		if (i915_request_completed(rq))
1664 			goto unlock;
1665 
1666 		if (i915_active_is_idle(&ce->active) &&
1667 		    !intel_context_is_barrier(ce)) {
1668 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1669 				      engine->name,
1670 				      ce->timeline->fence_context,
1671 				      port - execlists->pending);
1672 			ok = false;
1673 			goto unlock;
1674 		}
1675 
1676 		if (!i915_vma_is_pinned(ce->state)) {
1677 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1678 				      engine->name,
1679 				      ce->timeline->fence_context,
1680 				      port - execlists->pending);
1681 			ok = false;
1682 			goto unlock;
1683 		}
1684 
1685 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1686 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1687 				      engine->name,
1688 				      ce->timeline->fence_context,
1689 				      port - execlists->pending);
1690 			ok = false;
1691 			goto unlock;
1692 		}
1693 
1694 unlock:
1695 		spin_unlock_irqrestore(&rq->lock, flags);
1696 		if (!ok)
1697 			return false;
1698 	}
1699 
1700 	return ce;
1701 }
1702 
1703 static void execlists_submit_ports(struct intel_engine_cs *engine)
1704 {
1705 	struct intel_engine_execlists *execlists = &engine->execlists;
1706 	unsigned int n;
1707 
1708 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1709 
1710 	/*
1711 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1712 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1713 	 * not be relinquished until the device is idle (see
1714 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1715 	 * that all ELSP are drained i.e. we have processed the CSB,
1716 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1717 	 */
1718 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1719 
1720 	/*
1721 	 * ELSQ note: the submit queue is not cleared after being submitted
1722 	 * to the HW so we need to make sure we always clean it up. This is
1723 	 * currently ensured by the fact that we always write the same number
1724 	 * of elsq entries; keep this in mind before changing the loop below.
1725 	 */
1726 	for (n = execlists_num_ports(execlists); n--; ) {
1727 		struct i915_request *rq = execlists->pending[n];
1728 
1729 		write_desc(execlists,
1730 			   rq ? execlists_update_context(rq) : 0,
1731 			   n);
1732 	}
1733 
1734 	/* we need to manually load the submit queue */
1735 	if (execlists->ctrl_reg)
1736 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1737 }
1738 
1739 static bool ctx_single_port_submission(const struct intel_context *ce)
1740 {
1741 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1742 		intel_context_force_single_submission(ce));
1743 }
1744 
1745 static bool can_merge_ctx(const struct intel_context *prev,
1746 			  const struct intel_context *next)
1747 {
1748 	if (prev != next)
1749 		return false;
1750 
1751 	if (ctx_single_port_submission(prev))
1752 		return false;
1753 
1754 	return true;
1755 }
1756 
1757 static unsigned long i915_request_flags(const struct i915_request *rq)
1758 {
1759 	return READ_ONCE(rq->fence.flags);
1760 }
1761 
1762 static bool can_merge_rq(const struct i915_request *prev,
1763 			 const struct i915_request *next)
1764 {
1765 	GEM_BUG_ON(prev == next);
1766 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1767 
1768 	/*
1769 	 * We do not submit known completed requests. Therefore if the next
1770 	 * request is already completed, we can pretend to merge it in
1771 	 * with the previous context (and we will skip updating the ELSP
1772 	 * and tracking). Thus hopefully keeping the ELSP full with active
1773 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1774 	 * us.
1775 	 */
1776 	if (i915_request_completed(next))
1777 		return true;
1778 
1779 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1780 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1781 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1782 		return false;
1783 
1784 	if (!can_merge_ctx(prev->context, next->context))
1785 		return false;
1786 
1787 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1788 	return true;
1789 }
1790 
1791 static void virtual_update_register_offsets(u32 *regs,
1792 					    struct intel_engine_cs *engine)
1793 {
1794 	set_offsets(regs, reg_offsets(engine), engine, false);
1795 }
1796 
1797 static bool virtual_matches(const struct virtual_engine *ve,
1798 			    const struct i915_request *rq,
1799 			    const struct intel_engine_cs *engine)
1800 {
1801 	const struct intel_engine_cs *inflight;
1802 
1803 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1804 		return false;
1805 
1806 	/*
1807 	 * We track when the HW has completed saving the context image
1808 	 * (i.e. when we have seen the final CS event switching out of
1809 	 * the context) and must not overwrite the context image before
1810 	 * then. This restricts us to only using the active engine
1811 	 * while the previous virtualized request is inflight (so
1812 	 * we reuse the register offsets). This is a very small
1813 	 * hysteresis on the greedy selection algorithm.
1814 	 */
1815 	inflight = intel_context_inflight(&ve->context);
1816 	if (inflight && inflight != engine)
1817 		return false;
1818 
1819 	return true;
1820 }
1821 
1822 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1823 {
1824 	/*
1825 	 * All the outstanding signals on ve->siblings[0] must have
1826 	 * been completed, just pending the interrupt handler. As those
1827 	 * signals still refer to the old sibling (via rq->engine), we must
1828 	 * transfer those to the old irq_worker to keep our locking
1829 	 * consistent.
1830 	 */
1831 	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1832 }
1833 
1834 #define for_each_waiter(p__, rq__) \
1835 	list_for_each_entry_lockless(p__, \
1836 				     &(rq__)->sched.waiters_list, \
1837 				     wait_link)
1838 
1839 #define for_each_signaler(p__, rq__) \
1840 	list_for_each_entry_rcu(p__, \
1841 				&(rq__)->sched.signalers_list, \
1842 				signal_link)
1843 
1844 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1845 {
1846 	LIST_HEAD(list);
1847 
1848 	/*
1849 	 * We want to move the interrupted request to the back of
1850 	 * the round-robin list (i.e. its priority level), but
1851 	 * in doing so, we must also move every in-flight request that
1852 	 * was waiting on the interrupted request, so that those waiters
1853 	 * once again run after it.
1854 	 */
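	/*
	 * Illustrative example: if the interrupted request R has ready,
	 * same-priority waiters W1 and W2 on this engine, the priority list
	 * ends up as ..., R, W1, W2 (walked breadth-first below), so the
	 * dependency order is preserved while R is pushed to the back of
	 * its priority level.
	 */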
1855 	do {
1856 		struct i915_dependency *p;
1857 
1858 		GEM_BUG_ON(i915_request_is_active(rq));
1859 		list_move_tail(&rq->sched.link, pl);
1860 
1861 		for_each_waiter(p, rq) {
1862 			struct i915_request *w =
1863 				container_of(p->waiter, typeof(*w), sched);
1864 
1865 			if (p->flags & I915_DEPENDENCY_WEAK)
1866 				continue;
1867 
1868 			/* Leave semaphores spinning on the other engines */
1869 			if (w->engine != rq->engine)
1870 				continue;
1871 
1872 			/* No waiter should start before its signaler */
1873 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1874 				   i915_request_started(w) &&
1875 				   !i915_request_completed(rq));
1876 
1877 			GEM_BUG_ON(i915_request_is_active(w));
1878 			if (!i915_request_is_ready(w))
1879 				continue;
1880 
1881 			if (rq_prio(w) < rq_prio(rq))
1882 				continue;
1883 
1884 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1885 			list_move_tail(&w->sched.link, &list);
1886 		}
1887 
1888 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1889 	} while (rq);
1890 }
1891 
1892 static void defer_active(struct intel_engine_cs *engine)
1893 {
1894 	struct i915_request *rq;
1895 
1896 	rq = __unwind_incomplete_requests(engine);
1897 	if (!rq)
1898 		return;
1899 
1900 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1901 }
1902 
1903 static bool
1904 need_timeslice(const struct intel_engine_cs *engine,
1905 	       const struct i915_request *rq,
1906 	       const struct rb_node *rb)
1907 {
1908 	int hint;
1909 
1910 	if (!intel_engine_has_timeslices(engine))
1911 		return false;
1912 
1913 	hint = engine->execlists.queue_priority_hint;
1914 
1915 	if (rb) {
1916 		const struct virtual_engine *ve =
1917 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1918 		const struct intel_engine_cs *inflight =
1919 			intel_context_inflight(&ve->context);
1920 
1921 		if (!inflight || inflight == engine) {
1922 			struct i915_request *next;
1923 
1924 			rcu_read_lock();
1925 			next = READ_ONCE(ve->request);
1926 			if (next)
1927 				hint = max(hint, rq_prio(next));
1928 			rcu_read_unlock();
1929 		}
1930 	}
1931 
1932 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1933 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1934 
1935 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1936 	return hint >= effective_prio(rq);
1937 }
1938 
1939 static bool
1940 timeslice_yield(const struct intel_engine_execlists *el,
1941 		const struct i915_request *rq)
1942 {
1943 	/*
1944 	 * Once bitten, forever smitten!
1945 	 *
1946 	 * If the active context ever busy-waited on a semaphore,
1947 	 * it will be treated as a hog until the end of its timeslice (i.e.
1948 	 * until it is scheduled out and replaced by a new submission,
1949 	 * possibly even its own lite-restore). The HW only sends an interrupt
1950 	 * on the first miss, and we do not know if that semaphore has been
1951 	 * signaled, or even if it is now stuck on another semaphore. Play
1952 	 * safe, yield if it might be stuck -- it will be given a fresh
1953 	 * timeslice in the near future.
1954 	 */
1955 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1956 }
1957 
1958 static bool
1959 timeslice_expired(const struct intel_engine_execlists *el,
1960 		  const struct i915_request *rq)
1961 {
1962 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1963 }
1964 
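/*
 * Priority of whatever would run next after rq on this engine: the
 * following in-flight request if there is one, otherwise the top of the
 * queue (the priority hint).
 */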
1965 static int
1966 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1967 {
1968 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1969 		return engine->execlists.queue_priority_hint;
1970 
1971 	return rq_prio(list_next_entry(rq, sched.link));
1972 }
1973 
1974 static inline unsigned long
1975 timeslice(const struct intel_engine_cs *engine)
1976 {
1977 	return READ_ONCE(engine->props.timeslice_duration_ms);
1978 }
1979 
1980 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1981 {
1982 	const struct intel_engine_execlists *execlists = &engine->execlists;
1983 	const struct i915_request *rq = *execlists->active;
1984 
1985 	if (!rq || i915_request_completed(rq))
1986 		return 0;
1987 
1988 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1989 		return 0;
1990 
1991 	return timeslice(engine);
1992 }
1993 
1994 static void set_timeslice(struct intel_engine_cs *engine)
1995 {
1996 	unsigned long duration;
1997 
1998 	if (!intel_engine_has_timeslices(engine))
1999 		return;
2000 
2001 	duration = active_timeslice(engine);
2002 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2003 
2004 	set_timer_ms(&engine->execlists.timer, duration);
2005 }
2006 
2007 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2008 {
2009 	struct intel_engine_execlists *execlists = &engine->execlists;
2010 	unsigned long duration;
2011 
2012 	if (!intel_engine_has_timeslices(engine))
2013 		return;
2014 
2015 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2016 	if (prio == INT_MIN)
2017 		return;
2018 
2019 	if (timer_pending(&execlists->timer))
2020 		return;
2021 
2022 	duration = timeslice(engine);
2023 	ENGINE_TRACE(engine,
2024 		     "start timeslicing, prio:%d, interval:%lu",
2025 		     prio, duration);
2026 
2027 	set_timer_ms(&execlists->timer, duration);
2028 }
2029 
2030 static void record_preemption(struct intel_engine_execlists *execlists)
2031 {
2032 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2033 }
2034 
2035 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2036 					    const struct i915_request *rq)
2037 {
2038 	if (!rq)
2039 		return 0;
2040 
2041 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2042 	if (unlikely(intel_context_is_banned(rq->context)))
2043 		return 1;
2044 
2045 	return READ_ONCE(engine->props.preempt_timeout_ms);
2046 }
2047 
2048 static void set_preempt_timeout(struct intel_engine_cs *engine,
2049 				const struct i915_request *rq)
2050 {
2051 	if (!intel_engine_has_preempt_reset(engine))
2052 		return;
2053 
2054 	set_timer_ms(&engine->execlists.preempt,
2055 		     active_preempt_timeout(engine, rq));
2056 }
2057 
2058 static inline void clear_ports(struct i915_request **ports, int count)
2059 {
2060 	memset_p((void **)ports, NULL, count);
2061 }
2062 
2063 static void execlists_dequeue(struct intel_engine_cs *engine)
2064 {
2065 	struct intel_engine_execlists * const execlists = &engine->execlists;
2066 	struct i915_request **port = execlists->pending;
2067 	struct i915_request ** const last_port = port + execlists->port_mask;
2068 	struct i915_request * const *active;
2069 	struct i915_request *last;
2070 	struct rb_node *rb;
2071 	bool submit = false;
2072 
2073 	/*
2074 	 * Hardware submission is through 2 ports. Conceptually each port
2075 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2076 	 * static for a context, and unique to each, so we only execute
2077 	 * requests belonging to a single context from each ring. RING_HEAD
2078 	 * is maintained by the CS in the context image, it marks the place
2079 	 * where it got up to last time, and through RING_TAIL we tell the CS
2080 	 * where we want to execute up to this time.
2081 	 *
2082 	 * In this list the requests are in order of execution. Consecutive
2083 	 * requests from the same context are adjacent in the ringbuffer. We
2084 	 * can combine these requests into a single RING_TAIL update:
2085 	 *
2086 	 *              RING_HEAD...req1...req2
2087 	 *                                    ^- RING_TAIL
2088 	 * since to execute req2 the CS must first execute req1.
2089 	 *
2090 	 * Our goal then is to point each port to the end of a consecutive
2091 	 * sequence of requests as the optimal (fewest wake ups
2092 	 * and context switches) submission.
2093 	 */
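	/*
	 * To illustrate (not taken from a real trace): with ready requests
	 * A1, A2 from context A and B1 from context B, the ideal packing is
	 *
	 *	ELSP[0] <- context A, RING_TAIL after A2 (A1 and A2 merged)
	 *	ELSP[1] <- context B, RING_TAIL after B1
	 *
	 * i.e. one port per context, each pointing at the end of that
	 * context's run of consecutive requests.
	 */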
2094 
2095 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2096 		struct virtual_engine *ve =
2097 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2098 		struct i915_request *rq = READ_ONCE(ve->request);
2099 
2100 		if (!rq) { /* lazily cleanup after another engine handled rq */
2101 			rb_erase_cached(rb, &execlists->virtual);
2102 			RB_CLEAR_NODE(rb);
2103 			rb = rb_first_cached(&execlists->virtual);
2104 			continue;
2105 		}
2106 
2107 		if (!virtual_matches(ve, rq, engine)) {
2108 			rb = rb_next(rb);
2109 			continue;
2110 		}
2111 
2112 		break;
2113 	}
2114 
2115 	/*
2116 	 * If the queue is higher priority than the last
2117 	 * request in the currently active context, submit afresh.
2118 	 * We will resubmit again afterwards in case we need to split
2119 	 * the active context to interject the preemption request,
2120 	 * i.e. we will retrigger preemption following the ack in case
2121 	 * of trouble.
2122 	 */
2123 	active = READ_ONCE(execlists->active);
2124 
2125 	/*
2126 	 * In theory we can skip over completed contexts that have not
2127 	 * yet been processed by events (as those events are in flight):
2128 	 *
2129 	 * while ((last = *active) && i915_request_completed(last))
2130 	 *	active++;
2131 	 *
2132 	 * However, the GPU cannot handle this as it will ultimately
2133 	 * find itself trying to jump back into a context it has just
2134 	 * completed and barf.
2135 	 */
2136 
2137 	if ((last = *active)) {
2138 		if (need_preempt(engine, last, rb)) {
2139 			if (i915_request_completed(last)) {
2140 				tasklet_hi_schedule(&execlists->tasklet);
2141 				return;
2142 			}
2143 
2144 			ENGINE_TRACE(engine,
2145 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2146 				     last->fence.context,
2147 				     last->fence.seqno,
2148 				     last->sched.attr.priority,
2149 				     execlists->queue_priority_hint);
2150 			record_preemption(execlists);
2151 
2152 			/*
2153 			 * Don't let the RING_HEAD advance past the breadcrumb
2154 			 * as we unwind (and until we resubmit) so that we do
2155 			 * not accidentally tell it to go backwards.
2156 			 */
2157 			ring_set_paused(engine, 1);
2158 
2159 			/*
2160 			 * Note that we have not stopped the GPU at this point,
2161 			 * so we are unwinding the incomplete requests as they
2162 			 * remain inflight and so by the time we do complete
2163 			 * the preemption, some of the unwound requests may
2164 			 * complete!
2165 			 */
2166 			__unwind_incomplete_requests(engine);
2167 
2168 			last = NULL;
2169 		} else if (need_timeslice(engine, last, rb) &&
2170 			   timeslice_expired(execlists, last)) {
2171 			if (i915_request_completed(last)) {
2172 				tasklet_hi_schedule(&execlists->tasklet);
2173 				return;
2174 			}
2175 
2176 			ENGINE_TRACE(engine,
2177 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2178 				     last->fence.context,
2179 				     last->fence.seqno,
2180 				     last->sched.attr.priority,
2181 				     execlists->queue_priority_hint,
2182 				     yesno(timeslice_yield(execlists, last)));
2183 
2184 			ring_set_paused(engine, 1);
2185 			defer_active(engine);
2186 
2187 			/*
2188 			 * Unlike for preemption, if we rewind and continue
2189 			 * executing the same context as previously active,
2190 			 * the order of execution will remain the same and
2191 			 * the tail will only advance. We do not need to
2192 			 * force a full context restore, as a lite-restore
2193 			 * is sufficient to resample the monotonic TAIL.
2194 			 *
2195 			 * If we switch to any other context, similarly we
2196 			 * will not rewind TAIL of current context, and
2197 			 * normal save/restore will preserve state and allow
2198 			 * us to later continue executing the same request.
2199 			 */
2200 			last = NULL;
2201 		} else {
2202 			/*
2203 			 * Otherwise if we already have a request pending
2204 			 * for execution after the current one, we can
2205 			 * just wait until the next CS event before
2206 			 * queuing more. In either case we will force a
2207 			 * lite-restore preemption event, but if we wait
2208 			 * we hopefully coalesce several updates into a single
2209 			 * submission.
2210 			 */
2211 			if (!list_is_last(&last->sched.link,
2212 					  &engine->active.requests)) {
2213 				/*
2214 				 * Even if ELSP[1] is occupied and not worthy
2215 				 * of timeslices, our queue might be.
2216 				 */
2217 				start_timeslice(engine, queue_prio(execlists));
2218 				return;
2219 			}
2220 		}
2221 	}
2222 
2223 	while (rb) { /* XXX virtual is always taking precedence */
2224 		struct virtual_engine *ve =
2225 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2226 		struct i915_request *rq;
2227 
2228 		spin_lock(&ve->base.active.lock);
2229 
2230 		rq = ve->request;
2231 		if (unlikely(!rq)) { /* lost the race to a sibling */
2232 			spin_unlock(&ve->base.active.lock);
2233 			rb_erase_cached(rb, &execlists->virtual);
2234 			RB_CLEAR_NODE(rb);
2235 			rb = rb_first_cached(&execlists->virtual);
2236 			continue;
2237 		}
2238 
2239 		GEM_BUG_ON(rq != ve->request);
2240 		GEM_BUG_ON(rq->engine != &ve->base);
2241 		GEM_BUG_ON(rq->context != &ve->context);
2242 
2243 		if (rq_prio(rq) >= queue_prio(execlists)) {
2244 			if (!virtual_matches(ve, rq, engine)) {
2245 				spin_unlock(&ve->base.active.lock);
2246 				rb = rb_next(rb);
2247 				continue;
2248 			}
2249 
2250 			if (last && !can_merge_rq(last, rq)) {
2251 				spin_unlock(&ve->base.active.lock);
2252 				start_timeslice(engine, rq_prio(rq));
2253 				return; /* leave this for another sibling */
2254 			}
2255 
2256 			ENGINE_TRACE(engine,
2257 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2258 				     rq->fence.context,
2259 				     rq->fence.seqno,
2260 				     i915_request_completed(rq) ? "!" :
2261 				     i915_request_started(rq) ? "*" :
2262 				     "",
2263 				     yesno(engine != ve->siblings[0]));
2264 
2265 			WRITE_ONCE(ve->request, NULL);
2266 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2267 				   INT_MIN);
2268 			rb_erase_cached(rb, &execlists->virtual);
2269 			RB_CLEAR_NODE(rb);
2270 
2271 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2272 			WRITE_ONCE(rq->engine, engine);
2273 
2274 			if (engine != ve->siblings[0]) {
2275 				u32 *regs = ve->context.lrc_reg_state;
2276 				unsigned int n;
2277 
2278 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2279 
2280 				if (!intel_engine_has_relative_mmio(engine))
2281 					virtual_update_register_offsets(regs,
2282 									engine);
2283 
2284 				if (!list_empty(&ve->context.signals))
2285 					virtual_xfer_breadcrumbs(ve);
2286 
2287 				/*
2288 				 * Move the bound engine to the top of the list
2289 				 * for future execution. We then kick this
2290 				 * tasklet first before checking others, so that
2291 				 * we preferentially reuse this set of bound
2292 				 * registers.
2293 				 */
2294 				for (n = 1; n < ve->num_siblings; n++) {
2295 					if (ve->siblings[n] == engine) {
2296 						swap(ve->siblings[n],
2297 						     ve->siblings[0]);
2298 						break;
2299 					}
2300 				}
2301 
2302 				GEM_BUG_ON(ve->siblings[0] != engine);
2303 			}
2304 
2305 			if (__i915_request_submit(rq)) {
2306 				submit = true;
2307 				last = rq;
2308 			}
2309 			i915_request_put(rq);
2310 
2311 			/*
2312 			 * Hmm, we have a bunch of virtual engine requests,
2313 			 * but the first one was already completed (thanks
2314 			 * preempt-to-busy!). Keep looking at the veng queue
2315 			 * until we have no more relevant requests (i.e.
2316 			 * the normal submit queue has higher priority).
2317 			 */
2318 			if (!submit) {
2319 				spin_unlock(&ve->base.active.lock);
2320 				rb = rb_first_cached(&execlists->virtual);
2321 				continue;
2322 			}
2323 		}
2324 
2325 		spin_unlock(&ve->base.active.lock);
2326 		break;
2327 	}
2328 
2329 	while ((rb = rb_first_cached(&execlists->queue))) {
2330 		struct i915_priolist *p = to_priolist(rb);
2331 		struct i915_request *rq, *rn;
2332 		int i;
2333 
2334 		priolist_for_each_request_consume(rq, rn, p, i) {
2335 			bool merge = true;
2336 
2337 			/*
2338 			 * Can we combine this request with the current port?
2339 			 * It has to be the same context/ringbuffer and not
2340 			 * have any exceptions (e.g. GVT saying never to
2341 			 * combine contexts).
2342 			 *
2343 			 * If we can combine the requests, we can execute both
2344 			 * by updating the RING_TAIL to point to the end of the
2345 			 * second request, and so we never need to tell the
2346 			 * hardware about the first.
2347 			 */
2348 			if (last && !can_merge_rq(last, rq)) {
2349 				/*
2350 				 * If we are on the second port and cannot
2351 				 * combine this request with the last, then we
2352 				 * are done.
2353 				 */
2354 				if (port == last_port)
2355 					goto done;
2356 
2357 				/*
2358 				 * We must not populate both ELSP[] with the
2359 				 * same LRCA, i.e. we must submit 2 different
2360 				 * contexts if we submit 2 ELSP.
2361 				 */
2362 				if (last->context == rq->context)
2363 					goto done;
2364 
2365 				if (i915_request_has_sentinel(last))
2366 					goto done;
2367 
2368 				/*
2369 				 * If GVT overrides us we only ever submit
2370 				 * port[0], leaving port[1] empty. Note that we
2371 				 * also have to be careful that we don't queue
2372 				 * the same context (even though a different
2373 				 * request) to the second port.
2374 				 */
2375 				if (ctx_single_port_submission(last->context) ||
2376 				    ctx_single_port_submission(rq->context))
2377 					goto done;
2378 
2379 				merge = false;
2380 			}
2381 
2382 			if (__i915_request_submit(rq)) {
2383 				if (!merge) {
2384 					*port = execlists_schedule_in(last, port - execlists->pending);
2385 					port++;
2386 					last = NULL;
2387 				}
2388 
2389 				GEM_BUG_ON(last &&
2390 					   !can_merge_ctx(last->context,
2391 							  rq->context));
2392 				GEM_BUG_ON(last &&
2393 					   i915_seqno_passed(last->fence.seqno,
2394 							     rq->fence.seqno));
2395 
2396 				submit = true;
2397 				last = rq;
2398 			}
2399 		}
2400 
2401 		rb_erase_cached(&p->node, &execlists->queue);
2402 		i915_priolist_free(p);
2403 	}
2404 
2405 done:
2406 	/*
2407 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2408 	 *
2409 	 * We choose the priority hint such that if we add a request of greater
2410 	 * priority than this, we kick the submission tasklet to decide on
2411 	 * the right order of submitting the requests to hardware. We must
2412 	 * also be prepared to reorder requests as they are in-flight on the
2413 	 * HW. We derive the priority hint then as the first "hole" in
2414 	 * the HW submission ports and if there are no available slots,
2415 	 * the priority of the lowest executing request, i.e. last.
2416 	 *
2417 	 * When we do receive a higher priority request ready to run from the
2418 	 * user, see queue_request(), the priority hint is bumped to that
2419 	 * request triggering preemption on the next dequeue (or subsequent
2420 	 * interrupt for secondary ports).
2421 	 */
2422 	execlists->queue_priority_hint = queue_prio(execlists);
2423 
2424 	if (submit) {
2425 		*port = execlists_schedule_in(last, port - execlists->pending);
2426 		execlists->switch_priority_hint =
2427 			switch_prio(engine, *execlists->pending);
2428 
2429 		/*
2430 		 * Skip if we ended up with exactly the same set of requests,
2431 		 * e.g. trying to timeslice a pair of ordered contexts
2432 		 */
2433 		if (!memcmp(active, execlists->pending,
2434 			    (port - execlists->pending + 1) * sizeof(*port))) {
2435 			do
2436 				execlists_schedule_out(fetch_and_zero(port));
2437 			while (port-- != execlists->pending);
2438 
2439 			goto skip_submit;
2440 		}
2441 		clear_ports(port + 1, last_port - port);
2442 
2443 		WRITE_ONCE(execlists->yield, -1);
2444 		set_preempt_timeout(engine, *active);
2445 		execlists_submit_ports(engine);
2446 	} else {
2447 		start_timeslice(engine, execlists->queue_priority_hint);
2448 skip_submit:
2449 		ring_set_paused(engine, 0);
2450 	}
2451 }
2452 
2453 static void
2454 cancel_port_requests(struct intel_engine_execlists * const execlists)
2455 {
2456 	struct i915_request * const *port;
2457 
2458 	for (port = execlists->pending; *port; port++)
2459 		execlists_schedule_out(*port);
2460 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2461 
2462 	/* Mark the end of active before we overwrite *active */
2463 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2464 		execlists_schedule_out(*port);
2465 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2466 
2467 	smp_wmb(); /* complete the seqlock for execlists_active() */
2468 	WRITE_ONCE(execlists->active, execlists->inflight);
2469 }
2470 
2471 static inline void
2472 invalidate_csb_entries(const u32 *first, const u32 *last)
2473 {
2474 	clflush((void *)first);
2475 	clflush((void *)last);
2476 }
2477 
2478 /*
2479  * Starting with Gen12, the status has a new format:
2480  *
2481  *     bit  0:     switched to new queue
2482  *     bit  1:     reserved
2483  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2484  *                 switch detail is set to "wait on semaphore"
2485  *     bits 3-5:   engine class
2486  *     bits 6-11:  engine instance
2487  *     bits 12-14: reserved
2488  *     bits 15-25: sw context id of the lrc the GT switched to
2489  *     bits 26-31: sw counter of the lrc the GT switched to
2490  *     bits 32-35: context switch detail
2491  *                  - 0: ctx complete
2492  *                  - 1: wait on sync flip
2493  *                  - 2: wait on vblank
2494  *                  - 3: wait on scanline
2495  *                  - 4: wait on semaphore
2496  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2497  *                       WAIT_FOR_EVENT)
2498  *     bit  36:    reserved
2499  *     bits 37-43: wait detail (for switch detail 1 to 4)
2500  *     bits 44-46: reserved
2501  *     bits 47-57: sw context id of the lrc the GT switched away from
2502  *     bits 58-63: sw counter of the lrc the GT switched away from
2503  */
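/*
 * The parser below consumes each entry as two dwords: csb[0] carries the
 * "switched to" half (bits 0-31 above) and csb[1] the "switched away
 * from" half (bits 32-63), e.g. GEN12_CTX_SWITCH_DETAIL() operates on the
 * upper dword.
 */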
2504 static inline bool
2505 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2506 {
2507 	u32 lower_dw = csb[0];
2508 	u32 upper_dw = csb[1];
2509 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2510 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2511 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2512 
2513 	/*
2514 	 * The context switch detail is not guaranteed to be 5 when a preemption
2515 	 * occurs, so we can't just check for that. The check below works for
2516 	 * all the cases we care about, including preemptions of WAIT
2517 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2518 	 * would require some extra handling, but we don't support that.
2519 	 */
2520 	if (!ctx_away_valid || new_queue) {
2521 		GEM_BUG_ON(!ctx_to_valid);
2522 		return true;
2523 	}
2524 
2525 	/*
2526 	 * switch detail = 5 is covered by the case above and we do not expect a
2527 	 * context switch on an unsuccessful wait instruction since we always
2528 	 * use polling mode.
2529 	 */
2530 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2531 	return false;
2532 }
2533 
2534 static inline bool
2535 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2536 {
2537 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2538 }
2539 
2540 static void process_csb(struct intel_engine_cs *engine)
2541 {
2542 	struct intel_engine_execlists * const execlists = &engine->execlists;
2543 	const u32 * const buf = execlists->csb_status;
2544 	const u8 num_entries = execlists->csb_size;
2545 	u8 head, tail;
2546 
2547 	/*
2548 	 * As we modify our execlists state tracking we require exclusive
2549 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2550 	 * and we assume that is only inside the reset paths and so serialised.
2551 	 */
2552 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2553 		   !reset_in_progress(execlists));
2554 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2555 
2556 	/*
2557 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2558 	 * When reading from the csb_write mmio register, we have to be
2559 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2560 	 * the low 4 bits. As it happens we know the next 4 bits are always
2561 	 * zero and so we can simply mask off the low u8 of the register
2562 	 * and treat it identically to reading from the HWSP (without having
2563 	 * to use explicit shifting and masking, and probably bifurcating
2564 	 * the code to handle the legacy mmio read).
2565 	 */
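	/*
	 * In effect (illustrative only), both flavours reduce to
	 *
	 *	tail = READ_ONCE(*execlists->csb_write) & 0xff;
	 *
	 * with the masking performed implicitly by the u8 assignment below.
	 */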
2566 	head = execlists->csb_head;
2567 	tail = READ_ONCE(*execlists->csb_write);
2568 	if (unlikely(head == tail))
2569 		return;
2570 
2571 	/*
2572 	 * Hopefully paired with a wmb() in HW!
2573 	 *
2574 	 * We must complete the read of the write pointer before any reads
2575 	 * from the CSB, so that we do not see stale values. Without an rmb
2576 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2577 	 * we perform the READ_ONCE(*csb_write).
2578 	 */
2579 	rmb();
2580 
2581 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2582 	do {
2583 		bool promote;
2584 
2585 		if (++head == num_entries)
2586 			head = 0;
2587 
2588 		/*
2589 		 * We are flying near dragons again.
2590 		 *
2591 		 * We hold a reference to the request in execlist_port[]
2592 		 * but no more than that. We are operating in softirq
2593 		 * context and so cannot hold any mutex or sleep. That
2594 		 * prevents us from stopping the requests we are processing
2595 		 * in port[] from being retired simultaneously (the
2596 		 * breadcrumb will be complete before we see the
2597 		 * context-switch). As we only hold the reference to the
2598 		 * request, any pointer chasing underneath the request
2599 		 * is subject to a potential use-after-free. Thus we
2600 		 * store all of the bookkeeping within port[] as
2601 		 * required, and avoid using unguarded pointers beneath
2602 		 * request itself. The same applies to the atomic
2603 		 * status notifier.
2604 		 */
2605 
2606 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2607 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2608 
2609 		if (INTEL_GEN(engine->i915) >= 12)
2610 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2611 		else
2612 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2613 		if (promote) {
2614 			struct i915_request * const *old = execlists->active;
2615 
2616 			ring_set_paused(engine, 0);
2617 
2618 			/* Point active to the new ELSP; prevent overwriting */
2619 			WRITE_ONCE(execlists->active, execlists->pending);
2620 			smp_wmb(); /* notify execlists_active() */
2621 
2622 			/* cancel old inflight, prepare for switch */
2623 			trace_ports(execlists, "preempted", old);
2624 			while (*old)
2625 				execlists_schedule_out(*old++);
2626 
2627 			/* switch pending to inflight */
2628 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2629 			memcpy(execlists->inflight,
2630 			       execlists->pending,
2631 			       execlists_num_ports(execlists) *
2632 			       sizeof(*execlists->pending));
2633 			smp_wmb(); /* complete the seqlock */
2634 			WRITE_ONCE(execlists->active, execlists->inflight);
2635 
2636 			WRITE_ONCE(execlists->pending[0], NULL);
2637 		} else {
2638 			GEM_BUG_ON(!*execlists->active);
2639 
2640 			/* port0 completed, advanced to port1 */
2641 			trace_ports(execlists, "completed", execlists->active);
2642 
2643 			/*
2644 			 * We rely on the hardware being strongly
2645 			 * ordered, that the breadcrumb write is
2646 			 * coherent (visible from the CPU) before the
2647 			 * user interrupt is processed. One might assume
2648 			 * that the breadcrumb write, being emitted before
2649 			 * the user interrupt and the CS event for the
2650 			 * context switch, would therefore be visible
2651 			 * before the CS event itself...
2652 			 */
2653 			if (GEM_SHOW_DEBUG() &&
2654 			    !i915_request_completed(*execlists->active)) {
2655 				struct i915_request *rq = *execlists->active;
2656 				const u32 *regs __maybe_unused =
2657 					rq->context->lrc_reg_state;
2658 
2659 				ENGINE_TRACE(engine,
2660 					     "context completed before request!\n");
2661 				ENGINE_TRACE(engine,
2662 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2663 					     ENGINE_READ(engine, RING_START),
2664 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2665 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2666 					     ENGINE_READ(engine, RING_CTL),
2667 					     ENGINE_READ(engine, RING_MI_MODE));
2668 				ENGINE_TRACE(engine,
2669 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2670 					     i915_ggtt_offset(rq->ring->vma),
2671 					     rq->head, rq->tail,
2672 					     rq->fence.context,
2673 					     lower_32_bits(rq->fence.seqno),
2674 					     hwsp_seqno(rq));
2675 				ENGINE_TRACE(engine,
2676 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2677 					     regs[CTX_RING_START],
2678 					     regs[CTX_RING_HEAD],
2679 					     regs[CTX_RING_TAIL]);
2680 			}
2681 
2682 			execlists_schedule_out(*execlists->active++);
2683 
2684 			GEM_BUG_ON(execlists->active - execlists->inflight >
2685 				   execlists_num_ports(execlists));
2686 		}
2687 	} while (head != tail);
2688 
2689 	execlists->csb_head = head;
2690 	set_timeslice(engine);
2691 
2692 	/*
2693 	 * Gen11 has proven to fail wrt global observation point between
2694 	 * entry and tail update, failing on the ordering and thus
2695 	 * we see an old entry in the context status buffer.
2696 	 *
2697 	 * Forcibly evict the entries ahead of the next GPU CSB update,
2698 	 * to increase the odds that we get fresh entries even with
2699 	 * non-working hardware. The cost of doing so comes out mostly in
2700 	 * the wash, as the hardware, working or not, will need to do the
2701 	 * invalidation beforehand.
2702 	 */
2703 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2704 }
2705 
2706 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2707 {
2708 	lockdep_assert_held(&engine->active.lock);
2709 	if (!READ_ONCE(engine->execlists.pending[0])) {
2710 		rcu_read_lock(); /* protect peeking at execlists->active */
2711 		execlists_dequeue(engine);
2712 		rcu_read_unlock();
2713 	}
2714 }
2715 
2716 static void __execlists_hold(struct i915_request *rq)
2717 {
2718 	LIST_HEAD(list);
2719 
2720 	do {
2721 		struct i915_dependency *p;
2722 
2723 		if (i915_request_is_active(rq))
2724 			__i915_request_unsubmit(rq);
2725 
2726 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2727 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2728 		i915_request_set_hold(rq);
2729 		RQ_TRACE(rq, "on hold\n");
2730 
2731 		for_each_waiter(p, rq) {
2732 			struct i915_request *w =
2733 				container_of(p->waiter, typeof(*w), sched);
2734 
2735 			/* Leave semaphores spinning on the other engines */
2736 			if (w->engine != rq->engine)
2737 				continue;
2738 
2739 			if (!i915_request_is_ready(w))
2740 				continue;
2741 
2742 			if (i915_request_completed(w))
2743 				continue;
2744 
2745 			if (i915_request_on_hold(w))
2746 				continue;
2747 
2748 			list_move_tail(&w->sched.link, &list);
2749 		}
2750 
2751 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2752 	} while (rq);
2753 }
2754 
2755 static bool execlists_hold(struct intel_engine_cs *engine,
2756 			   struct i915_request *rq)
2757 {
2758 	spin_lock_irq(&engine->active.lock);
2759 
2760 	if (i915_request_completed(rq)) { /* too late! */
2761 		rq = NULL;
2762 		goto unlock;
2763 	}
2764 
2765 	if (rq->engine != engine) { /* preempted virtual engine */
2766 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2767 
2768 		/*
2769 		 * intel_context_inflight() is only protected by virtue
2770 		 * of process_csb() being called only by the tasklet (or
2771 		 * directly from inside reset while the tasklet is suspended).
2772 		 * Assert that neither of those are allowed to run while we
2773 		 * poke at the request queues.
2774 		 */
2775 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2776 
2777 		/*
2778 		 * An unsubmitted request along a virtual engine will
2779 		 * remain on the active (this) engine until we are able
2780 		 * to process the context switch away (and so mark the
2781 		 * context as no longer in flight). That cannot have happened
2782 		 * yet, otherwise we would not be hanging!
2783 		 */
2784 		spin_lock(&ve->base.active.lock);
2785 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2786 		GEM_BUG_ON(ve->request != rq);
2787 		ve->request = NULL;
2788 		spin_unlock(&ve->base.active.lock);
2789 		i915_request_put(rq);
2790 
2791 		rq->engine = engine;
2792 	}
2793 
2794 	/*
2795 	 * Transfer this request onto the hold queue to prevent it
2796 	 * being resubmitted to HW (and potentially completed) before we have
2797 	 * released it. Since we may have already submitted following
2798 	 * requests, we need to remove those as well.
2799 	 */
2800 	GEM_BUG_ON(i915_request_on_hold(rq));
2801 	GEM_BUG_ON(rq->engine != engine);
2802 	__execlists_hold(rq);
2803 	GEM_BUG_ON(list_empty(&engine->active.hold));
2804 
2805 unlock:
2806 	spin_unlock_irq(&engine->active.lock);
2807 	return rq;
2808 }
2809 
2810 static bool hold_request(const struct i915_request *rq)
2811 {
2812 	struct i915_dependency *p;
2813 	bool result = false;
2814 
2815 	/*
2816 	 * If one of our ancestors is on hold, we must also be on hold,
2817 	 * otherwise we will bypass it and execute before it.
2818 	 */
2819 	rcu_read_lock();
2820 	for_each_signaler(p, rq) {
2821 		const struct i915_request *s =
2822 			container_of(p->signaler, typeof(*s), sched);
2823 
2824 		if (s->engine != rq->engine)
2825 			continue;
2826 
2827 		result = i915_request_on_hold(s);
2828 		if (result)
2829 			break;
2830 	}
2831 	rcu_read_unlock();
2832 
2833 	return result;
2834 }
2835 
2836 static void __execlists_unhold(struct i915_request *rq)
2837 {
2838 	LIST_HEAD(list);
2839 
2840 	do {
2841 		struct i915_dependency *p;
2842 
2843 		RQ_TRACE(rq, "hold release\n");
2844 
2845 		GEM_BUG_ON(!i915_request_on_hold(rq));
2846 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2847 
2848 		i915_request_clear_hold(rq);
2849 		list_move_tail(&rq->sched.link,
2850 			       i915_sched_lookup_priolist(rq->engine,
2851 							  rq_prio(rq)));
2852 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2853 
2854 		/* Also release any children on this engine that are ready */
2855 		for_each_waiter(p, rq) {
2856 			struct i915_request *w =
2857 				container_of(p->waiter, typeof(*w), sched);
2858 
2859 			/* Propagate any change in error status */
2860 			if (rq->fence.error)
2861 				i915_request_set_error_once(w, rq->fence.error);
2862 
2863 			if (w->engine != rq->engine)
2864 				continue;
2865 
2866 			if (!i915_request_on_hold(w))
2867 				continue;
2868 
2869 			/* Check that no other parents are also on hold */
2870 			if (hold_request(w))
2871 				continue;
2872 
2873 			list_move_tail(&w->sched.link, &list);
2874 		}
2875 
2876 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2877 	} while (rq);
2878 }
2879 
2880 static void execlists_unhold(struct intel_engine_cs *engine,
2881 			     struct i915_request *rq)
2882 {
2883 	spin_lock_irq(&engine->active.lock);
2884 
2885 	/*
2886 	 * Move this request back to the priority queue, and all of its
2887 	 * children and grandchildren that were suspended along with it.
2888 	 */
2889 	__execlists_unhold(rq);
2890 
2891 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2892 		engine->execlists.queue_priority_hint = rq_prio(rq);
2893 		tasklet_hi_schedule(&engine->execlists.tasklet);
2894 	}
2895 
2896 	spin_unlock_irq(&engine->active.lock);
2897 }
2898 
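/*
 * State handed from execlists_capture() to the error-capture worker: the
 * request we froze and held in place, and the coredump to fill in.
 */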
2899 struct execlists_capture {
2900 	struct work_struct work;
2901 	struct i915_request *rq;
2902 	struct i915_gpu_coredump *error;
2903 };
2904 
2905 static void execlists_capture_work(struct work_struct *work)
2906 {
2907 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2908 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2909 	struct intel_engine_cs *engine = cap->rq->engine;
2910 	struct intel_gt_coredump *gt = cap->error->gt;
2911 	struct intel_engine_capture_vma *vma;
2912 
2913 	/* Compress all the objects attached to the request, slow! */
2914 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2915 	if (vma) {
2916 		struct i915_vma_compress *compress =
2917 			i915_vma_capture_prepare(gt);
2918 
2919 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2920 		i915_vma_capture_finish(gt, compress);
2921 	}
2922 
2923 	gt->simulated = gt->engine->simulated;
2924 	cap->error->simulated = gt->simulated;
2925 
2926 	/* Publish the error state, and announce it to the world */
2927 	i915_error_state_store(cap->error);
2928 	i915_gpu_coredump_put(cap->error);
2929 
2930 	/* Return this request and all that depend upon it for signaling */
2931 	execlists_unhold(engine, cap->rq);
2932 	i915_request_put(cap->rq);
2933 
2934 	kfree(cap);
2935 }
2936 
2937 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2938 {
2939 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2940 	struct execlists_capture *cap;
2941 
2942 	cap = kmalloc(sizeof(*cap), gfp);
2943 	if (!cap)
2944 		return NULL;
2945 
2946 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2947 	if (!cap->error)
2948 		goto err_cap;
2949 
2950 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2951 	if (!cap->error->gt)
2952 		goto err_gpu;
2953 
2954 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2955 	if (!cap->error->gt->engine)
2956 		goto err_gt;
2957 
2958 	return cap;
2959 
2960 err_gt:
2961 	kfree(cap->error->gt);
2962 err_gpu:
2963 	kfree(cap->error);
2964 err_cap:
2965 	kfree(cap);
2966 	return NULL;
2967 }
2968 
2969 static struct i915_request *
2970 active_context(struct intel_engine_cs *engine, u32 ccid)
2971 {
2972 	const struct intel_engine_execlists * const el = &engine->execlists;
2973 	struct i915_request * const *port, *rq;
2974 
2975 	/*
2976 	 * Use the most recent result from process_csb(), but just in case
2977 	 * we trigger an error (via interrupt) before the first CS event has
2978 	 * been written, peek at the next submission.
2979 	 */
2980 
2981 	for (port = el->active; (rq = *port); port++) {
2982 		if (rq->context->lrc.ccid == ccid) {
2983 			ENGINE_TRACE(engine,
2984 				     "ccid found at active:%zd\n",
2985 				     port - el->active);
2986 			return rq;
2987 		}
2988 	}
2989 
2990 	for (port = el->pending; (rq = *port); port++) {
2991 		if (rq->context->lrc.ccid == ccid) {
2992 			ENGINE_TRACE(engine,
2993 				     "ccid found at pending:%zd\n",
2994 				     port - el->pending);
2995 			return rq;
2996 		}
2997 	}
2998 
2999 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3000 	return NULL;
3001 }
3002 
3003 static u32 active_ccid(struct intel_engine_cs *engine)
3004 {
3005 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3006 }
3007 
3008 static bool execlists_capture(struct intel_engine_cs *engine)
3009 {
3010 	struct execlists_capture *cap;
3011 
3012 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3013 		return true;
3014 
3015 	/*
3016 	 * We need to _quickly_ capture the engine state before we reset.
3017 	 * We are inside an atomic section (softirq) here and we are delaying
3018 	 * the forced preemption event.
3019 	 */
3020 	cap = capture_regs(engine);
3021 	if (!cap)
3022 		return true;
3023 
3024 	spin_lock_irq(&engine->active.lock);
3025 	cap->rq = active_context(engine, active_ccid(engine));
3026 	if (cap->rq) {
3027 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3028 		cap->rq = i915_request_get_rcu(cap->rq);
3029 	}
3030 	spin_unlock_irq(&engine->active.lock);
3031 	if (!cap->rq)
3032 		goto err_free;
3033 
3034 	/*
3035 	 * Remove the request from the execlists queue, and take ownership
3036 	 * of the request. We pass it to our worker who will _slowly_ compress
3037 	 * all the pages the _user_ requested for debugging their batch, after
3038 	 * which we return it to the queue for signaling.
3039 	 *
3040 	 * By removing them from the execlists queue, we also prevent the
3041 	 * requests from being processed by __unwind_incomplete_requests()
3042 	 * during the intel_engine_reset(), and so they will *not* be replayed
3043 	 * afterwards.
3044 	 *
3045 	 * Note that because we have not yet reset the engine at this point,
3046 	 * it is possible that the request we have identified as being
3047 	 * guilty did in fact complete, and we will then hit an arbitration
3048 	 * point allowing the outstanding preemption to succeed.
3049 	 * of that is very low (as capturing of the engine registers should be
3050 	 * fast enough to run inside an irq-off atomic section!), so we will
3051 	 * simply hold that request accountable for being non-preemptible
3052 	 * long enough to force the reset.
3053 	 */
3054 	if (!execlists_hold(engine, cap->rq))
3055 		goto err_rq;
3056 
3057 	INIT_WORK(&cap->work, execlists_capture_work);
3058 	schedule_work(&cap->work);
3059 	return true;
3060 
3061 err_rq:
3062 	i915_request_put(cap->rq);
3063 err_free:
3064 	i915_gpu_coredump_put(cap->error);
3065 	kfree(cap);
3066 	return false;
3067 }
3068 
3069 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3070 {
3071 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3072 	unsigned long *lock = &engine->gt->reset.flags;
3073 
3074 	if (!intel_has_reset_engine(engine->gt))
3075 		return;
3076 
3077 	if (test_and_set_bit(bit, lock))
3078 		return;
3079 
3080 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3081 
3082 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3083 	tasklet_disable_nosync(&engine->execlists.tasklet);
3084 
3085 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3086 	if (execlists_capture(engine))
3087 		intel_engine_reset(engine, msg);
3088 	else
3089 		ring_set_paused(engine, 0);
3090 
3091 	tasklet_enable(&engine->execlists.tasklet);
3092 	clear_and_wake_up_bit(bit, lock);
3093 }
3094 
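/*
 * A preemption is considered to have timed out if the preempt timer has
 * fired while the last ELSP write is still awaiting its CS ack, i.e.
 * execlists.pending[] is still populated.
 */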
3095 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3096 {
3097 	const struct timer_list *t = &engine->execlists.preempt;
3098 
3099 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3100 		return false;
3101 
3102 	if (!timer_expired(t))
3103 		return false;
3104 
3105 	return READ_ONCE(engine->execlists.pending[0]);
3106 }
3107 
3108 /*
3109  * Check the unread Context Status Buffers and manage the submission of new
3110  * contexts to the ELSP accordingly.
3111  */
3112 static void execlists_submission_tasklet(unsigned long data)
3113 {
3114 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3115 	bool timeout = preempt_timeout(engine);
3116 
3117 	process_csb(engine);
3118 
3119 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3120 		engine->execlists.error_interrupt = 0;
3121 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
3122 			execlists_reset(engine, "CS error");
3123 	}
3124 
3125 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3126 		unsigned long flags;
3127 
3128 		spin_lock_irqsave(&engine->active.lock, flags);
3129 		__execlists_submission_tasklet(engine);
3130 		spin_unlock_irqrestore(&engine->active.lock, flags);
3131 
3132 		/* Recheck after serialising with direct-submission */
3133 		if (unlikely(timeout && preempt_timeout(engine)))
3134 			execlists_reset(engine, "preemption time out");
3135 	}
3136 }
3137 
3138 static void __execlists_kick(struct intel_engine_execlists *execlists)
3139 {
3140 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3141 	tasklet_hi_schedule(&execlists->tasklet);
3142 }
3143 
3144 #define execlists_kick(t, member) \
3145 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3146 
3147 static void execlists_timeslice(struct timer_list *timer)
3148 {
3149 	execlists_kick(timer, timer);
3150 }
3151 
3152 static void execlists_preempt(struct timer_list *timer)
3153 {
3154 	execlists_kick(timer, preempt);
3155 }
3156 
3157 static void queue_request(struct intel_engine_cs *engine,
3158 			  struct i915_request *rq)
3159 {
3160 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3161 	list_add_tail(&rq->sched.link,
3162 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3163 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3164 }
3165 
3166 static void __submit_queue_imm(struct intel_engine_cs *engine)
3167 {
3168 	struct intel_engine_execlists * const execlists = &engine->execlists;
3169 
3170 	if (reset_in_progress(execlists))
3171 		return; /* defer until we restart the engine following reset */
3172 
3173 	__execlists_submission_tasklet(engine);
3174 }
3175 
3176 static void submit_queue(struct intel_engine_cs *engine,
3177 			 const struct i915_request *rq)
3178 {
3179 	struct intel_engine_execlists *execlists = &engine->execlists;
3180 
3181 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3182 		return;
3183 
3184 	execlists->queue_priority_hint = rq_prio(rq);
3185 	__submit_queue_imm(engine);
3186 }
3187 
3188 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3189 			     const struct i915_request *rq)
3190 {
3191 	GEM_BUG_ON(i915_request_on_hold(rq));
3192 	return !list_empty(&engine->active.hold) && hold_request(rq);
3193 }
3194 
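/*
 * Opportunistically process any outstanding CSB events before queuing a
 * new request, so that a stale pending[] does not needlessly block the
 * direct submission attempted from execlists_submit_request().
 */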
3195 static void flush_csb(struct intel_engine_cs *engine)
3196 {
3197 	struct intel_engine_execlists *el = &engine->execlists;
3198 
3199 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3200 		if (!reset_in_progress(el))
3201 			process_csb(engine);
3202 		tasklet_unlock(&el->tasklet);
3203 	}
3204 }
3205 
3206 static void execlists_submit_request(struct i915_request *request)
3207 {
3208 	struct intel_engine_cs *engine = request->engine;
3209 	unsigned long flags;
3210 
3211 	/* Hopefully we clear execlists->pending[] to let us through */
3212 	flush_csb(engine);
3213 
3214 	/* Will be called from irq-context when using foreign fences. */
3215 	spin_lock_irqsave(&engine->active.lock, flags);
3216 
3217 	if (unlikely(ancestor_on_hold(engine, request))) {
3218 		RQ_TRACE(request, "ancestor on hold\n");
3219 		list_add_tail(&request->sched.link, &engine->active.hold);
3220 		i915_request_set_hold(request);
3221 	} else {
3222 		queue_request(engine, request);
3223 
3224 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3225 		GEM_BUG_ON(list_empty(&request->sched.link));
3226 
3227 		submit_queue(engine, request);
3228 	}
3229 
3230 	spin_unlock_irqrestore(&engine->active.lock, flags);
3231 }
3232 
3233 static void __execlists_context_fini(struct intel_context *ce)
3234 {
3235 	intel_ring_put(ce->ring);
3236 	i915_vma_put(ce->state);
3237 }
3238 
3239 static void execlists_context_destroy(struct kref *kref)
3240 {
3241 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3242 
3243 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3244 	GEM_BUG_ON(intel_context_is_pinned(ce));
3245 
3246 	if (ce->state)
3247 		__execlists_context_fini(ce);
3248 
3249 	intel_context_fini(ce);
3250 	intel_context_free(ce);
3251 }
3252 
3253 static void
3254 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3255 {
3256 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3257 		return;
3258 
3259 	vaddr += engine->context_size;
3260 
3261 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3262 }
3263 
3264 static void
3265 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3266 {
3267 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3268 		return;
3269 
3270 	vaddr += engine->context_size;
3271 
3272 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3273 		drm_err_once(&engine->i915->drm,
3274 			     "%s context redzone overwritten!\n",
3275 			     engine->name);
3276 }
3277 
3278 static void execlists_context_unpin(struct intel_context *ce)
3279 {
3280 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3281 		      ce->engine);
3282 
3283 	i915_gem_object_unpin_map(ce->state->obj);
3284 }
3285 
3286 static u32 *
3287 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3288 {
3289 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3290 		MI_SRM_LRM_GLOBAL_GTT |
3291 		MI_LRI_LRM_CS_MMIO;
3292 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3293 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3294 		CTX_TIMESTAMP * sizeof(u32);
3295 	*cs++ = 0;
3296 
3297 	*cs++ = MI_LOAD_REGISTER_REG |
3298 		MI_LRR_SOURCE_CS_MMIO |
3299 		MI_LRI_LRM_CS_MMIO;
3300 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3301 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3302 
3303 	*cs++ = MI_LOAD_REGISTER_REG |
3304 		MI_LRR_SOURCE_CS_MMIO |
3305 		MI_LRI_LRM_CS_MMIO;
3306 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3307 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3308 
3309 	return cs;
3310 }
3311 
3312 static u32 *
3313 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3314 {
3315 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3316 
3317 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3318 		MI_SRM_LRM_GLOBAL_GTT |
3319 		MI_LRI_LRM_CS_MMIO;
3320 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3321 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3322 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3323 	*cs++ = 0;
3324 
3325 	return cs;
3326 }
3327 
3328 static u32 *
3329 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3330 {
3331 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3332 
3333 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3334 		MI_SRM_LRM_GLOBAL_GTT |
3335 		MI_LRI_LRM_CS_MMIO;
3336 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3337 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3338 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3339 	*cs++ = 0;
3340 
3341 	*cs++ = MI_LOAD_REGISTER_REG |
3342 		MI_LRR_SOURCE_CS_MMIO |
3343 		MI_LRI_LRM_CS_MMIO;
3344 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3345 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3346 
3347 	return cs;
3348 }
3349 
3350 static u32 *
3351 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3352 {
3353 	cs = gen12_emit_timestamp_wa(ce, cs);
3354 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3355 	cs = gen12_emit_restore_scratch(ce, cs);
3356 
3357 	return cs;
3358 }
3359 
3360 static u32 *
3361 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3362 {
3363 	cs = gen12_emit_timestamp_wa(ce, cs);
3364 	cs = gen12_emit_restore_scratch(ce, cs);
3365 
3366 	return cs;
3367 }
3368 
3369 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3370 {
3371 	return PAGE_SIZE * ce->wa_bb_page;
3372 }
3373 
3374 static u32 *context_indirect_bb(const struct intel_context *ce)
3375 {
3376 	void *ptr;
3377 
3378 	GEM_BUG_ON(!ce->wa_bb_page);
3379 
3380 	ptr = ce->lrc_reg_state;
3381 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3382 	ptr += context_wa_bb_offset(ce);
3383 
3384 	return ptr;
3385 }
3386 
3387 static void
3388 setup_indirect_ctx_bb(const struct intel_context *ce,
3389 		      const struct intel_engine_cs *engine,
3390 		      u32 *(*emit)(const struct intel_context *, u32 *))
3391 {
3392 	u32 * const start = context_indirect_bb(ce);
3393 	u32 *cs;
3394 
3395 	cs = emit(ce, start);
3396 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3397 	while ((unsigned long)cs % CACHELINE_BYTES)
3398 		*cs++ = MI_NOOP;
3399 
3400 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3401 				    i915_ggtt_offset(ce->state) +
3402 				    context_wa_bb_offset(ce),
3403 				    (cs - start) * sizeof(*cs));
3404 }
3405 
3406 static void
3407 __execlists_update_reg_state(const struct intel_context *ce,
3408 			     const struct intel_engine_cs *engine,
3409 			     u32 head)
3410 {
3411 	struct intel_ring *ring = ce->ring;
3412 	u32 *regs = ce->lrc_reg_state;
3413 
3414 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3415 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3416 
3417 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3418 	regs[CTX_RING_HEAD] = head;
3419 	regs[CTX_RING_TAIL] = ring->tail;
3420 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3421 
3422 	/* RPCS */
3423 	if (engine->class == RENDER_CLASS) {
3424 		regs[CTX_R_PWR_CLK_STATE] =
3425 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3426 
3427 		i915_oa_init_reg_state(ce, engine);
3428 	}
3429 
3430 	if (ce->wa_bb_page) {
3431 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3432 
3433 		fn = gen12_emit_indirect_ctx_xcs;
3434 		if (ce->engine->class == RENDER_CLASS)
3435 			fn = gen12_emit_indirect_ctx_rcs;
3436 
3437 		/* Mutually exclusive wrt the global indirect bb */
3438 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3439 		setup_indirect_ctx_bb(ce, engine, fn);
3440 	}
3441 }
3442 
3443 static int
3444 __execlists_context_pin(struct intel_context *ce,
3445 			struct intel_engine_cs *engine)
3446 {
3447 	void *vaddr;
3448 
3449 	GEM_BUG_ON(!ce->state);
3450 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3451 
3452 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3453 					i915_coherent_map_type(engine->i915) |
3454 					I915_MAP_OVERRIDE);
3455 	if (IS_ERR(vaddr))
3456 		return PTR_ERR(vaddr);
3457 
3458 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3459 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3460 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3461 
3462 	return 0;
3463 }
3464 
3465 static int execlists_context_pin(struct intel_context *ce)
3466 {
3467 	return __execlists_context_pin(ce, ce->engine);
3468 }
3469 
3470 static int execlists_context_alloc(struct intel_context *ce)
3471 {
3472 	return __execlists_context_alloc(ce, ce->engine);
3473 }
3474 
3475 static void execlists_context_reset(struct intel_context *ce)
3476 {
3477 	CE_TRACE(ce, "reset\n");
3478 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3479 
3480 	intel_ring_reset(ce->ring, ce->ring->emit);
3481 
3482 	/* Scrub away the garbage */
3483 	execlists_init_reg_state(ce->lrc_reg_state,
3484 				 ce, ce->engine, ce->ring, true);
3485 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3486 
3487 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3488 }
3489 
3490 static const struct intel_context_ops execlists_context_ops = {
3491 	.alloc = execlists_context_alloc,
3492 
3493 	.pin = execlists_context_pin,
3494 	.unpin = execlists_context_unpin,
3495 
3496 	.enter = intel_context_enter_engine,
3497 	.exit = intel_context_exit_engine,
3498 
3499 	.reset = execlists_context_reset,
3500 	.destroy = execlists_context_destroy,
3501 };
3502 
3503 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3504 {
3505 	u32 *cs;
3506 
3507 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3508 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3509 		return 0;
3510 
3511 	cs = intel_ring_begin(rq, 6);
3512 	if (IS_ERR(cs))
3513 		return PTR_ERR(cs);
3514 
3515 	/*
3516 	 * Check if we have been preempted before we even get started.
3517 	 *
3518 	 * After this point i915_request_started() reports true, even if
3519 	 * we get preempted and so are no longer running.
3520 	 */
3521 	*cs++ = MI_ARB_CHECK;
3522 	*cs++ = MI_NOOP;
3523 
3524 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3525 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3526 	*cs++ = 0;
3527 	*cs++ = rq->fence.seqno - 1;
3528 
3529 	intel_ring_advance(rq, cs);
3530 
3531 	/* Record the updated position of the request's payload */
3532 	rq->infix = intel_ring_offset(rq, cs);
3533 
3534 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3535 
3536 	return 0;
3537 }
3538 
3539 static int emit_pdps(struct i915_request *rq)
3540 {
3541 	const struct intel_engine_cs * const engine = rq->engine;
3542 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3543 	int err, i;
3544 	u32 *cs;
3545 
3546 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3547 
3548 	/*
3549 	 * Beware ye of the dragons, this sequence is magic!
3550 	 *
3551 	 * Small changes to this sequence can cause anything from
3552 	 * GPU hangs to forcewake errors and machine lockups!
3553 	 */
3554 
3555 	/* Flush any residual operations from the context load */
3556 	err = engine->emit_flush(rq, EMIT_FLUSH);
3557 	if (err)
3558 		return err;
3559 
3560 	/* Magic required to prevent forcewake errors! */
3561 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3562 	if (err)
3563 		return err;
3564 
3565 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3566 	if (IS_ERR(cs))
3567 		return PTR_ERR(cs);
3568 
3569 	/* Ensure the LRI writes have landed before we invalidate & continue */
3570 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3571 	for (i = GEN8_3LVL_PDPES; i--; ) {
3572 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3573 		u32 base = engine->mmio_base;
3574 
3575 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3576 		*cs++ = upper_32_bits(pd_daddr);
3577 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3578 		*cs++ = lower_32_bits(pd_daddr);
3579 	}
3580 	*cs++ = MI_NOOP;
3581 
3582 	intel_ring_advance(rq, cs);
3583 
3584 	return 0;
3585 }
3586 
3587 static int execlists_request_alloc(struct i915_request *request)
3588 {
3589 	int ret;
3590 
3591 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3592 
3593 	/*
3594 	 * Flush enough space to reduce the likelihood of waiting after
3595 	 * we start building the request - in which case we will just
3596 	 * have to repeat work.
3597 	 */
3598 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3599 
3600 	/*
3601 	 * Note that after this point, we have committed to using
3602 	 * this request as it is being used to both track the
3603 	 * state of engine initialisation and liveness of the
3604 	 * golden renderstate above. Think twice before you try
3605 	 * to cancel/unwind this request now.
3606 	 */
3607 
3608 	if (!i915_vm_is_4lvl(request->context->vm)) {
3609 		ret = emit_pdps(request);
3610 		if (ret)
3611 			return ret;
3612 	}
3613 
3614 	/* Unconditionally invalidate GPU caches and TLBs. */
3615 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3616 	if (ret)
3617 		return ret;
3618 
3619 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3620 	return 0;
3621 }
3622 
3623 /*
3624  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3625  * the PIPE_CONTROL instruction. This is required for the flush to happen correctly,
3626  * but there is a slight complication as this is applied in a WA batch where the
3627  * values are only initialized once, so we cannot read the register value at the
3628  * beginning and reuse it later; hence we save its value to memory, upload a
3629  * constant value with bit 21 set and then restore it from the saved value.
3630  * To simplify the WA, a constant value is formed by using the default value
3631  * of this register. This shouldn't be a problem because we are only modifying
3632  * it for a short period and this batch is non-preemptible. We could of course
3633  * use additional instructions that read the actual value of the register
3634  * at that time and set our bit of interest, but that makes the WA more complicated.
3635  *
3636  * This WA is also required for Gen9 so extracting as a function avoids
3637  * code duplication.
3638  */
3639 static u32 *
3640 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3641 {
3642 	/* NB no one else is allowed to scribble over scratch + 256! */
3643 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3644 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3645 	*batch++ = intel_gt_scratch_offset(engine->gt,
3646 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3647 	*batch++ = 0;
3648 
3649 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3650 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3651 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3652 
3653 	batch = gen8_emit_pipe_control(batch,
3654 				       PIPE_CONTROL_CS_STALL |
3655 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3656 				       0);
3657 
3658 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3659 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3660 	*batch++ = intel_gt_scratch_offset(engine->gt,
3661 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3662 	*batch++ = 0;
3663 
3664 	return batch;
3665 }
3666 
3667 /*
3668  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3669  * initialized at the beginning and shared across all contexts, but this field
3670  * helps us to have multiple batches at different offsets and select them based
3671  * on some criteria. At the moment this batch always starts at the beginning of
3672  * the page and at this point we don't have multiple wa_ctx batch buffers.
3673  *
3674  * The number of WAs applied is not known at the beginning; we use this field
3675  * to return the number of DWORDs written.
3676  *
3677  * Note that this batch does not contain MI_BATCH_BUFFER_END, so NOOPs are added
3678  * as padding to make it cacheline aligned.
3679  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and together they
3680  * make a complete batch buffer.
3681  */
3682 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3683 {
3684 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3685 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3686 
3687 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3688 	if (IS_BROADWELL(engine->i915))
3689 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3690 
3691 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3692 	/* Actual scratch location is at a 128 byte offset */
3693 	batch = gen8_emit_pipe_control(batch,
3694 				       PIPE_CONTROL_FLUSH_L3 |
3695 				       PIPE_CONTROL_STORE_DATA_INDEX |
3696 				       PIPE_CONTROL_CS_STALL |
3697 				       PIPE_CONTROL_QW_WRITE,
3698 				       LRC_PPHWSP_SCRATCH_ADDR);
3699 
3700 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3701 
3702 	/* Pad to end of cacheline */
3703 	while ((unsigned long)batch % CACHELINE_BYTES)
3704 		*batch++ = MI_NOOP;
3705 
3706 	/*
3707 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3708 	 * execution depends on the length specified in terms of cache lines
3709 	 * in the register CTX_RCS_INDIRECT_CTX
3710 	 */
3711 
3712 	return batch;
3713 }
3714 
3715 struct lri {
3716 	i915_reg_t reg;
3717 	u32 value;
3718 };
3719 
3720 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3721 {
3722 	GEM_BUG_ON(!count || count > 63);
3723 
3724 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3725 	do {
3726 		*batch++ = i915_mmio_reg_offset(lri->reg);
3727 		*batch++ = lri->value;
3728 	} while (lri++, --count);
3729 	*batch++ = MI_NOOP;
3730 
3731 	return batch;
3732 }
3733 
3734 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3735 {
3736 	static const struct lri lri[] = {
3737 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3738 		{
3739 			COMMON_SLICE_CHICKEN2,
3740 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3741 				       0),
3742 		},
3743 
3744 		/* BSpec: 11391 */
3745 		{
3746 			FF_SLICE_CHICKEN,
3747 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3748 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3749 		},
3750 
3751 		/* BSpec: 11299 */
3752 		{
3753 			_3D_CHICKEN3,
3754 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3755 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3756 		}
3757 	};
3758 
3759 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3760 
3761 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3762 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3763 
3764 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3765 	batch = gen8_emit_pipe_control(batch,
3766 				       PIPE_CONTROL_FLUSH_L3 |
3767 				       PIPE_CONTROL_STORE_DATA_INDEX |
3768 				       PIPE_CONTROL_CS_STALL |
3769 				       PIPE_CONTROL_QW_WRITE,
3770 				       LRC_PPHWSP_SCRATCH_ADDR);
3771 
3772 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3773 
3774 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3775 	if (HAS_POOLED_EU(engine->i915)) {
3776 		/*
3777 		 * EU pool configuration is set up along with the golden context
3778 		 * during context initialization. This value depends on the
3779 		 * device type (2x6 or 3x6) and needs to be updated based
3780 		 * on which subslice is disabled, especially for 2x6
3781 		 * devices. However, it is safe to load the default
3782 		 * configuration of a 3x6 device instead of masking off the
3783 		 * corresponding bits, because the HW ignores the bits of a
3784 		 * disabled subslice and drops down to the appropriate config.
3785 		 * See render_state_setup() in i915_gem_render_state.c for the
3786 		 * possible configurations; to avoid duplication they are
3787 		 * not repeated here.
3788 		 */
3789 		*batch++ = GEN9_MEDIA_POOL_STATE;
3790 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3791 		*batch++ = 0x00777000;
3792 		*batch++ = 0;
3793 		*batch++ = 0;
3794 		*batch++ = 0;
3795 	}
3796 
3797 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3798 
3799 	/* Pad to end of cacheline */
3800 	while ((unsigned long)batch % CACHELINE_BYTES)
3801 		*batch++ = MI_NOOP;
3802 
3803 	return batch;
3804 }
3805 
3806 static u32 *
3807 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3808 {
3809 	int i;
3810 
3811 	/*
3812 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3813 	 *
3814 	 * Ensure the engine is idle prior to programming a
3815 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3816 	 */
3817 	batch = gen8_emit_pipe_control(batch,
3818 				       PIPE_CONTROL_CS_STALL,
3819 				       0);
3820 	/*
3821 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3822 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3823 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3824 	 * confusing. Since gen8_emit_pipe_control() already advances the
3825 	 * batch by 6 dwords, we advance the other 10 here, completing a
3826 	 * cacheline. It's not clear if the workaround requires this padding
3827 	 * before other commands, or if it's just the regular padding we would
3828 	 * already have for the workaround bb, so leave it here for now.
3829 	 */
3830 	for (i = 0; i < 10; i++)
3831 		*batch++ = MI_NOOP;
3832 
3833 	/* Pad to end of cacheline */
3834 	while ((unsigned long)batch % CACHELINE_BYTES)
3835 		*batch++ = MI_NOOP;
3836 
3837 	return batch;
3838 }
3839 
3840 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3841 
3842 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3843 {
3844 	struct drm_i915_gem_object *obj;
3845 	struct i915_vma *vma;
3846 	int err;
3847 
3848 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3849 	if (IS_ERR(obj))
3850 		return PTR_ERR(obj);
3851 
3852 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3853 	if (IS_ERR(vma)) {
3854 		err = PTR_ERR(vma);
3855 		goto err;
3856 	}
3857 
3858 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3859 	if (err)
3860 		goto err;
3861 
3862 	engine->wa_ctx.vma = vma;
3863 	return 0;
3864 
3865 err:
3866 	i915_gem_object_put(obj);
3867 	return err;
3868 }
3869 
3870 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3871 {
3872 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3873 }
3874 
3875 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3876 
3877 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3878 {
3879 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3880 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3881 					    &wa_ctx->per_ctx };
3882 	wa_bb_func_t wa_bb_fn[2];
3883 	void *batch, *batch_ptr;
3884 	unsigned int i;
3885 	int ret;
3886 
3887 	if (engine->class != RENDER_CLASS)
3888 		return 0;
3889 
3890 	switch (INTEL_GEN(engine->i915)) {
3891 	case 12:
3892 	case 11:
3893 		return 0;
3894 	case 10:
3895 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3896 		wa_bb_fn[1] = NULL;
3897 		break;
3898 	case 9:
3899 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3900 		wa_bb_fn[1] = NULL;
3901 		break;
3902 	case 8:
3903 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3904 		wa_bb_fn[1] = NULL;
3905 		break;
3906 	default:
3907 		MISSING_CASE(INTEL_GEN(engine->i915));
3908 		return 0;
3909 	}
3910 
3911 	ret = lrc_setup_wa_ctx(engine);
3912 	if (ret) {
3913 		drm_dbg(&engine->i915->drm,
3914 			"Failed to setup context WA page: %d\n", ret);
3915 		return ret;
3916 	}
3917 
3918 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		/* Release the WA context if we cannot map its backing object */
		ret = PTR_ERR(batch);
		lrc_destroy_wa_ctx(engine);
		return ret;
	}
3919 
3920 	/*
3921 	 * Emit the two workaround batch buffers, recording the offset from the
3922 	 * start of the workaround batch buffer object for each and their
3923 	 * respective sizes.
3924 	 */
3925 	batch_ptr = batch;
3926 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3927 		wa_bb[i]->offset = batch_ptr - batch;
3928 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3929 						  CACHELINE_BYTES))) {
3930 			ret = -EINVAL;
3931 			break;
3932 		}
3933 		if (wa_bb_fn[i])
3934 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3935 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3936 	}
3937 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3938 
3939 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
3940 	i915_gem_object_unpin_map(wa_ctx->vma->obj);
3941 	if (ret)
3942 		lrc_destroy_wa_ctx(engine);
3943 
3944 	return ret;
3945 }
3946 
3947 static void reset_csb_pointers(struct intel_engine_cs *engine)
3948 {
3949 	struct intel_engine_execlists * const execlists = &engine->execlists;
3950 	const unsigned int reset_value = execlists->csb_size - 1;
3951 
3952 	ring_set_paused(engine, 0);
3953 
3954 	/*
3955 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3956 	 * Bludgeon them with a mmio update to be sure.
3957 	 */
3958 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3959 		     0xffff << 16 | reset_value << 8 | reset_value);
3960 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3961 
3962 	/*
3963 	 * After a reset, the HW starts writing into CSB entry [0]. We
3964 	 * therefore have to set our HEAD pointer back one entry so that
3965 	 * the *first* entry we check is entry 0. To complicate this further,
3966 	 * as we don't wait for the first interrupt after reset, we have to
3967 	 * fake the HW write to point back to the last entry so that our
3968 	 * inline comparison of our cached head position against the last HW
3969 	 * write works even before the first interrupt.
3970 	 */
3971 	execlists->csb_head = reset_value;
3972 	WRITE_ONCE(*execlists->csb_write, reset_value);
3973 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3974 
3975 	invalidate_csb_entries(&execlists->csb_status[0],
3976 			       &execlists->csb_status[reset_value]);
3977 
3978 	/* Once more for luck and our trusty paranoia */
3979 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3980 		     0xffff << 16 | reset_value << 8 | reset_value);
3981 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3982 
3983 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
3984 }
3985 
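/*
 * Editor's illustrative sketch (not part of the driver): the wrap-around
 * arithmetic implied by the comment in reset_csb_pointers() above. With the
 * cached head parked at csb_size - 1 after a reset, the first advance lands
 * on entry 0, which is where the HW resumes writing. The helper name is
 * hypothetical and exists only for this example.
 */
static inline unsigned int __maybe_unused
example_next_csb_index(unsigned int head, unsigned int csb_size)
{
	/* e.g. head == csb_size - 1 right after reset_csb_pointers() */
	return (head + 1) % csb_size;
}
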
3986 static void execlists_sanitize(struct intel_engine_cs *engine)
3987 {
3988 	/*
3989 	 * Poison residual state on resume, in case the suspend didn't!
3990 	 *
3991 	 * We have to assume that across suspend/resume (or other loss
3992 	 * of control) the contents of our pinned buffers have been
3993 	 * lost, replaced by garbage. Since this doesn't always happen,
3994 	 * let's poison such state so that we more quickly spot when
3995 	 * we falsely assume it has been preserved.
3996 	 */
3997 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3998 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
3999 
4000 	reset_csb_pointers(engine);
4001 
4002 	/*
4003 	 * The kernel_context HWSP is stored in the status_page. As above,
4004 	 * that may be lost on resume/initialisation, and so we need to
4005 	 * reset the value in the HWSP.
4006 	 */
4007 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4008 
4009 	/* And scrub the dirty cachelines for the HWSP */
4010 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4011 }
4012 
4013 static void enable_error_interrupt(struct intel_engine_cs *engine)
4014 {
4015 	u32 status;
4016 
4017 	engine->execlists.error_interrupt = 0;
4018 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4019 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4020 
4021 	status = ENGINE_READ(engine, RING_ESR);
4022 	if (unlikely(status)) {
4023 		drm_err(&engine->i915->drm,
4024 			"engine '%s' resumed still in error: %08x\n",
4025 			engine->name, status);
4026 		__intel_gt_reset(engine->gt, engine->mask);
4027 	}
4028 
4029 	/*
4030 	 * On current gen8+, we have 2 signals to play with
4031 	 *
4032 	 * - I915_ERROR_INSTRUCTION (bit 0)
4033 	 *
4034 	 *    Generate an error if the command parser encounters an invalid
4035 	 *    instruction
4036 	 *
4037 	 *    This is a fatal error.
4038 	 *
4039 	 * - CP_PRIV (bit 2)
4040 	 *
4041 	 *    Generate an error on privilege violation (where the CP replaces
4042 	 *    the instruction with a no-op). This also fires for writes into
4043 	 *    read-only scratch pages.
4044 	 *
4045 	 *    This is a non-fatal error, parsing continues.
4046 	 *
4047 	 * - There are a few others defined for odd HW that we do not use.
4048 	 *
4049 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4050 	 * error (as the HW is validating and suppressing the mistakes), we
4051 	 * only unmask the instruction error bit.
4052 	 */
4053 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4054 }
4055 
4056 static void enable_execlists(struct intel_engine_cs *engine)
4057 {
4058 	u32 mode;
4059 
4060 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4061 
4062 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4063 
4064 	if (INTEL_GEN(engine->i915) >= 11)
4065 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4066 	else
4067 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4068 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4069 
4070 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4071 
4072 	ENGINE_WRITE_FW(engine,
4073 			RING_HWS_PGA,
4074 			i915_ggtt_offset(engine->status_page.vma));
4075 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4076 
4077 	enable_error_interrupt(engine);
4078 
4079 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4080 }
4081 
4082 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4083 {
4084 	bool unexpected = false;
4085 
4086 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4087 		drm_dbg(&engine->i915->drm,
4088 			"STOP_RING still set in RING_MI_MODE\n");
4089 		unexpected = true;
4090 	}
4091 
4092 	return unexpected;
4093 }
4094 
4095 static int execlists_resume(struct intel_engine_cs *engine)
4096 {
4097 	intel_mocs_init_engine(engine);
4098 
4099 	intel_engine_reset_breadcrumbs(engine);
4100 
4101 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4102 		struct drm_printer p = drm_debug_printer(__func__);
4103 
4104 		intel_engine_dump(engine, &p, NULL);
4105 	}
4106 
4107 	enable_execlists(engine);
4108 
4109 	return 0;
4110 }
4111 
4112 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4113 {
4114 	struct intel_engine_execlists * const execlists = &engine->execlists;
4115 	unsigned long flags;
4116 
4117 	ENGINE_TRACE(engine, "depth<-%d\n",
4118 		     atomic_read(&execlists->tasklet.count));
4119 
4120 	/*
4121 	 * Prevent request submission to the hardware until we have
4122 	 * completed the reset in i915_gem_reset_finish(). If a request
4123 	 * is completed by one engine, it may then queue a request
4124 	 * to a second via its execlists->tasklet *just* as we are
4125 	 * calling engine->resume() and also writing the ELSP.
4126 	 * Turning off the execlists->tasklet until the reset is over
4127 	 * prevents the race.
4128 	 */
4129 	__tasklet_disable_sync_once(&execlists->tasklet);
4130 	GEM_BUG_ON(!reset_in_progress(execlists));
4131 
4132 	/* And flush any current direct submission. */
4133 	spin_lock_irqsave(&engine->active.lock, flags);
4134 	spin_unlock_irqrestore(&engine->active.lock, flags);
4135 
4136 	/*
4137 	 * We stop the engines, otherwise we might get a failed reset and a
4138 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
4139 	 * from a system hang if a batchbuffer is progressing when
4140 	 * the reset is issued, regardless of the READY_TO_RESET ack.
4141 	 * Thus we assume it is best to stop the engines on all gens
4142 	 * where we have a gpu reset.
4143 	 *
4144 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4145 	 *
4146 	 * FIXME: Wa for more modern gens needs to be validated
4147 	 */
4148 	ring_set_paused(engine, 1);
4149 	intel_engine_stop_cs(engine);
4150 
4151 	engine->execlists.reset_ccid = active_ccid(engine);
4152 }
4153 
4154 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4155 {
4156 	int x;
4157 
4158 	x = lrc_ring_mi_mode(engine);
4159 	if (x != -1) {
4160 		regs[x + 1] &= ~STOP_RING;
4161 		regs[x + 1] |= STOP_RING << 16;
4162 	}
4163 }
4164 
4165 static void __execlists_reset_reg_state(const struct intel_context *ce,
4166 					const struct intel_engine_cs *engine)
4167 {
4168 	u32 *regs = ce->lrc_reg_state;
4169 
4170 	__reset_stop_ring(regs, engine);
4171 }
4172 
4173 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4174 {
4175 	struct intel_engine_execlists * const execlists = &engine->execlists;
4176 	struct intel_context *ce;
4177 	struct i915_request *rq;
4178 	u32 head;
4179 
4180 	mb(); /* paranoia: read the CSB pointers from after the reset */
4181 	clflush(execlists->csb_write);
4182 	mb();
4183 
4184 	process_csb(engine); /* drain preemption events */
4185 
4186 	/* Following the reset, we need to reload the CSB read/write pointers */
4187 	reset_csb_pointers(engine);
4188 
4189 	/*
4190 	 * Save the currently executing context; even if we completed
4191 	 * its request, it was still running at the time of the
4192 	 * reset and will have been clobbered.
4193 	 */
4194 	rq = active_context(engine, engine->execlists.reset_ccid);
4195 	if (!rq)
4196 		goto unwind;
4197 
4198 	ce = rq->context;
4199 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4200 
4201 	if (i915_request_completed(rq)) {
4202 		/* Idle context; tidy up the ring so we can restart afresh */
4203 		head = intel_ring_wrap(ce->ring, rq->tail);
4204 		goto out_replay;
4205 	}
4206 
4207 	/* We still have requests in-flight; the engine should be active */
4208 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4209 
4210 	/* Context has requests still in-flight; it should not be idle! */
4211 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4212 
4213 	rq = active_request(ce->timeline, rq);
4214 	head = intel_ring_wrap(ce->ring, rq->head);
4215 	GEM_BUG_ON(head == ce->ring->tail);
4216 
4217 	/*
4218 	 * If this request hasn't started yet, e.g. it is waiting on a
4219 	 * semaphore, we need to avoid skipping the request or else we
4220 	 * break the signaling chain. However, if the context is corrupt
4221 	 * the request will not restart and we will be stuck with a wedged
4222 	 * device. It is quite often the case that if we issue a reset
4223 	 * while the GPU is loading the context image, the context
4224 	 * image becomes corrupt.
4225 	 *
4226 	 * Otherwise, if we have not started yet, the request should replay
4227 	 * perfectly and we do not need to flag the result as being erroneous.
4228 	 */
4229 	if (!i915_request_started(rq))
4230 		goto out_replay;
4231 
4232 	/*
4233 	 * If the request was innocent, we leave the request in the ELSP
4234 	 * and will try to replay it on restarting. The context image may
4235 	 * have been corrupted by the reset, in which case we may have
4236 	 * to service a new GPU hang, but more likely we can continue on
4237 	 * without impact.
4238 	 *
4239 	 * If the request was guilty, we presume the context is corrupt
4240 	 * and have to at least restore the RING register in the context
4241 	 * image back to the expected values to skip over the guilty request.
4242 	 */
4243 	__i915_request_reset(rq, stalled);
4244 
4245 	/*
4246 	 * We want a simple context + ring to execute the breadcrumb update.
4247 	 * We cannot rely on the context being intact across the GPU hang,
4248 	 * so clear it and rebuild just what we need for the breadcrumb.
4249 	 * All pending requests for this context will be zapped, and any
4250 	 * future request will be after userspace has had the opportunity
4251 	 * to recreate its own state.
4252 	 */
4253 out_replay:
4254 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4255 		     head, ce->ring->tail);
4256 	__execlists_reset_reg_state(ce, engine);
4257 	__execlists_update_reg_state(ce, engine, head);
4258 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4259 
4260 unwind:
4261 	/* Push back any incomplete requests for replay after the reset. */
4262 	cancel_port_requests(execlists);
4263 	__unwind_incomplete_requests(engine);
4264 }
4265 
4266 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4267 {
4268 	unsigned long flags;
4269 
4270 	ENGINE_TRACE(engine, "\n");
4271 
4272 	spin_lock_irqsave(&engine->active.lock, flags);
4273 
4274 	__execlists_reset(engine, stalled);
4275 
4276 	spin_unlock_irqrestore(&engine->active.lock, flags);
4277 }
4278 
4279 static void nop_submission_tasklet(unsigned long data)
4280 {
4281 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4282 
4283 	/* The driver is wedged; don't process any more events. */
4284 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4285 }
4286 
4287 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4288 {
4289 	struct intel_engine_execlists * const execlists = &engine->execlists;
4290 	struct i915_request *rq, *rn;
4291 	struct rb_node *rb;
4292 	unsigned long flags;
4293 
4294 	ENGINE_TRACE(engine, "\n");
4295 
4296 	/*
4297 	 * Before we call engine->cancel_requests(), we should have exclusive
4298 	 * access to the submission state. This is arranged for us by the
4299 	 * caller disabling the interrupt generation, the tasklet and other
4300 	 * threads that may then access the same state, giving us a free hand
4301 	 * to reset state. However, we still need to let lockdep be aware that
4302 	 * we know this state may be accessed in hardirq context, so we
4303 	 * disable the irq around this manipulation and we want to keep
4304 	 * the spinlock focused on its duties and not accidentally conflate
4305 	 * coverage to the submission's irq state. (Similarly, although we
4306 	 * shouldn't need to disable irq around the manipulation of the
4307 	 * submission's irq state, we also wish to remind ourselves that
4308 	 * it is irq state.)
4309 	 */
4310 	spin_lock_irqsave(&engine->active.lock, flags);
4311 
4312 	__execlists_reset(engine, true);
4313 
4314 	/* Mark all executing requests as skipped. */
4315 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4316 		mark_eio(rq);
4317 
4318 	/* Flush the queued requests to the timeline list (for retiring). */
4319 	while ((rb = rb_first_cached(&execlists->queue))) {
4320 		struct i915_priolist *p = to_priolist(rb);
4321 		int i;
4322 
4323 		priolist_for_each_request_consume(rq, rn, p, i) {
4324 			mark_eio(rq);
4325 			__i915_request_submit(rq);
4326 		}
4327 
4328 		rb_erase_cached(&p->node, &execlists->queue);
4329 		i915_priolist_free(p);
4330 	}
4331 
4332 	/* On-hold requests will be flushed to timeline upon their release */
4333 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4334 		mark_eio(rq);
4335 
4336 	/* Cancel all attached virtual engines */
4337 	while ((rb = rb_first_cached(&execlists->virtual))) {
4338 		struct virtual_engine *ve =
4339 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4340 
4341 		rb_erase_cached(rb, &execlists->virtual);
4342 		RB_CLEAR_NODE(rb);
4343 
4344 		spin_lock(&ve->base.active.lock);
4345 		rq = fetch_and_zero(&ve->request);
4346 		if (rq) {
4347 			mark_eio(rq);
4348 
4349 			rq->engine = engine;
4350 			__i915_request_submit(rq);
4351 			i915_request_put(rq);
4352 
4353 			ve->base.execlists.queue_priority_hint = INT_MIN;
4354 		}
4355 		spin_unlock(&ve->base.active.lock);
4356 	}
4357 
4358 	/* Remaining _unready_ requests will be nop'ed when submitted */
4359 
4360 	execlists->queue_priority_hint = INT_MIN;
4361 	execlists->queue = RB_ROOT_CACHED;
4362 
4363 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4364 	execlists->tasklet.func = nop_submission_tasklet;
4365 
4366 	spin_unlock_irqrestore(&engine->active.lock, flags);
4367 }
4368 
4369 static void execlists_reset_finish(struct intel_engine_cs *engine)
4370 {
4371 	struct intel_engine_execlists * const execlists = &engine->execlists;
4372 
4373 	/*
4374 	 * After a GPU reset, we may have requests to replay. Do so now while
4375 	 * we still have the forcewake to be sure that the GPU is not allowed
4376 	 * to sleep before we restart and reload a context.
4377 	 */
4378 	GEM_BUG_ON(!reset_in_progress(execlists));
4379 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4380 		execlists->tasklet.func(execlists->tasklet.data);
4381 
4382 	if (__tasklet_enable(&execlists->tasklet))
4383 		/* And kick in case we missed a new request submission. */
4384 		tasklet_hi_schedule(&execlists->tasklet);
4385 	ENGINE_TRACE(engine, "depth->%d\n",
4386 		     atomic_read(&execlists->tasklet.count));
4387 }
4388 
4389 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4390 				    u64 offset, u32 len,
4391 				    const unsigned int flags)
4392 {
4393 	u32 *cs;
4394 
4395 	cs = intel_ring_begin(rq, 4);
4396 	if (IS_ERR(cs))
4397 		return PTR_ERR(cs);
4398 
4399 	/*
4400 	 * WaDisableCtxRestoreArbitration:bdw,chv
4401 	 *
4402 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4403 	 * particular all the gens that do not need the w/a at all!), if we
4404 	 * took care to make sure that on every switch into this context
4405 	 * (both ordinary and for preemption) arbitration was enabled,
4406 	 * we would be fine.  However, for gen8 there is another w/a that
4407 	 * requires us to not preempt inside GPGPU execution, so we keep
4408 	 * arbitration disabled for gen8 batches. Arbitration will be
4409 	 * re-enabled before we close the request
4410 	 * (engine->emit_fini_breadcrumb).
4411 	 */
4412 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4413 
4414 	/* FIXME(BDW+): Address space and security selectors. */
4415 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4416 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4417 	*cs++ = lower_32_bits(offset);
4418 	*cs++ = upper_32_bits(offset);
4419 
4420 	intel_ring_advance(rq, cs);
4421 
4422 	return 0;
4423 }
4424 
4425 static int gen8_emit_bb_start(struct i915_request *rq,
4426 			      u64 offset, u32 len,
4427 			      const unsigned int flags)
4428 {
4429 	u32 *cs;
4430 
4431 	cs = intel_ring_begin(rq, 6);
4432 	if (IS_ERR(cs))
4433 		return PTR_ERR(cs);
4434 
4435 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4436 
4437 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4438 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4439 	*cs++ = lower_32_bits(offset);
4440 	*cs++ = upper_32_bits(offset);
4441 
4442 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4443 	*cs++ = MI_NOOP;
4444 
4445 	intel_ring_advance(rq, cs);
4446 
4447 	return 0;
4448 }
4449 
4450 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4451 {
4452 	ENGINE_WRITE(engine, RING_IMR,
4453 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4454 	ENGINE_POSTING_READ(engine, RING_IMR);
4455 }
4456 
4457 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4458 {
4459 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4460 }
4461 
4462 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4463 {
4464 	u32 cmd, *cs;
4465 
4466 	cs = intel_ring_begin(request, 4);
4467 	if (IS_ERR(cs))
4468 		return PTR_ERR(cs);
4469 
4470 	cmd = MI_FLUSH_DW + 1;
4471 
4472 	/* We always require a command barrier so that subsequent
4473 	 * commands, such as breadcrumb interrupts, are strictly ordered
4474 	 * wrt the contents of the write cache being flushed to memory
4475 	 * (and thus being coherent from the CPU).
4476 	 */
4477 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4478 
4479 	if (mode & EMIT_INVALIDATE) {
4480 		cmd |= MI_INVALIDATE_TLB;
4481 		if (request->engine->class == VIDEO_DECODE_CLASS)
4482 			cmd |= MI_INVALIDATE_BSD;
4483 	}
4484 
4485 	*cs++ = cmd;
4486 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4487 	*cs++ = 0; /* upper addr */
4488 	*cs++ = 0; /* value */
4489 	intel_ring_advance(request, cs);
4490 
4491 	return 0;
4492 }
4493 
4494 static int gen8_emit_flush_render(struct i915_request *request,
4495 				  u32 mode)
4496 {
4497 	bool vf_flush_wa = false, dc_flush_wa = false;
4498 	u32 *cs, flags = 0;
4499 	int len;
4500 
4501 	flags |= PIPE_CONTROL_CS_STALL;
4502 
4503 	if (mode & EMIT_FLUSH) {
4504 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4505 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4506 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4507 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4508 	}
4509 
4510 	if (mode & EMIT_INVALIDATE) {
4511 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4512 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4513 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4514 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4515 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4516 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4517 		flags |= PIPE_CONTROL_QW_WRITE;
4518 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4519 
4520 		/*
4521 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4522 		 * pipe control.
4523 		 */
4524 		if (IS_GEN(request->engine->i915, 9))
4525 			vf_flush_wa = true;
4526 
4527 		/* WaForGAMHang:kbl */
4528 		if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0))
4529 			dc_flush_wa = true;
4530 	}
4531 
4532 	len = 6;
4533 
4534 	if (vf_flush_wa)
4535 		len += 6;
4536 
4537 	if (dc_flush_wa)
4538 		len += 12;
4539 
4540 	cs = intel_ring_begin(request, len);
4541 	if (IS_ERR(cs))
4542 		return PTR_ERR(cs);
4543 
4544 	if (vf_flush_wa)
4545 		cs = gen8_emit_pipe_control(cs, 0, 0);
4546 
4547 	if (dc_flush_wa)
4548 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4549 					    0);
4550 
4551 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4552 
4553 	if (dc_flush_wa)
4554 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4555 
4556 	intel_ring_advance(request, cs);
4557 
4558 	return 0;
4559 }
4560 
4561 static int gen11_emit_flush_render(struct i915_request *request,
4562 				   u32 mode)
4563 {
4564 	if (mode & EMIT_FLUSH) {
4565 		u32 *cs;
4566 		u32 flags = 0;
4567 
4568 		flags |= PIPE_CONTROL_CS_STALL;
4569 
4570 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4571 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4572 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4573 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4574 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4575 		flags |= PIPE_CONTROL_QW_WRITE;
4576 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4577 
4578 		cs = intel_ring_begin(request, 6);
4579 		if (IS_ERR(cs))
4580 			return PTR_ERR(cs);
4581 
4582 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4583 		intel_ring_advance(request, cs);
4584 	}
4585 
4586 	if (mode & EMIT_INVALIDATE) {
4587 		u32 *cs;
4588 		u32 flags = 0;
4589 
4590 		flags |= PIPE_CONTROL_CS_STALL;
4591 
4592 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4593 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4594 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4595 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4596 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4597 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4598 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4599 		flags |= PIPE_CONTROL_QW_WRITE;
4600 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4601 
4602 		cs = intel_ring_begin(request, 6);
4603 		if (IS_ERR(cs))
4604 			return PTR_ERR(cs);
4605 
4606 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4607 		intel_ring_advance(request, cs);
4608 	}
4609 
4610 	return 0;
4611 }
4612 
4613 static u32 preparser_disable(bool state)
4614 {
4615 	return MI_ARB_CHECK | 1 << 8 | state;
4616 }
4617 
4618 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4619 {
4620 	static const i915_reg_t vd[] = {
4621 		GEN12_VD0_AUX_NV,
4622 		GEN12_VD1_AUX_NV,
4623 		GEN12_VD2_AUX_NV,
4624 		GEN12_VD3_AUX_NV,
4625 	};
4626 
4627 	static const i915_reg_t ve[] = {
4628 		GEN12_VE0_AUX_NV,
4629 		GEN12_VE1_AUX_NV,
4630 	};
4631 
4632 	if (engine->class == VIDEO_DECODE_CLASS)
4633 		return vd[engine->instance];
4634 
4635 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4636 		return ve[engine->instance];
4637 
4638 	GEM_BUG_ON("unknown aux_inv_reg\n");
4639 
4640 	return INVALID_MMIO_REG;
4641 }
4642 
4643 static u32 *
4644 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4645 {
4646 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4647 	*cs++ = i915_mmio_reg_offset(inv_reg);
4648 	*cs++ = AUX_INV;
4649 	*cs++ = MI_NOOP;
4650 
4651 	return cs;
4652 }
4653 
4654 static int gen12_emit_flush_render(struct i915_request *request,
4655 				   u32 mode)
4656 {
4657 	if (mode & EMIT_FLUSH) {
4658 		u32 flags = 0;
4659 		u32 *cs;
4660 
4661 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4662 		flags |= PIPE_CONTROL_FLUSH_L3;
4663 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4664 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4665 		/* Wa_1409600907:tgl */
4666 		flags |= PIPE_CONTROL_DEPTH_STALL;
4667 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4668 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4669 
4670 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4671 		flags |= PIPE_CONTROL_QW_WRITE;
4672 
4673 		flags |= PIPE_CONTROL_CS_STALL;
4674 
4675 		cs = intel_ring_begin(request, 6);
4676 		if (IS_ERR(cs))
4677 			return PTR_ERR(cs);
4678 
4679 		cs = gen12_emit_pipe_control(cs,
4680 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4681 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4682 		intel_ring_advance(request, cs);
4683 	}
4684 
4685 	if (mode & EMIT_INVALIDATE) {
4686 		u32 flags = 0;
4687 		u32 *cs;
4688 
4689 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4690 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4691 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4692 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4693 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4694 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4695 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4696 
4697 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4698 		flags |= PIPE_CONTROL_QW_WRITE;
4699 
4700 		flags |= PIPE_CONTROL_CS_STALL;
4701 
4702 		cs = intel_ring_begin(request, 8 + 4);
4703 		if (IS_ERR(cs))
4704 			return PTR_ERR(cs);
4705 
4706 		/*
4707 		 * Prevent the pre-parser from skipping past the TLB
4708 		 * invalidate and loading a stale page for the batch
4709 		 * buffer / request payload.
4710 		 */
4711 		*cs++ = preparser_disable(true);
4712 
4713 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4714 
4715 		/* hsdes: 1809175790 */
4716 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4717 
4718 		*cs++ = preparser_disable(false);
4719 		intel_ring_advance(request, cs);
4720 	}
4721 
4722 	return 0;
4723 }
4724 
4725 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4726 {
4727 	intel_engine_mask_t aux_inv = 0;
4728 	u32 cmd, *cs;
4729 
4730 	if (mode & EMIT_INVALIDATE)
4731 		aux_inv = request->engine->mask & ~BIT(BCS0);
4732 
4733 	cs = intel_ring_begin(request,
4734 			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4735 	if (IS_ERR(cs))
4736 		return PTR_ERR(cs);
4737 
4738 	cmd = MI_FLUSH_DW + 1;
4739 
4740 	/* We always require a command barrier so that subsequent
4741 	 * commands, such as breadcrumb interrupts, are strictly ordered
4742 	 * wrt the contents of the write cache being flushed to memory
4743 	 * (and thus being coherent from the CPU).
4744 	 */
4745 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4746 
4747 	if (mode & EMIT_INVALIDATE) {
4748 		cmd |= MI_INVALIDATE_TLB;
4749 		if (request->engine->class == VIDEO_DECODE_CLASS)
4750 			cmd |= MI_INVALIDATE_BSD;
4751 	}
4752 
4753 	*cs++ = cmd;
4754 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4755 	*cs++ = 0; /* upper addr */
4756 	*cs++ = 0; /* value */
4757 
4758 	if (aux_inv) { /* hsdes: 1809175790 */
4759 		struct intel_engine_cs *engine;
4760 		unsigned int tmp;
4761 
4762 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4763 		for_each_engine_masked(engine, request->engine->gt,
4764 				       aux_inv, tmp) {
4765 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4766 			*cs++ = AUX_INV;
4767 		}
4768 		*cs++ = MI_NOOP;
4769 	}
4770 	intel_ring_advance(request, cs);
4771 
4772 	return 0;
4773 }
4774 
4775 static void assert_request_valid(struct i915_request *rq)
4776 {
4777 	struct intel_ring *ring __maybe_unused = rq->ring;
4778 
4779 	/* Can we unwind this request without appearing to go forwards? */
4780 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4781 }
4782 
4783 /*
4784  * Reserve space for 2 NOOPs at the end of each request to be
4785  * used as a workaround for not being allowed to do lite
4786  * restore with HEAD==TAIL (WaIdleLiteRestore).
4787  */
4788 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4789 {
4790 	/* Ensure there's always at least one preemption point per-request. */
4791 	*cs++ = MI_ARB_CHECK;
4792 	*cs++ = MI_NOOP;
4793 	request->wa_tail = intel_ring_offset(request, cs);
4794 
4795 	/* Check that entire request is less than half the ring */
4796 	assert_request_valid(request);
4797 
4798 	return cs;
4799 }
4800 
4801 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4802 {
4803 	*cs++ = MI_SEMAPHORE_WAIT |
4804 		MI_SEMAPHORE_GLOBAL_GTT |
4805 		MI_SEMAPHORE_POLL |
4806 		MI_SEMAPHORE_SAD_EQ_SDD;
4807 	*cs++ = 0;
4808 	*cs++ = intel_hws_preempt_address(request->engine);
4809 	*cs++ = 0;
4810 
4811 	return cs;
4812 }
4813 
4814 static __always_inline u32*
4815 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4816 {
4817 	*cs++ = MI_USER_INTERRUPT;
4818 
4819 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4820 	if (intel_engine_has_semaphores(request->engine))
4821 		cs = emit_preempt_busywait(request, cs);
4822 
4823 	request->tail = intel_ring_offset(request, cs);
4824 	assert_ring_tail_valid(request->ring, request->tail);
4825 
4826 	return gen8_emit_wa_tail(request, cs);
4827 }
4828 
4829 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4830 {
4831 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4832 
4833 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4834 }
4835 
4836 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4837 {
4838 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4839 }
4840 
4841 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4842 {
4843 	cs = gen8_emit_pipe_control(cs,
4844 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4845 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4846 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4847 				    0);
4848 
4849 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4850 	cs = gen8_emit_ggtt_write_rcs(cs,
4851 				      request->fence.seqno,
4852 				      i915_request_active_timeline(request)->hwsp_offset,
4853 				      PIPE_CONTROL_FLUSH_ENABLE |
4854 				      PIPE_CONTROL_CS_STALL);
4855 
4856 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4857 }
4858 
4859 static u32 *
4860 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4861 {
4862 	cs = gen8_emit_ggtt_write_rcs(cs,
4863 				      request->fence.seqno,
4864 				      i915_request_active_timeline(request)->hwsp_offset,
4865 				      PIPE_CONTROL_CS_STALL |
4866 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4867 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4868 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4869 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4870 				      PIPE_CONTROL_FLUSH_ENABLE);
4871 
4872 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4873 }
4874 
4875 /*
4876  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4877  * flush and will continue pre-fetching the instructions after it before the
4878  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4879  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4880  * of the next request before the memory has been flushed, we're guaranteed that
4881  * we won't access the batch itself too early.
4882  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4883  * so, if the current request is modifying an instruction in the next request on
4884  * the same intel_context, we might pre-fetch and then execute the pre-update
4885  * instruction. To avoid this, the users of self-modifying code should either
4886  * disable the parser around the code emitting the memory writes, via a new flag
4887  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4888  * the in-kernel use-cases we've opted to use a separate context, see
4889  * reloc_gpu() as an example.
4890  * All the above applies only to the instructions themselves. Non-inline data
4891  * used by the instructions is not pre-fetched.
4892  */
4893 
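/*
 * Editor's illustrative sketch (not part of the driver): one way a caller
 * could apply the first option described above, bracketing the memory
 * writes that patch later instructions with the pre-parser toggle built by
 * preparser_disable(). The function name, the dword budget and the NOOP
 * placeholder are assumptions for this example only; the in-kernel users
 * instead emit such writes from a separate context.
 */
static int __maybe_unused example_emit_self_modifying_writes(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = preparser_disable(true);  /* stop pre-fetching past this point */
	*cs++ = MI_NOOP;                  /* the real memory writes would go here */
	*cs++ = preparser_disable(false); /* let the pre-parser run ahead again */
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}
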
4894 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4895 {
4896 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4897 		MI_SEMAPHORE_GLOBAL_GTT |
4898 		MI_SEMAPHORE_POLL |
4899 		MI_SEMAPHORE_SAD_EQ_SDD;
4900 	*cs++ = 0;
4901 	*cs++ = intel_hws_preempt_address(request->engine);
4902 	*cs++ = 0;
4903 	*cs++ = 0;
4904 	*cs++ = MI_NOOP;
4905 
4906 	return cs;
4907 }
4908 
4909 static __always_inline u32*
4910 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4911 {
4912 	*cs++ = MI_USER_INTERRUPT;
4913 
4914 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4915 	if (intel_engine_has_semaphores(request->engine))
4916 		cs = gen12_emit_preempt_busywait(request, cs);
4917 
4918 	request->tail = intel_ring_offset(request, cs);
4919 	assert_ring_tail_valid(request->ring, request->tail);
4920 
4921 	return gen8_emit_wa_tail(request, cs);
4922 }
4923 
4924 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4925 {
4926 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4927 }
4928 
4929 static u32 *
4930 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4931 {
4932 	cs = gen12_emit_ggtt_write_rcs(cs,
4933 				       request->fence.seqno,
4934 				       i915_request_active_timeline(request)->hwsp_offset,
4935 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4936 				       PIPE_CONTROL_CS_STALL |
4937 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4938 				       PIPE_CONTROL_FLUSH_L3 |
4939 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4940 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4941 				       /* Wa_1409600907:tgl */
4942 				       PIPE_CONTROL_DEPTH_STALL |
4943 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4944 				       PIPE_CONTROL_FLUSH_ENABLE);
4945 
4946 	return gen12_emit_fini_breadcrumb_tail(request, cs);
4947 }
4948 
4949 static void execlists_park(struct intel_engine_cs *engine)
4950 {
4951 	cancel_timer(&engine->execlists.timer);
4952 	cancel_timer(&engine->execlists.preempt);
4953 }
4954 
4955 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4956 {
4957 	engine->submit_request = execlists_submit_request;
4958 	engine->schedule = i915_schedule;
4959 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4960 
4961 	engine->reset.prepare = execlists_reset_prepare;
4962 	engine->reset.rewind = execlists_reset_rewind;
4963 	engine->reset.cancel = execlists_reset_cancel;
4964 	engine->reset.finish = execlists_reset_finish;
4965 
4966 	engine->park = execlists_park;
4967 	engine->unpark = NULL;
4968 
4969 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4970 	if (!intel_vgpu_active(engine->i915)) {
4971 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4972 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4973 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4974 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4975 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4976 		}
4977 	}
4978 
4979 	if (INTEL_GEN(engine->i915) >= 12)
4980 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4981 
4982 	if (intel_engine_has_preemption(engine))
4983 		engine->emit_bb_start = gen8_emit_bb_start;
4984 	else
4985 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4986 }
4987 
4988 static void execlists_shutdown(struct intel_engine_cs *engine)
4989 {
4990 	/* Synchronise with residual timers and any softirq they raise */
4991 	del_timer_sync(&engine->execlists.timer);
4992 	del_timer_sync(&engine->execlists.preempt);
4993 	tasklet_kill(&engine->execlists.tasklet);
4994 }
4995 
4996 static void execlists_release(struct intel_engine_cs *engine)
4997 {
4998 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
4999 
5000 	execlists_shutdown(engine);
5001 
5002 	intel_engine_cleanup_common(engine);
5003 	lrc_destroy_wa_ctx(engine);
5004 }
5005 
5006 static void
5007 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5008 {
5009 	/* Default vfuncs which can be overridden by each engine. */
5010 
5011 	engine->resume = execlists_resume;
5012 
5013 	engine->cops = &execlists_context_ops;
5014 	engine->request_alloc = execlists_request_alloc;
5015 
5016 	engine->emit_flush = gen8_emit_flush;
5017 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5018 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5019 	if (INTEL_GEN(engine->i915) >= 12) {
5020 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5021 		engine->emit_flush = gen12_emit_flush;
5022 	}
5023 	engine->set_default_submission = intel_execlists_set_default_submission;
5024 
5025 	if (INTEL_GEN(engine->i915) < 11) {
5026 		engine->irq_enable = gen8_logical_ring_enable_irq;
5027 		engine->irq_disable = gen8_logical_ring_disable_irq;
5028 	} else {
5029 		/*
5030 		 * TODO: On Gen11 interrupt masks need to be clear
5031 		 * to allow C6 entry. Keep interrupts enabled and
5032 		 * take the hit of generating extra interrupts
5033 		 * until a more refined solution exists.
5034 		 */
5035 	}
5036 }
5037 
5038 static inline void
5039 logical_ring_default_irqs(struct intel_engine_cs *engine)
5040 {
5041 	unsigned int shift = 0;
5042 
5043 	if (INTEL_GEN(engine->i915) < 11) {
5044 		const u8 irq_shifts[] = {
5045 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5046 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5047 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5048 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5049 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5050 		};
5051 
5052 		shift = irq_shifts[engine->id];
5053 	}
5054 
5055 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5056 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5057 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5058 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5059 }
5060 
5061 static void rcs_submission_override(struct intel_engine_cs *engine)
5062 {
5063 	switch (INTEL_GEN(engine->i915)) {
5064 	case 12:
5065 		engine->emit_flush = gen12_emit_flush_render;
5066 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5067 		break;
5068 	case 11:
5069 		engine->emit_flush = gen11_emit_flush_render;
5070 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5071 		break;
5072 	default:
5073 		engine->emit_flush = gen8_emit_flush_render;
5074 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5075 		break;
5076 	}
5077 }
5078 
5079 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5080 {
5081 	struct intel_engine_execlists * const execlists = &engine->execlists;
5082 	struct drm_i915_private *i915 = engine->i915;
5083 	struct intel_uncore *uncore = engine->uncore;
5084 	u32 base = engine->mmio_base;
5085 
5086 	tasklet_init(&engine->execlists.tasklet,
5087 		     execlists_submission_tasklet, (unsigned long)engine);
5088 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5089 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5090 
5091 	logical_ring_default_vfuncs(engine);
5092 	logical_ring_default_irqs(engine);
5093 
5094 	if (engine->class == RENDER_CLASS)
5095 		rcs_submission_override(engine);
5096 
5097 	if (intel_init_workaround_bb(engine))
5098 		/*
5099 		 * We continue even if we fail to initialize the WA batch,
5100 		 * because we only expect rare glitches and nothing
5101 		 * critical enough to prevent us from using the GPU.
5102 		 */
5103 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5104 
5105 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5106 		execlists->submit_reg = uncore->regs +
5107 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5108 		execlists->ctrl_reg = uncore->regs +
5109 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5110 	} else {
5111 		execlists->submit_reg = uncore->regs +
5112 			i915_mmio_reg_offset(RING_ELSP(base));
5113 	}
5114 
5115 	execlists->csb_status =
5116 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5117 
5118 	execlists->csb_write =
5119 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5120 
5121 	if (INTEL_GEN(i915) < 11)
5122 		execlists->csb_size = GEN8_CSB_ENTRIES;
5123 	else
5124 		execlists->csb_size = GEN11_CSB_ENTRIES;
5125 
5126 	if (INTEL_GEN(engine->i915) >= 11) {
5127 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5128 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5129 	}
5130 
5131 	/* Finally, take ownership and responsibility for cleanup! */
5132 	engine->sanitize = execlists_sanitize;
5133 	engine->release = execlists_release;
5134 
5135 	return 0;
5136 }
5137 
5138 static void init_common_reg_state(u32 * const regs,
5139 				  const struct intel_engine_cs *engine,
5140 				  const struct intel_ring *ring,
5141 				  bool inhibit)
5142 {
5143 	u32 ctl;
5144 
5145 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5146 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5147 	if (inhibit)
5148 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5149 	if (INTEL_GEN(engine->i915) < 11)
5150 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5151 					   CTX_CTRL_RS_CTX_ENABLE);
5152 	regs[CTX_CONTEXT_CONTROL] = ctl;
5153 
5154 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5155 	regs[CTX_TIMESTAMP] = 0;
5156 }
5157 
5158 static void init_wa_bb_reg_state(u32 * const regs,
5159 				 const struct intel_engine_cs *engine)
5160 {
5161 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5162 
5163 	if (wa_ctx->per_ctx.size) {
5164 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5165 
5166 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5167 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5168 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5169 	}
5170 
5171 	if (wa_ctx->indirect_ctx.size) {
5172 		lrc_ring_setup_indirect_ctx(regs, engine,
5173 					    i915_ggtt_offset(wa_ctx->vma) +
5174 					    wa_ctx->indirect_ctx.offset,
5175 					    wa_ctx->indirect_ctx.size);
5176 	}
5177 }
5178 
5179 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5180 {
5181 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5182 		/* 64b PPGTT (48bit canonical)
5183 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
5184 		 * other PDP Descriptors are ignored.
5185 		 */
5186 		ASSIGN_CTX_PML4(ppgtt, regs);
5187 	} else {
5188 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5189 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5190 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5191 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5192 	}
5193 }
5194 
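/*
 * The register state always wants a real ppgtt for its PDP/PML4 entries;
 * if the context was created against the GGTT, use the aliasing-ppgtt
 * hiding behind it instead.
 */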
5195 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5196 {
5197 	if (i915_is_ggtt(vm))
5198 		return i915_vm_to_ggtt(vm)->alias;
5199 	else
5200 		return i915_vm_to_ppgtt(vm);
5201 }
5202 
5203 static void execlists_init_reg_state(u32 *regs,
5204 				     const struct intel_context *ce,
5205 				     const struct intel_engine_cs *engine,
5206 				     const struct intel_ring *ring,
5207 				     bool inhibit)
5208 {
5209 	/*
5210 	 * A context is actually a big batch buffer with several
5211 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5212 	 * values we are setting here are only for the first context restore:
5213 	 * on a subsequent save, the GPU will recreate this batch buffer with new
5214 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5215 	 * we are not initializing here).
5216 	 *
5217 	 * Must keep consistent with virtual_update_register_offsets().
5218 	 */
5219 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5220 
5221 	init_common_reg_state(regs, engine, ring, inhibit);
5222 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5223 
5224 	init_wa_bb_reg_state(regs, engine);
5225 
5226 	__reset_stop_ring(regs, engine);
5227 }
5228 
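/*
 * Fill in a freshly allocated context object: map it, copy in the engine's
 * default context image (when one exists), clear the per-process HWSP and
 * then write the register state that must be valid before the first
 * context restore.
 */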
5229 static int
5230 populate_lr_context(struct intel_context *ce,
5231 		    struct drm_i915_gem_object *ctx_obj,
5232 		    struct intel_engine_cs *engine,
5233 		    struct intel_ring *ring)
5234 {
5235 	bool inhibit = true;
5236 	void *vaddr;
5237 
5238 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5239 	if (IS_ERR(vaddr)) {
5240 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5241 		return PTR_ERR(vaddr);
5242 	}
5243 
5244 	set_redzone(vaddr, engine);
5245 
5246 	if (engine->default_state) {
5247 		shmem_read(engine->default_state, 0,
5248 			   vaddr, engine->context_size);
5249 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5250 		inhibit = false;
5251 	}
5252 
5253 	/* Clear the ppHWSP (inc. per-context counters) */
5254 	memset(vaddr, 0, PAGE_SIZE);
5255 
5256 	/*
5257 	 * The second page of the context object contains some registers which
5258 	 * must be set up prior to the first execution.
5259 	 */
5260 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5261 				 ce, engine, ring, inhibit);
5262 
5263 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5264 	i915_gem_object_unpin_map(ctx_obj);
5265 	return 0;
5266 }
5267 
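/*
 * Allocate the backing store for a logical ring context: the state object
 * (engine context size, plus a debug redzone and, on Gen12, an extra page
 * for the per-context workaround batch), a timeline and a ring, and then
 * populate the initial register state.
 */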
5268 static int __execlists_context_alloc(struct intel_context *ce,
5269 				     struct intel_engine_cs *engine)
5270 {
5271 	struct drm_i915_gem_object *ctx_obj;
5272 	struct intel_ring *ring;
5273 	struct i915_vma *vma;
5274 	u32 context_size;
5275 	int ret;
5276 
5277 	GEM_BUG_ON(ce->state);
5278 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5279 
5280 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5281 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5282 
5283 	if (INTEL_GEN(engine->i915) == 12) {
5284 		ce->wa_bb_page = context_size / PAGE_SIZE;
5285 		context_size += PAGE_SIZE;
5286 	}
5287 
5288 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5289 	if (IS_ERR(ctx_obj))
5290 		return PTR_ERR(ctx_obj);
5291 
5292 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5293 	if (IS_ERR(vma)) {
5294 		ret = PTR_ERR(vma);
5295 		goto error_deref_obj;
5296 	}
5297 
5298 	if (!ce->timeline) {
5299 		struct intel_timeline *tl;
5300 		struct i915_vma *hwsp;
5301 
5302 		/*
5303 		 * Use the static global HWSP for the kernel context, and
5304 		 * a dynamically allocated cacheline for everyone else.
5305 		 */
5306 		hwsp = NULL;
5307 		if (unlikely(intel_context_is_barrier(ce)))
5308 			hwsp = engine->status_page.vma;
5309 
5310 		tl = intel_timeline_create(engine->gt, hwsp);
5311 		if (IS_ERR(tl)) {
5312 			ret = PTR_ERR(tl);
5313 			goto error_deref_obj;
5314 		}
5315 
5316 		ce->timeline = tl;
5317 	}
5318 
5319 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5320 	if (IS_ERR(ring)) {
5321 		ret = PTR_ERR(ring);
5322 		goto error_deref_obj;
5323 	}
5324 
5325 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5326 	if (ret) {
5327 		drm_dbg(&engine->i915->drm,
5328 			"Failed to populate LRC: %d\n", ret);
5329 		goto error_ring_free;
5330 	}
5331 
5332 	ce->ring = ring;
5333 	ce->state = vma;
5334 
5335 	return 0;
5336 
5337 error_ring_free:
5338 	intel_ring_put(ring);
5339 error_deref_obj:
5340 	i915_gem_object_put(ctx_obj);
5341 	return ret;
5342 }
5343 
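/*
 * A virtual engine only carries a single pending request at a time; it
 * borrows the request list of its (otherwise unused) default priolist to
 * hold it.
 */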
5344 static struct list_head *virtual_queue(struct virtual_engine *ve)
5345 {
5346 	return &ve->base.execlists.default_priolist.requests[0];
5347 }
5348 
5349 static void virtual_context_destroy(struct kref *kref)
5350 {
5351 	struct virtual_engine *ve =
5352 		container_of(kref, typeof(*ve), context.ref);
5353 	unsigned int n;
5354 
5355 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5356 	GEM_BUG_ON(ve->request);
5357 	GEM_BUG_ON(ve->context.inflight);
5358 
5359 	for (n = 0; n < ve->num_siblings; n++) {
5360 		struct intel_engine_cs *sibling = ve->siblings[n];
5361 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5362 		unsigned long flags;
5363 
5364 		if (RB_EMPTY_NODE(node))
5365 			continue;
5366 
5367 		spin_lock_irqsave(&sibling->active.lock, flags);
5368 
5369 		/* Detachment is lazily performed in the execlists tasklet */
5370 		if (!RB_EMPTY_NODE(node))
5371 			rb_erase_cached(node, &sibling->execlists.virtual);
5372 
5373 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5374 	}
5375 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5376 
5377 	if (ve->context.state)
5378 		__execlists_context_fini(&ve->context);
5379 	intel_context_fini(&ve->context);
5380 
5381 	intel_engine_free_request_pool(&ve->base);
5382 
5383 	kfree(ve->bonds);
5384 	kfree(ve);
5385 }
5386 
5387 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5388 {
5389 	int swp;
5390 
5391 	/*
5392 	 * Pick a random sibling to start with, to help spread the load around.
5393 	 *
5394 	 * New contexts are typically created with exactly the same order
5395 	 * of siblings, and often started in batches. Due to the way we iterate
5396 	 * the array of siblings when submitting requests, siblings[0] is
5397 	 * prioritised for dequeuing. If we make sure that siblings[0] is fairly
5398 	 * randomised across the system, we also spread the load because the
5399 	 * first engine we inspect differs each time.
5400 	 *
5401 	 * NB This does not force us to execute on this engine; it will just
5402 	 * typically be the first we inspect for submission.
5403 	 */
5404 	swp = prandom_u32_max(ve->num_siblings);
5405 	if (!swp)
5406 		return;
5407 
5408 	swap(ve->siblings[swp], ve->siblings[0]);
5409 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
5410 		virtual_update_register_offsets(ve->context.lrc_reg_state,
5411 						ve->siblings[0]);
5412 }
5413 
5414 static int virtual_context_alloc(struct intel_context *ce)
5415 {
5416 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5417 
5418 	return __execlists_context_alloc(ce, ve->siblings[0]);
5419 }
5420 
5421 static int virtual_context_pin(struct intel_context *ce)
5422 {
5423 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5424 	int err;
5425 
5426 	/* Note: we must use a real engine class for setting up reg state */
5427 	err = __execlists_context_pin(ce, ve->siblings[0]);
5428 	if (err)
5429 		return err;
5430 
5431 	virtual_engine_initial_hint(ve);
5432 	return 0;
5433 }
5434 
5435 static void virtual_context_enter(struct intel_context *ce)
5436 {
5437 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5438 	unsigned int n;
5439 
5440 	for (n = 0; n < ve->num_siblings; n++)
5441 		intel_engine_pm_get(ve->siblings[n]);
5442 
5443 	intel_timeline_enter(ce->timeline);
5444 }
5445 
5446 static void virtual_context_exit(struct intel_context *ce)
5447 {
5448 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5449 	unsigned int n;
5450 
5451 	intel_timeline_exit(ce->timeline);
5452 
5453 	for (n = 0; n < ve->num_siblings; n++)
5454 		intel_engine_pm_put(ve->siblings[n]);
5455 }
5456 
5457 static const struct intel_context_ops virtual_context_ops = {
5458 	.alloc = virtual_context_alloc,
5459 
5460 	.pin = virtual_context_pin,
5461 	.unpin = execlists_context_unpin,
5462 
5463 	.enter = virtual_context_enter,
5464 	.exit = virtual_context_exit,
5465 
5466 	.destroy = virtual_context_destroy,
5467 };
5468 
5469 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5470 {
5471 	struct i915_request *rq;
5472 	intel_engine_mask_t mask;
5473 
5474 	rq = READ_ONCE(ve->request);
5475 	if (!rq)
5476 		return 0;
5477 
5478 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5479 	mask = rq->execution_mask;
5480 	if (unlikely(!mask)) {
5481 		/* Invalid selection: flag the error and fall back to siblings[0] */
5482 		i915_request_set_error_once(rq, -ENODEV);
5483 		mask = ve->siblings[0]->mask;
5484 	}
5485 
5486 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5487 		     rq->fence.context, rq->fence.seqno,
5488 		     mask, ve->base.execlists.queue_priority_hint);
5489 
5490 	return mask;
5491 }
5492 
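/*
 * Offer the pending virtual request to every sibling allowed by its
 * execution mask: (re)insert this virtual engine's node into each sibling's
 * rbtree of virtual requests, ordered by priority, and kick the sibling's
 * tasklet if the request would now be its highest priority work. The first
 * sibling to dequeue the request wins.
 */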
5493 static void virtual_submission_tasklet(unsigned long data)
5494 {
5495 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5496 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5497 	intel_engine_mask_t mask;
5498 	unsigned int n;
5499 
5500 	rcu_read_lock();
5501 	mask = virtual_submission_mask(ve);
5502 	rcu_read_unlock();
5503 	if (unlikely(!mask))
5504 		return;
5505 
5506 	local_irq_disable();
5507 	for (n = 0; n < ve->num_siblings; n++) {
5508 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5509 		struct ve_node * const node = &ve->nodes[sibling->id];
5510 		struct rb_node **parent, *rb;
5511 		bool first;
5512 
5513 		if (!READ_ONCE(ve->request))
5514 			break; /* already handled by a sibling's tasklet */
5515 
5516 		if (unlikely(!(mask & sibling->mask))) {
5517 			if (!RB_EMPTY_NODE(&node->rb)) {
5518 				spin_lock(&sibling->active.lock);
5519 				rb_erase_cached(&node->rb,
5520 						&sibling->execlists.virtual);
5521 				RB_CLEAR_NODE(&node->rb);
5522 				spin_unlock(&sibling->active.lock);
5523 			}
5524 			continue;
5525 		}
5526 
5527 		spin_lock(&sibling->active.lock);
5528 
5529 		if (!RB_EMPTY_NODE(&node->rb)) {
5530 			/*
5531 			 * Cheat and avoid rebalancing the tree if we can
5532 			 * reuse this node in situ.
5533 			 */
5534 			first = rb_first_cached(&sibling->execlists.virtual) ==
5535 				&node->rb;
5536 			if (prio == node->prio || (prio > node->prio && first))
5537 				goto submit_engine;
5538 
5539 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5540 		}
5541 
5542 		rb = NULL;
5543 		first = true;
5544 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5545 		while (*parent) {
5546 			struct ve_node *other;
5547 
5548 			rb = *parent;
5549 			other = rb_entry(rb, typeof(*other), rb);
5550 			if (prio > other->prio) {
5551 				parent = &rb->rb_left;
5552 			} else {
5553 				parent = &rb->rb_right;
5554 				first = false;
5555 			}
5556 		}
5557 
5558 		rb_link_node(&node->rb, rb, parent);
5559 		rb_insert_color_cached(&node->rb,
5560 				       &sibling->execlists.virtual,
5561 				       first);
5562 
5563 submit_engine:
5564 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5565 		node->prio = prio;
5566 		if (first && prio > sibling->execlists.queue_priority_hint)
5567 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5568 
5569 		spin_unlock(&sibling->active.lock);
5570 	}
5571 	local_irq_enable();
5572 }
5573 
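/*
 * Submission hook for the virtual engine: stash the request in ve->request
 * (flushing any already-completed request left over from preempt-to-busy)
 * and schedule the virtual tasklet to offer it to the siblings. A request
 * that has already completed is submitted directly and never queued.
 */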
5574 static void virtual_submit_request(struct i915_request *rq)
5575 {
5576 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5577 	struct i915_request *old;
5578 	unsigned long flags;
5579 
5580 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5581 		     rq->fence.context,
5582 		     rq->fence.seqno);
5583 
5584 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5585 
5586 	spin_lock_irqsave(&ve->base.active.lock, flags);
5587 
5588 	old = ve->request;
5589 	if (old) { /* background completion event from preempt-to-busy */
5590 		GEM_BUG_ON(!i915_request_completed(old));
5591 		__i915_request_submit(old);
5592 		i915_request_put(old);
5593 	}
5594 
5595 	if (i915_request_completed(rq)) {
5596 		__i915_request_submit(rq);
5597 
5598 		ve->base.execlists.queue_priority_hint = INT_MIN;
5599 		ve->request = NULL;
5600 	} else {
5601 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5602 		ve->request = i915_request_get(rq);
5603 
5604 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5605 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5606 
5607 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5608 	}
5609 
5610 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5611 }
5612 
5613 static struct ve_bond *
5614 virtual_find_bond(struct virtual_engine *ve,
5615 		  const struct intel_engine_cs *master)
5616 {
5617 	int i;
5618 
5619 	for (i = 0; i < ve->num_bonds; i++) {
5620 		if (ve->bonds[i].master == master)
5621 			return &ve->bonds[i];
5622 	}
5623 
5624 	return NULL;
5625 }
5626 
5627 static void
5628 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5629 {
5630 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5631 	intel_engine_mask_t allowed, exec;
5632 	struct ve_bond *bond;
5633 
5634 	allowed = ~to_request(signal)->engine->mask;
5635 
5636 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5637 	if (bond)
5638 		allowed &= bond->sibling_mask;
5639 
5640 	/* Restrict the bonded request to run on only the available engines */
5641 	exec = READ_ONCE(rq->execution_mask);
5642 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5643 		;
5644 
5645 	/* Prevent the master from being re-run on the bonded engines */
5646 	to_request(signal)->execution_mask &= ~allowed;
5647 }
5648 
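/*
 * Create a context backed by a virtual engine that load-balances its
 * requests across the given set of physical siblings; a single sibling
 * degenerates into an ordinary context on that engine. A sketch of a
 * caller (the engine pointers below are placeholders, not real symbols):
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 */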
5649 struct intel_context *
5650 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5651 			       unsigned int count)
5652 {
5653 	struct virtual_engine *ve;
5654 	unsigned int n;
5655 	int err;
5656 
5657 	if (count == 0)
5658 		return ERR_PTR(-EINVAL);
5659 
5660 	if (count == 1)
5661 		return intel_context_create(siblings[0]);
5662 
5663 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5664 	if (!ve)
5665 		return ERR_PTR(-ENOMEM);
5666 
5667 	ve->base.i915 = siblings[0]->i915;
5668 	ve->base.gt = siblings[0]->gt;
5669 	ve->base.uncore = siblings[0]->uncore;
5670 	ve->base.id = -1;
5671 
5672 	ve->base.class = OTHER_CLASS;
5673 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5674 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5675 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5676 
5677 	/*
5678 	 * The decision on whether to submit a request using semaphores
5679 	 * depends on the saturated state of the engine. We only compute
5680 	 * this during HW submission of the request, and we need this
5681 	 * state to be globally applied to all requests being submitted
5682 	 * to this engine. Virtual engines encompass more than one physical
5683 	 * engine and so we cannot accurately tell in advance if one of those
5684 	 * engines is already saturated and so cannot afford to use a semaphore
5685 	 * and be pessimized in priority for doing so -- if we are the only
5686 	 * context using semaphores after all other clients have stopped, we
5687 	 * will be starved on the saturated system. Such a global switch for
5688 	 * semaphores is less than ideal, but alas is the current compromise.
5689 	 */
5690 	ve->base.saturated = ALL_ENGINES;
5691 
5692 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5693 
5694 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5695 	intel_engine_init_breadcrumbs(&ve->base);
5696 	intel_engine_init_execlists(&ve->base);
5697 
5698 	ve->base.cops = &virtual_context_ops;
5699 	ve->base.request_alloc = execlists_request_alloc;
5700 
5701 	ve->base.schedule = i915_schedule;
5702 	ve->base.submit_request = virtual_submit_request;
5703 	ve->base.bond_execute = virtual_bond_execute;
5704 
5705 	INIT_LIST_HEAD(virtual_queue(ve));
5706 	ve->base.execlists.queue_priority_hint = INT_MIN;
5707 	tasklet_init(&ve->base.execlists.tasklet,
5708 		     virtual_submission_tasklet,
5709 		     (unsigned long)ve);
5710 
5711 	intel_context_init(&ve->context, &ve->base);
5712 
5713 	for (n = 0; n < count; n++) {
5714 		struct intel_engine_cs *sibling = siblings[n];
5715 
5716 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5717 		if (sibling->mask & ve->base.mask) {
5718 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5719 				  sibling->name);
5720 			err = -EINVAL;
5721 			goto err_put;
5722 		}
5723 
5724 		/*
5725 		 * The virtual engine implementation is tightly coupled to
5726 	 * the execlists backend -- we push requests directly
5727 		 * into a tree inside each physical engine. We could support
5728 		 * layering if we handle cloning of the requests and
5729 		 * submitting a copy into each backend.
5730 		 */
5731 		if (sibling->execlists.tasklet.func !=
5732 		    execlists_submission_tasklet) {
5733 			err = -ENODEV;
5734 			goto err_put;
5735 		}
5736 
5737 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5738 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5739 
5740 		ve->siblings[ve->num_siblings++] = sibling;
5741 		ve->base.mask |= sibling->mask;
5742 
5743 		/*
5744 		 * All physical engines must be compatible for their emission
5745 		 * functions (as we build the instructions during request
5746 		 * construction and do not alter them before submission
5747 		 * on the physical engine). We use the engine class as a guide
5748 		 * here, although that could be refined.
5749 		 */
5750 		if (ve->base.class != OTHER_CLASS) {
5751 			if (ve->base.class != sibling->class) {
5752 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5753 					  sibling->class, ve->base.class);
5754 				err = -EINVAL;
5755 				goto err_put;
5756 			}
5757 			continue;
5758 		}
5759 
5760 		ve->base.class = sibling->class;
5761 		ve->base.uabi_class = sibling->uabi_class;
5762 		snprintf(ve->base.name, sizeof(ve->base.name),
5763 			 "v%dx%d", ve->base.class, count);
5764 		ve->base.context_size = sibling->context_size;
5765 
5766 		ve->base.emit_bb_start = sibling->emit_bb_start;
5767 		ve->base.emit_flush = sibling->emit_flush;
5768 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5769 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5770 		ve->base.emit_fini_breadcrumb_dw =
5771 			sibling->emit_fini_breadcrumb_dw;
5772 
5773 		ve->base.flags = sibling->flags;
5774 	}
5775 
5776 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5777 
5778 	return &ve->context;
5779 
5780 err_put:
5781 	intel_context_put(&ve->context);
5782 	return ERR_PTR(err);
5783 }
5784 
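/*
 * Duplicate an existing virtual engine for a new context: the same set of
 * siblings, plus a copy of any bonds that were attached to the original.
 */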
5785 struct intel_context *
5786 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5787 {
5788 	struct virtual_engine *se = to_virtual_engine(src);
5789 	struct intel_context *dst;
5790 
5791 	dst = intel_execlists_create_virtual(se->siblings,
5792 					     se->num_siblings);
5793 	if (IS_ERR(dst))
5794 		return dst;
5795 
5796 	if (se->num_bonds) {
5797 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5798 
5799 		de->bonds = kmemdup(se->bonds,
5800 				    sizeof(*se->bonds) * se->num_bonds,
5801 				    GFP_KERNEL);
5802 		if (!de->bonds) {
5803 			intel_context_put(dst);
5804 			return ERR_PTR(-ENOMEM);
5805 		}
5806 
5807 		de->num_bonds = se->num_bonds;
5808 	}
5809 
5810 	return dst;
5811 }
5812 
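/*
 * Record a bond: a request on this virtual engine that is bonded to @master
 * (via a submit fence) may only execute on the siblings accumulated for that
 * master; repeated calls for the same master extend its sibling mask.
 * Illustrative usage (the engine pointers are placeholders):
 *
 *	err = intel_virtual_engine_attach_bond(virtual, master, sibling);
 *	if (err)
 *		return err;
 */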
5813 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5814 				     const struct intel_engine_cs *master,
5815 				     const struct intel_engine_cs *sibling)
5816 {
5817 	struct virtual_engine *ve = to_virtual_engine(engine);
5818 	struct ve_bond *bond;
5819 	int n;
5820 
5821 	/* Sanity check the sibling is part of the virtual engine */
5822 	for (n = 0; n < ve->num_siblings; n++)
5823 		if (sibling == ve->siblings[n])
5824 			break;
5825 	if (n == ve->num_siblings)
5826 		return -EINVAL;
5827 
5828 	bond = virtual_find_bond(ve, master);
5829 	if (bond) {
5830 		bond->sibling_mask |= sibling->mask;
5831 		return 0;
5832 	}
5833 
5834 	bond = krealloc(ve->bonds,
5835 			sizeof(*bond) * (ve->num_bonds + 1),
5836 			GFP_KERNEL);
5837 	if (!bond)
5838 		return -ENOMEM;
5839 
5840 	bond[ve->num_bonds].master = master;
5841 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5842 
5843 	ve->bonds = bond;
5844 	ve->num_bonds++;
5845 
5846 	return 0;
5847 }
5848 
5849 struct intel_engine_cs *
5850 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5851 				 unsigned int sibling)
5852 {
5853 	struct virtual_engine *ve = to_virtual_engine(engine);
5854 
5855 	if (sibling >= ve->num_siblings)
5856 		return NULL;
5857 
5858 	return ve->siblings[sibling];
5859 }
5860 
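/*
 * Debug pretty-printer: dump up to @max of the engine's executing, queued
 * and pending virtual requests, eliding the middle of over-long lists.
 */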
5861 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5862 				   struct drm_printer *m,
5863 				   void (*show_request)(struct drm_printer *m,
5864 							struct i915_request *rq,
5865 							const char *prefix),
5866 				   unsigned int max)
5867 {
5868 	const struct intel_engine_execlists *execlists = &engine->execlists;
5869 	struct i915_request *rq, *last;
5870 	unsigned long flags;
5871 	unsigned int count;
5872 	struct rb_node *rb;
5873 
5874 	spin_lock_irqsave(&engine->active.lock, flags);
5875 
5876 	last = NULL;
5877 	count = 0;
5878 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5879 		if (count++ < max - 1)
5880 			show_request(m, rq, "\t\tE ");
5881 		else
5882 			last = rq;
5883 	}
5884 	if (last) {
5885 		if (count > max) {
5886 			drm_printf(m,
5887 				   "\t\t...skipping %d executing requests...\n",
5888 				   count - max);
5889 		}
5890 		show_request(m, last, "\t\tE ");
5891 	}
5892 
5893 	if (execlists->switch_priority_hint != INT_MIN)
5894 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5895 			   READ_ONCE(execlists->switch_priority_hint));
5896 	if (execlists->queue_priority_hint != INT_MIN)
5897 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5898 			   READ_ONCE(execlists->queue_priority_hint));
5899 
5900 	last = NULL;
5901 	count = 0;
5902 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5903 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5904 		int i;
5905 
5906 		priolist_for_each_request(rq, p, i) {
5907 			if (count++ < max - 1)
5908 				show_request(m, rq, "\t\tQ ");
5909 			else
5910 				last = rq;
5911 		}
5912 	}
5913 	if (last) {
5914 		if (count > max) {
5915 			drm_printf(m,
5916 				   "\t\t...skipping %d queued requests...\n",
5917 				   count - max);
5918 		}
5919 		show_request(m, last, "\t\tQ ");
5920 	}
5921 
5922 	last = NULL;
5923 	count = 0;
5924 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5925 		struct virtual_engine *ve =
5926 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5927 		struct i915_request *rq = READ_ONCE(ve->request);
5928 
5929 		if (rq) {
5930 			if (count++ < max - 1)
5931 				show_request(m, rq, "\t\tV ");
5932 			else
5933 				last = rq;
5934 		}
5935 	}
5936 	if (last) {
5937 		if (count > max) {
5938 			drm_printf(m,
5939 				   "\t\t...skipping %d virtual requests...\n",
5940 				   count - max);
5941 		}
5942 		show_request(m, last, "\t\tV ");
5943 	}
5944 
5945 	spin_unlock_irqrestore(&engine->active.lock, flags);
5946 }
5947 
5948 void intel_lr_context_reset(struct intel_engine_cs *engine,
5949 			    struct intel_context *ce,
5950 			    u32 head,
5951 			    bool scrub)
5952 {
5953 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5954 
5955 	/*
5956 	 * We want a simple context + ring to execute the breadcrumb update.
5957 	 * We cannot rely on the context being intact across the GPU hang,
5958 	 * so clear it and rebuild just what we need for the breadcrumb.
5959 	 * All pending requests for this context will be zapped, and any
5960 	 * future request will be after userspace has had the opportunity
5961 	 * to recreate its own state.
5962 	 */
5963 	if (scrub)
5964 		restore_default_state(ce, engine);
5965 
5966 	/* Rerun the request; its payload has been neutered (if guilty). */
5967 	__execlists_update_reg_state(ce, engine, head);
5968 }
5969 
5970 bool
5971 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5972 {
5973 	return engine->set_default_submission ==
5974 	       intel_execlists_set_default_submission;
5975 }
5976 
5977 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5978 #include "selftest_lrc.c"
5979 #endif
5980