xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 0b6613c6)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers are per-context (and not per-engine, like before)
67  * and contexts are uniquely tied to a given engine (and not reusable,
68  * like before), we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
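
/*
 * Illustrative sketch (hypothetical types and names, not driver code):
 * the pairing rule above -- a context may not appear twice in one
 * execution list -- amounts to discarding queued requests that reuse
 * the context at the head of the queue and then pairing the survivor
 * with the first request of a different context, if any:
 *
 *	struct example_request {
 *		struct example_request *next;
 *		unsigned long long context_id;
 *	};
 *
 *	static void example_build_execlist(struct example_request *head,
 *					   struct example_request *elsp[2])
 *	{
 *		// Later requests of the same context subsume the earlier
 *		// tails, so duplicates at the head can be discarded.
 *		while (head->next &&
 *		       head->next->context_id == head->context_id)
 *			head = head->next;
 *
 *		elsp[0] = head;
 *		elsp[1] = head->next; // first different context, or NULL
 *	}
 */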
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
177 
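/*
 * For illustration: GEN12_CSB_CTX_VALID() extracts the SW context ID
 * field (bits 15-25 of the upper CSB dword) and treats the all-ones
 * value 0x7ff as "no context attached", e.g.
 *
 *	FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, 0x03ff8000) == 0x7ff -> not valid
 */
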
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 
181 struct virtual_engine {
182 	struct intel_engine_cs base;
183 	struct intel_context context;
184 
185 	/*
186 	 * We allow only a single request through the virtual engine at a time
187 	 * (each request in the timeline waits for the completion fence of
188 	 * the previous before being submitted). By restricting ourselves to
189 	 * only submitting a single request, each request is placed onto a
190 	 * physical engine to maximise load spreading (by virtue of the late greedy
191 	 * scheduling -- each real engine takes the next available request
192 	 * upon idling).
193 	 */
194 	struct i915_request *request;
195 
196 	/*
197 	 * We keep a rbtree of available virtual engines inside each physical
198 	 * engine, sorted by priority. Here we preallocate the nodes we need
199 	 * for the virtual engine, indexed by physical_engine->id.
200 	 */
201 	struct ve_node {
202 		struct rb_node rb;
203 		int prio;
204 	} nodes[I915_NUM_ENGINES];
205 
206 	/*
207 	 * Keep track of bonded pairs -- restrictions upon our selection of
208 	 * physical engines to which any particular request may be submitted.
209 	 * If we receive a submit-fence from a master engine, we will only
210 	 * use one of the sibling_mask physical engines.
211 	 */
212 	struct ve_bond {
213 		const struct intel_engine_cs *master;
214 		intel_engine_mask_t sibling_mask;
215 	} *bonds;
216 	unsigned int num_bonds;
217 
218 	/* And finally, which physical engines this virtual engine maps onto. */
219 	unsigned int num_siblings;
220 	struct intel_engine_cs *siblings[];
221 };
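
/*
 * A minimal sketch (hypothetical helper, not part of this file's API)
 * of how the bonding description above may be consumed: resolving the
 * restriction for a submit-fence from a given master engine is a
 * linear scan of the preallocated bonds array.
 */
static __maybe_unused const struct ve_bond *
example_find_bond(const struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	unsigned int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL; /* no restriction recorded for this master */
}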
222 
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 	return container_of(engine, struct virtual_engine, base);
227 }
228 
229 static int __execlists_context_alloc(struct intel_context *ce,
230 				     struct intel_engine_cs *engine);
231 
232 static void execlists_init_reg_state(u32 *reg_state,
233 				     const struct intel_context *ce,
234 				     const struct intel_engine_cs *engine,
235 				     const struct intel_ring *ring,
236 				     bool close);
237 static void
238 __execlists_update_reg_state(const struct intel_context *ce,
239 			     const struct intel_engine_cs *engine,
240 			     u32 head);
241 
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
243 {
244 	if (INTEL_GEN(engine->i915) >= 12)
245 		return 0x60;
246 	else if (INTEL_GEN(engine->i915) >= 9)
247 		return 0x54;
248 	else if (engine->class == RENDER_CLASS)
249 		return 0x58;
250 	else
251 		return -1;
252 }
253 
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
255 {
256 	if (INTEL_GEN(engine->i915) >= 12)
257 		return 0x74;
258 	else if (INTEL_GEN(engine->i915) >= 9)
259 		return 0x68;
260 	else if (engine->class == RENDER_CLASS)
261 		return 0xd8;
262 	else
263 		return -1;
264 }
265 
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
267 {
268 	if (INTEL_GEN(engine->i915) >= 12)
269 		return 0x12;
270 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 		return 0x18;
272 	else
273 		return -1;
274 }
275 
276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
277 {
278 	int x;
279 
280 	x = lrc_ring_wa_bb_per_ctx(engine);
281 	if (x < 0)
282 		return x;
283 
284 	return x + 2;
285 }
286 
287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
288 {
289 	int x;
290 
291 	x = lrc_ring_indirect_ptr(engine);
292 	if (x < 0)
293 		return x;
294 
295 	return x + 2;
296 }
297 
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
299 {
300 	if (engine->class != RENDER_CLASS)
301 		return -1;
302 
303 	if (INTEL_GEN(engine->i915) >= 12)
304 		return 0xb6;
305 	else if (INTEL_GEN(engine->i915) >= 11)
306 		return 0xaa;
307 	else
308 		return -1;
309 }
310 
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
313 {
314 	switch (INTEL_GEN(engine->i915)) {
315 	default:
316 		MISSING_CASE(INTEL_GEN(engine->i915));
317 		fallthrough;
318 	case 12:
319 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 	case 11:
321 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 	case 10:
323 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 	case 9:
325 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 	case 8:
327 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 	}
329 }
330 
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 			    const struct intel_engine_cs *engine,
334 			    u32 ctx_bb_ggtt_addr,
335 			    u32 size)
336 {
337 	GEM_BUG_ON(!size);
338 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 	regs[lrc_ring_indirect_ptr(engine) + 1] =
341 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
342 
343 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 	regs[lrc_ring_indirect_offset(engine) + 1] =
345 		lrc_ring_indirect_offset_default(engine) << 6;
346 }
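
/*
 * Worked example (illustrative numbers): a 128 byte per-context batch
 * placed at GGTT offset 0x1000 would be programmed above as
 *
 *	regs[lrc_ring_indirect_ptr() + 1]    = 0x1000 | (128 / CACHELINE_BYTES)
 *					     = 0x1002
 *	regs[lrc_ring_indirect_offset() + 1] = default indirect offset << 6
 *
 * i.e. the low bits of the pointer dword carry the batch length in
 * cachelines, which is why the size must be non-zero and cacheline
 * aligned.
 */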
347 
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
349 {
350 	/*
351 	 * We can use either ppHWSP[16] which is recorded before the context
352 	 * switch (and so excludes the cost of context switches) or use the
353 	 * value from the context image itself, which is saved/restored earlier
354 	 * and so includes the cost of the save.
355 	 */
356 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
357 }
358 
359 static void mark_eio(struct i915_request *rq)
360 {
361 	if (i915_request_completed(rq))
362 		return;
363 
364 	GEM_BUG_ON(i915_request_signaled(rq));
365 
366 	i915_request_set_error_once(rq, -EIO);
367 	i915_request_mark_complete(rq);
368 }
369 
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
372 {
373 	struct i915_request *active = rq;
374 
375 	rcu_read_lock();
376 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 		if (i915_request_completed(rq))
378 			break;
379 
380 		active = rq;
381 	}
382 	rcu_read_unlock();
383 
384 	return active;
385 }
386 
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
388 {
389 	return (i915_ggtt_offset(engine->status_page.vma) +
390 		I915_GEM_HWS_PREEMPT_ADDR);
391 }
392 
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
395 {
396 	/*
397 	 * We inspect HWS_PREEMPT with a semaphore inside
398 	 * engine->emit_fini_breadcrumb. If the dword is true,
399 	 * the ring is paused as the semaphore will busywait
400 	 * until the dword is false.
401 	 */
402 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 	if (state)
404 		wmb();
405 }
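
/*
 * For reference, the consumer of this flag is the fini breadcrumb
 * emitted later in this file, which busywaits with (roughly) a polling
 * semaphore of the form
 *
 *	MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_EQ_SDD
 *	semaphore data    = 0
 *	semaphore address = intel_hws_preempt_address(engine)
 *
 * so the ring only resumes once ring_set_paused(engine, 0) clears the
 * dword again.
 */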
406 
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
408 {
409 	return rb_entry(rb, struct i915_priolist, node);
410 }
411 
412 static inline int rq_prio(const struct i915_request *rq)
413 {
414 	return READ_ONCE(rq->sched.attr.priority);
415 }
416 
417 static int effective_prio(const struct i915_request *rq)
418 {
419 	int prio = rq_prio(rq);
420 
421 	/*
422 	 * If this request is special and must not be interrupted at any
423 	 * cost, so be it. Note we are only checking the most recent request
424 	 * in the context and so may be masking an earlier vip request. It
425 	 * is hoped that under the conditions where nopreempt is used, this
426 	 * will not matter (i.e. all requests to that context will be
427 	 * nopreempt for as long as desired).
428 	 */
429 	if (i915_request_has_nopreempt(rq))
430 		prio = I915_PRIORITY_UNPREEMPTABLE;
431 
432 	return prio;
433 }
434 
435 static int queue_prio(const struct intel_engine_execlists *execlists)
436 {
437 	struct i915_priolist *p;
438 	struct rb_node *rb;
439 
440 	rb = rb_first_cached(&execlists->queue);
441 	if (!rb)
442 		return INT_MIN;
443 
444 	/*
445 	 * As the priolist[] is inverted, with the highest priority in [0],
446 	 * we have to flip the index value to recover the priority.
447 	 */
448 	p = to_priolist(rb);
449 	if (!I915_USER_PRIORITY_SHIFT)
450 		return p->priority;
451 
452 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
453 }
454 
455 static inline bool need_preempt(const struct intel_engine_cs *engine,
456 				const struct i915_request *rq,
457 				struct rb_node *rb)
458 {
459 	int last_prio;
460 
461 	if (!intel_engine_has_semaphores(engine))
462 		return false;
463 
464 	/*
465 	 * Check if the current priority hint merits a preemption attempt.
466 	 *
467 	 * We record the highest value priority we saw during rescheduling
468 	 * prior to this dequeue, therefore we know that if it is strictly
469 	 * less than the current tail of ELSP[0], we do not need to force
470 	 * a preempt-to-idle cycle.
471 	 *
472 	 * However, the priority hint is a mere hint that we may need to
473 	 * preempt. If that hint is stale or we may be trying to preempt
474 	 * ourselves, ignore the request.
475 	 *
476 	 * More naturally we would write
477 	 *      prio >= max(0, last);
478 	 * except that we wish to prevent triggering preemption at the same
479 	 * priority level: the task that is running should remain running
480 	 * to preserve FIFO ordering of dependencies.
481 	 */
482 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
483 	if (engine->execlists.queue_priority_hint <= last_prio)
484 		return false;
485 
486 	/*
487 	 * Check against the first request in ELSP[1]; it will, thanks to the
488 	 * power of PI, be the highest priority of that context.
489 	 */
490 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
491 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
492 		return true;
493 
494 	if (rb) {
495 		struct virtual_engine *ve =
496 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
497 		bool preempt = false;
498 
499 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
500 			struct i915_request *next;
501 
502 			rcu_read_lock();
503 			next = READ_ONCE(ve->request);
504 			if (next)
505 				preempt = rq_prio(next) > last_prio;
506 			rcu_read_unlock();
507 		}
508 
509 		if (preempt)
510 			return preempt;
511 	}
512 
513 	/*
514 	 * If the inflight context did not trigger the preemption, then maybe
515 	 * it was the set of queued requests? Pick the highest priority in
516 	 * the queue (the first active priolist) and see if it deserves to be
517 	 * running instead of ELSP[0].
518 	 *
519 	 * The highest priority request in the queue cannot be either
520 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
521 	 * context, its priority would not exceed ELSP[0] aka last_prio.
522 	 */
523 	return queue_prio(&engine->execlists) > last_prio;
524 }
525 
526 __maybe_unused static inline bool
527 assert_priority_queue(const struct i915_request *prev,
528 		      const struct i915_request *next)
529 {
530 	/*
531 	 * Without preemption, the prev may refer to the still active element
532 	 * which we refuse to let go.
533 	 *
534 	 * Even with preemption, there are times when we think it is better not
535 	 * to preempt and leave an ostensibly lower priority request in flight.
536 	 */
537 	if (i915_request_is_active(prev))
538 		return true;
539 
540 	return rq_prio(prev) >= rq_prio(next);
541 }
542 
543 /*
544  * The context descriptor encodes various attributes of a context,
545  * including its GTT address and some flags. Because it's fairly
546  * expensive to calculate, we'll just do it once and cache the result,
547  * which remains valid until the context is unpinned.
548  *
549  * This is what a descriptor looks like, from LSB to MSB::
550  *
551  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
552  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
553  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
554  *      bits 53-54:    mbz, reserved for use by hardware
555  *      bits 55-63:    group ID, currently unused and set to 0
556  *
557  * Starting from Gen11, the upper dword of the descriptor has a new format:
558  *
559  *      bits 32-36:    reserved
560  *      bits 37-47:    SW context ID
561  *      bits 48-53:    engine instance
562  *      bit 54:        mbz, reserved for use by hardware
563  *      bits 55-60:    SW counter
564  *      bits 61-63:    engine class
565  *
566  * engine info, SW context ID and SW counter need to form a unique number
567  * (Context ID) per lrc.
568  */
569 static u32
570 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
571 {
572 	u32 desc;
573 
574 	desc = INTEL_LEGACY_32B_CONTEXT;
575 	if (i915_vm_is_4lvl(ce->vm))
576 		desc = INTEL_LEGACY_64B_CONTEXT;
577 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
578 
579 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
580 	if (IS_GEN(engine->i915, 8))
581 		desc |= GEN8_CTX_L3LLC_COHERENT;
582 
583 	return i915_ggtt_offset(ce->state) | desc;
584 }
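
/*
 * Illustrative sketch only: lrc_descriptor() fills in the lower dword;
 * on Gen11+ the upper dword described in the comment above is derived
 * from the ccid when the context is scheduled in. Packing those upper
 * bits by hand would look roughly like this (shift values follow the
 * bit layout documented above, not the driver's own helpers):
 */
static inline u64 example_gen11_upper_desc(u64 sw_ctx_id,  /* bits 37-47 */
					   u64 instance,    /* bits 48-53 */
					   u64 sw_counter,  /* bits 55-60 */
					   u64 class)       /* bits 61-63 */
{
	return (sw_ctx_id << 37) |
	       (instance << 48) |
	       (sw_counter << 55) |
	       (class << 61);
}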
585 
586 static inline unsigned int dword_in_page(void *addr)
587 {
588 	return offset_in_page(addr) / sizeof(u32);
589 }
590 
591 static void set_offsets(u32 *regs,
592 			const u8 *data,
593 			const struct intel_engine_cs *engine,
594 			bool clear)
595 #define NOP(x) (BIT(7) | (x))
596 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
597 #define POSTED BIT(0)
598 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
599 #define REG16(x) \
600 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
601 	(((x) >> 2) & 0x7f)
602 #define END(total_state_size) 0, (total_state_size)
603 {
604 	const u32 base = engine->mmio_base;
605 
606 	while (*data) {
607 		u8 count, flags;
608 
609 		if (*data & BIT(7)) { /* skip */
610 			count = *data++ & ~BIT(7);
611 			if (clear)
612 				memset32(regs, MI_NOOP, count);
613 			regs += count;
614 			continue;
615 		}
616 
617 		count = *data & 0x3f;
618 		flags = *data >> 6;
619 		data++;
620 
621 		*regs = MI_LOAD_REGISTER_IMM(count);
622 		if (flags & POSTED)
623 			*regs |= MI_LRI_FORCE_POSTED;
624 		if (INTEL_GEN(engine->i915) >= 11)
625 			*regs |= MI_LRI_LRM_CS_MMIO;
626 		regs++;
627 
628 		GEM_BUG_ON(!count);
629 		do {
630 			u32 offset = 0;
631 			u8 v;
632 
633 			do {
634 				v = *data++;
635 				offset <<= 7;
636 				offset |= v & ~BIT(7);
637 			} while (v & BIT(7));
638 
639 			regs[0] = base + (offset << 2);
640 			if (clear)
641 				regs[1] = 0;
642 			regs += 2;
643 		} while (--count);
644 	}
645 
646 	if (clear) {
647 		u8 count = *++data;
648 
649 		/* Clear past the tail for HW access */
650 		GEM_BUG_ON(dword_in_page(regs) > count);
651 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
652 
653 		/* Close the batch; used mainly by live_lrc_layout() */
654 		*regs = MI_BATCH_BUFFER_END;
655 		if (INTEL_GEN(engine->i915) >= 10)
656 			*regs |= BIT(0);
657 	}
658 }
659 
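/*
 * Worked example of the table encoding (for illustration): the start of
 * gen8_xcs_offsets below,
 *
 *	NOP(1), LRI(11, 0), REG16(0x244), REG(0x034), ...
 *
 * tells set_offsets() to skip one dword, emit MI_LOAD_REGISTER_IMM(11)
 * and then rebuild eleven register offsets as engine->mmio_base + 0x244,
 * + 0x034, and so on. REG16() encodes an offset that does not fit in
 * seven bits as two bytes, using BIT(7) as the continuation flag decoded
 * by the inner do/while loop above.
 */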
660 static const u8 gen8_xcs_offsets[] = {
661 	NOP(1),
662 	LRI(11, 0),
663 	REG16(0x244),
664 	REG(0x034),
665 	REG(0x030),
666 	REG(0x038),
667 	REG(0x03c),
668 	REG(0x168),
669 	REG(0x140),
670 	REG(0x110),
671 	REG(0x11c),
672 	REG(0x114),
673 	REG(0x118),
674 
675 	NOP(9),
676 	LRI(9, 0),
677 	REG16(0x3a8),
678 	REG16(0x28c),
679 	REG16(0x288),
680 	REG16(0x284),
681 	REG16(0x280),
682 	REG16(0x27c),
683 	REG16(0x278),
684 	REG16(0x274),
685 	REG16(0x270),
686 
687 	NOP(13),
688 	LRI(2, 0),
689 	REG16(0x200),
690 	REG(0x028),
691 
692 	END(80)
693 };
694 
695 static const u8 gen9_xcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, POSTED),
727 	REG16(0x200),
728 
729 	NOP(13),
730 	LRI(44, POSTED),
731 	REG(0x028),
732 	REG(0x09c),
733 	REG(0x0c0),
734 	REG(0x178),
735 	REG(0x17c),
736 	REG16(0x358),
737 	REG(0x170),
738 	REG(0x150),
739 	REG(0x154),
740 	REG(0x158),
741 	REG16(0x41c),
742 	REG16(0x600),
743 	REG16(0x604),
744 	REG16(0x608),
745 	REG16(0x60c),
746 	REG16(0x610),
747 	REG16(0x614),
748 	REG16(0x618),
749 	REG16(0x61c),
750 	REG16(0x620),
751 	REG16(0x624),
752 	REG16(0x628),
753 	REG16(0x62c),
754 	REG16(0x630),
755 	REG16(0x634),
756 	REG16(0x638),
757 	REG16(0x63c),
758 	REG16(0x640),
759 	REG16(0x644),
760 	REG16(0x648),
761 	REG16(0x64c),
762 	REG16(0x650),
763 	REG16(0x654),
764 	REG16(0x658),
765 	REG16(0x65c),
766 	REG16(0x660),
767 	REG16(0x664),
768 	REG16(0x668),
769 	REG16(0x66c),
770 	REG16(0x670),
771 	REG16(0x674),
772 	REG16(0x678),
773 	REG16(0x67c),
774 	REG(0x068),
775 
776 	END(176)
777 };
778 
779 static const u8 gen12_xcs_offsets[] = {
780 	NOP(1),
781 	LRI(13, POSTED),
782 	REG16(0x244),
783 	REG(0x034),
784 	REG(0x030),
785 	REG(0x038),
786 	REG(0x03c),
787 	REG(0x168),
788 	REG(0x140),
789 	REG(0x110),
790 	REG(0x1c0),
791 	REG(0x1c4),
792 	REG(0x1c8),
793 	REG(0x180),
794 	REG16(0x2b4),
795 
796 	NOP(5),
797 	LRI(9, POSTED),
798 	REG16(0x3a8),
799 	REG16(0x28c),
800 	REG16(0x288),
801 	REG16(0x284),
802 	REG16(0x280),
803 	REG16(0x27c),
804 	REG16(0x278),
805 	REG16(0x274),
806 	REG16(0x270),
807 
808 	END(80)
809 };
810 
811 static const u8 gen8_rcs_offsets[] = {
812 	NOP(1),
813 	LRI(14, POSTED),
814 	REG16(0x244),
815 	REG(0x034),
816 	REG(0x030),
817 	REG(0x038),
818 	REG(0x03c),
819 	REG(0x168),
820 	REG(0x140),
821 	REG(0x110),
822 	REG(0x11c),
823 	REG(0x114),
824 	REG(0x118),
825 	REG(0x1c0),
826 	REG(0x1c4),
827 	REG(0x1c8),
828 
829 	NOP(3),
830 	LRI(9, POSTED),
831 	REG16(0x3a8),
832 	REG16(0x28c),
833 	REG16(0x288),
834 	REG16(0x284),
835 	REG16(0x280),
836 	REG16(0x27c),
837 	REG16(0x278),
838 	REG16(0x274),
839 	REG16(0x270),
840 
841 	NOP(13),
842 	LRI(1, 0),
843 	REG(0x0c8),
844 
845 	END(80)
846 };
847 
848 static const u8 gen9_rcs_offsets[] = {
849 	NOP(1),
850 	LRI(14, POSTED),
851 	REG16(0x244),
852 	REG(0x34),
853 	REG(0x30),
854 	REG(0x38),
855 	REG(0x3c),
856 	REG(0x168),
857 	REG(0x140),
858 	REG(0x110),
859 	REG(0x11c),
860 	REG(0x114),
861 	REG(0x118),
862 	REG(0x1c0),
863 	REG(0x1c4),
864 	REG(0x1c8),
865 
866 	NOP(3),
867 	LRI(9, POSTED),
868 	REG16(0x3a8),
869 	REG16(0x28c),
870 	REG16(0x288),
871 	REG16(0x284),
872 	REG16(0x280),
873 	REG16(0x27c),
874 	REG16(0x278),
875 	REG16(0x274),
876 	REG16(0x270),
877 
878 	NOP(13),
879 	LRI(1, 0),
880 	REG(0xc8),
881 
882 	NOP(13),
883 	LRI(44, POSTED),
884 	REG(0x28),
885 	REG(0x9c),
886 	REG(0xc0),
887 	REG(0x178),
888 	REG(0x17c),
889 	REG16(0x358),
890 	REG(0x170),
891 	REG(0x150),
892 	REG(0x154),
893 	REG(0x158),
894 	REG16(0x41c),
895 	REG16(0x600),
896 	REG16(0x604),
897 	REG16(0x608),
898 	REG16(0x60c),
899 	REG16(0x610),
900 	REG16(0x614),
901 	REG16(0x618),
902 	REG16(0x61c),
903 	REG16(0x620),
904 	REG16(0x624),
905 	REG16(0x628),
906 	REG16(0x62c),
907 	REG16(0x630),
908 	REG16(0x634),
909 	REG16(0x638),
910 	REG16(0x63c),
911 	REG16(0x640),
912 	REG16(0x644),
913 	REG16(0x648),
914 	REG16(0x64c),
915 	REG16(0x650),
916 	REG16(0x654),
917 	REG16(0x658),
918 	REG16(0x65c),
919 	REG16(0x660),
920 	REG16(0x664),
921 	REG16(0x668),
922 	REG16(0x66c),
923 	REG16(0x670),
924 	REG16(0x674),
925 	REG16(0x678),
926 	REG16(0x67c),
927 	REG(0x68),
928 
929 	END(176)
930 };
931 
932 static const u8 gen11_rcs_offsets[] = {
933 	NOP(1),
934 	LRI(15, POSTED),
935 	REG16(0x244),
936 	REG(0x034),
937 	REG(0x030),
938 	REG(0x038),
939 	REG(0x03c),
940 	REG(0x168),
941 	REG(0x140),
942 	REG(0x110),
943 	REG(0x11c),
944 	REG(0x114),
945 	REG(0x118),
946 	REG(0x1c0),
947 	REG(0x1c4),
948 	REG(0x1c8),
949 	REG(0x180),
950 
951 	NOP(1),
952 	LRI(9, POSTED),
953 	REG16(0x3a8),
954 	REG16(0x28c),
955 	REG16(0x288),
956 	REG16(0x284),
957 	REG16(0x280),
958 	REG16(0x27c),
959 	REG16(0x278),
960 	REG16(0x274),
961 	REG16(0x270),
962 
963 	LRI(1, POSTED),
964 	REG(0x1b0),
965 
966 	NOP(10),
967 	LRI(1, 0),
968 	REG(0x0c8),
969 
970 	END(80)
971 };
972 
973 static const u8 gen12_rcs_offsets[] = {
974 	NOP(1),
975 	LRI(13, POSTED),
976 	REG16(0x244),
977 	REG(0x034),
978 	REG(0x030),
979 	REG(0x038),
980 	REG(0x03c),
981 	REG(0x168),
982 	REG(0x140),
983 	REG(0x110),
984 	REG(0x1c0),
985 	REG(0x1c4),
986 	REG(0x1c8),
987 	REG(0x180),
988 	REG16(0x2b4),
989 
990 	NOP(5),
991 	LRI(9, POSTED),
992 	REG16(0x3a8),
993 	REG16(0x28c),
994 	REG16(0x288),
995 	REG16(0x284),
996 	REG16(0x280),
997 	REG16(0x27c),
998 	REG16(0x278),
999 	REG16(0x274),
1000 	REG16(0x270),
1001 
1002 	LRI(3, POSTED),
1003 	REG(0x1b0),
1004 	REG16(0x5a8),
1005 	REG16(0x5ac),
1006 
1007 	NOP(6),
1008 	LRI(1, 0),
1009 	REG(0x0c8),
1010 	NOP(3 + 9 + 1),
1011 
1012 	LRI(51, POSTED),
1013 	REG16(0x588),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG(0x028),
1020 	REG(0x09c),
1021 	REG(0x0c0),
1022 	REG(0x178),
1023 	REG(0x17c),
1024 	REG16(0x358),
1025 	REG(0x170),
1026 	REG(0x150),
1027 	REG(0x154),
1028 	REG(0x158),
1029 	REG16(0x41c),
1030 	REG16(0x600),
1031 	REG16(0x604),
1032 	REG16(0x608),
1033 	REG16(0x60c),
1034 	REG16(0x610),
1035 	REG16(0x614),
1036 	REG16(0x618),
1037 	REG16(0x61c),
1038 	REG16(0x620),
1039 	REG16(0x624),
1040 	REG16(0x628),
1041 	REG16(0x62c),
1042 	REG16(0x630),
1043 	REG16(0x634),
1044 	REG16(0x638),
1045 	REG16(0x63c),
1046 	REG16(0x640),
1047 	REG16(0x644),
1048 	REG16(0x648),
1049 	REG16(0x64c),
1050 	REG16(0x650),
1051 	REG16(0x654),
1052 	REG16(0x658),
1053 	REG16(0x65c),
1054 	REG16(0x660),
1055 	REG16(0x664),
1056 	REG16(0x668),
1057 	REG16(0x66c),
1058 	REG16(0x670),
1059 	REG16(0x674),
1060 	REG16(0x678),
1061 	REG16(0x67c),
1062 	REG(0x068),
1063 	REG(0x084),
1064 	NOP(1),
1065 
1066 	END(192)
1067 };
1068 
1069 #undef END
1070 #undef REG16
1071 #undef REG
1072 #undef LRI
1073 #undef NOP
1074 
1075 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1076 {
1077 	/*
1078 	 * The gen12+ lists only have the registers we program in the basic
1079 	 * default state. We rely on the context image using relative
1080 	 * addressing to automatically fix up the register state between the
1081 	 * physical engines for the virtual engine.
1082 	 */
1083 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1084 		   !intel_engine_has_relative_mmio(engine));
1085 
1086 	if (engine->class == RENDER_CLASS) {
1087 		if (INTEL_GEN(engine->i915) >= 12)
1088 			return gen12_rcs_offsets;
1089 		else if (INTEL_GEN(engine->i915) >= 11)
1090 			return gen11_rcs_offsets;
1091 		else if (INTEL_GEN(engine->i915) >= 9)
1092 			return gen9_rcs_offsets;
1093 		else
1094 			return gen8_rcs_offsets;
1095 	} else {
1096 		if (INTEL_GEN(engine->i915) >= 12)
1097 			return gen12_xcs_offsets;
1098 		else if (INTEL_GEN(engine->i915) >= 9)
1099 			return gen9_xcs_offsets;
1100 		else
1101 			return gen8_xcs_offsets;
1102 	}
1103 }
1104 
1105 static struct i915_request *
1106 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1107 {
1108 	struct i915_request *rq, *rn, *active = NULL;
1109 	struct list_head *uninitialized_var(pl);
1110 	int prio = I915_PRIORITY_INVALID;
1111 
1112 	lockdep_assert_held(&engine->active.lock);
1113 
1114 	list_for_each_entry_safe_reverse(rq, rn,
1115 					 &engine->active.requests,
1116 					 sched.link) {
1117 		if (i915_request_completed(rq))
1118 			continue; /* XXX */
1119 
1120 		__i915_request_unsubmit(rq);
1121 
1122 		/*
1123 		 * Push the request back into the queue for later resubmission.
1124 		 * If this request is not native to this physical engine (i.e.
1125 		 * it came from a virtual source), push it back onto the virtual
1126 		 * engine so that it can be moved across onto another physical
1127 		 * engine as load dictates.
1128 		 */
1129 		if (likely(rq->execution_mask == engine->mask)) {
1130 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1131 			if (rq_prio(rq) != prio) {
1132 				prio = rq_prio(rq);
1133 				pl = i915_sched_lookup_priolist(engine, prio);
1134 			}
1135 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1136 
1137 			list_move(&rq->sched.link, pl);
1138 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1139 
1140 			/* Check in case we roll back so far we wrap [size/2] */
1141 			if (intel_ring_direction(rq->ring,
1142 						 intel_ring_wrap(rq->ring,
1143 								 rq->tail),
1144 						 rq->ring->tail) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			/*
1152 			 * Decouple the virtual breadcrumb before moving it
1153 			 * back to the virtual engine -- we don't want the
1154 			 * request to complete in the background and try
1155 			 * and cancel the breadcrumb on the virtual engine
1156 			 * (instead of the old engine where it is linked)!
1157 			 */
1158 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1159 				     &rq->fence.flags)) {
1160 				spin_lock_nested(&rq->lock,
1161 						 SINGLE_DEPTH_NESTING);
1162 				i915_request_cancel_breadcrumb(rq);
1163 				spin_unlock(&rq->lock);
1164 			}
1165 			WRITE_ONCE(rq->engine, owner);
1166 			owner->submit_request(rq);
1167 			active = NULL;
1168 		}
1169 	}
1170 
1171 	return active;
1172 }
1173 
1174 struct i915_request *
1175 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1176 {
1177 	struct intel_engine_cs *engine =
1178 		container_of(execlists, typeof(*engine), execlists);
1179 
1180 	return __unwind_incomplete_requests(engine);
1181 }
1182 
1183 static inline void
1184 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1185 {
1186 	/*
1187 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1188 	 * the compiler should eliminate this function as dead code.
1189 	 */
1190 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1191 		return;
1192 
1193 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1194 				   status, rq);
1195 }
1196 
1197 static void intel_engine_context_in(struct intel_engine_cs *engine)
1198 {
1199 	unsigned long flags;
1200 
1201 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1202 		return;
1203 
1204 	write_seqlock_irqsave(&engine->stats.lock, flags);
1205 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1206 		engine->stats.start = ktime_get();
1207 		atomic_inc(&engine->stats.active);
1208 	}
1209 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1210 }
1211 
1212 static void intel_engine_context_out(struct intel_engine_cs *engine)
1213 {
1214 	unsigned long flags;
1215 
1216 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1217 
1218 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1219 		return;
1220 
1221 	write_seqlock_irqsave(&engine->stats.lock, flags);
1222 	if (atomic_dec_and_test(&engine->stats.active)) {
1223 		engine->stats.total =
1224 			ktime_add(engine->stats.total,
1225 				  ktime_sub(ktime_get(), engine->stats.start));
1226 	}
1227 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1228 }
1229 
1230 static void
1231 execlists_check_context(const struct intel_context *ce,
1232 			const struct intel_engine_cs *engine)
1233 {
1234 	const struct intel_ring *ring = ce->ring;
1235 	u32 *regs = ce->lrc_reg_state;
1236 	bool valid = true;
1237 	int x;
1238 
1239 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1240 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1241 		       engine->name,
1242 		       regs[CTX_RING_START],
1243 		       i915_ggtt_offset(ring->vma));
1244 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1245 		valid = false;
1246 	}
1247 
1248 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1249 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1250 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1251 		       engine->name,
1252 		       regs[CTX_RING_CTL],
1253 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1254 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1255 		valid = false;
1256 	}
1257 
1258 	x = lrc_ring_mi_mode(engine);
1259 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1260 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1261 		       engine->name, regs[x + 1]);
1262 		regs[x + 1] &= ~STOP_RING;
1263 		regs[x + 1] |= STOP_RING << 16;
1264 		valid = false;
1265 	}
1266 
1267 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1268 }
1269 
1270 static void restore_default_state(struct intel_context *ce,
1271 				  struct intel_engine_cs *engine)
1272 {
1273 	u32 *regs;
1274 
1275 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1276 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1277 
1278 	ce->runtime.last = intel_context_get_runtime(ce);
1279 }
1280 
1281 static void reset_active(struct i915_request *rq,
1282 			 struct intel_engine_cs *engine)
1283 {
1284 	struct intel_context * const ce = rq->context;
1285 	u32 head;
1286 
1287 	/*
1288 	 * The executing context has been cancelled. We want to prevent
1289 	 * further execution along this context and propagate the error on
1290 	 * to anything depending on its results.
1291 	 *
1292 	 * In __i915_request_submit(), we apply the -EIO and remove the
1293 	 * requests' payloads for any banned requests. But first, we must
1294 	 * rewind the context back to the start of the incomplete request so
1295 	 * that we do not jump back into the middle of the batch.
1296 	 *
1297 	 * We preserve the breadcrumbs and semaphores of the incomplete
1298 	 * requests so that inter-timeline dependencies (i.e other timelines)
1299 	 * remain correctly ordered. And we defer to __i915_request_submit()
1300 	 * so that all asynchronous waits are correctly handled.
1301 	 */
1302 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1303 		     rq->fence.context, rq->fence.seqno);
1304 
1305 	/* On resubmission of the active request, payload will be scrubbed */
1306 	if (i915_request_completed(rq))
1307 		head = rq->tail;
1308 	else
1309 		head = active_request(ce->timeline, rq)->head;
1310 	head = intel_ring_wrap(ce->ring, head);
1311 
1312 	/* Scrub the context image to prevent replaying the previous batch */
1313 	restore_default_state(ce, engine);
1314 	__execlists_update_reg_state(ce, engine, head);
1315 
1316 	/* We've switched away, so this should be a no-op, but intent matters */
1317 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1318 }
1319 
1320 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1321 {
1322 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1323 	ce->runtime.num_underflow += dt < 0;
1324 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1325 #endif
1326 }
1327 
1328 static void intel_context_update_runtime(struct intel_context *ce)
1329 {
1330 	u32 old;
1331 	s32 dt;
1332 
1333 	if (intel_context_is_barrier(ce))
1334 		return;
1335 
1336 	old = ce->runtime.last;
1337 	ce->runtime.last = intel_context_get_runtime(ce);
1338 	dt = ce->runtime.last - old;
1339 
1340 	if (unlikely(dt <= 0)) {
1341 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1342 			 old, ce->runtime.last, dt);
1343 		st_update_runtime_underflow(ce, dt);
1344 		return;
1345 	}
1346 
1347 	ewma_runtime_add(&ce->runtime.avg, dt);
1348 	ce->runtime.total += dt;
1349 }
1350 
1351 static inline struct intel_engine_cs *
1352 __execlists_schedule_in(struct i915_request *rq)
1353 {
1354 	struct intel_engine_cs * const engine = rq->engine;
1355 	struct intel_context * const ce = rq->context;
1356 
1357 	intel_context_get(ce);
1358 
1359 	if (unlikely(intel_context_is_banned(ce)))
1360 		reset_active(rq, engine);
1361 
1362 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1363 		execlists_check_context(ce, engine);
1364 
1365 	if (ce->tag) {
1366 		/* Use a fixed tag for OA and friends */
1367 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1368 		ce->lrc.ccid = ce->tag;
1369 	} else {
1370 		/* We don't need a strict matching tag, just different values */
1371 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1372 
1373 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1374 		clear_bit(tag - 1, &engine->context_tag);
1375 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1376 
1377 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1378 	}
1379 
1380 	ce->lrc.ccid |= engine->execlists.ccid;
1381 
1382 	__intel_gt_pm_get(engine->gt);
1383 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1384 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1385 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1386 	intel_engine_context_in(engine);
1387 
1388 	return engine;
1389 }
1390 
1391 static inline struct i915_request *
1392 execlists_schedule_in(struct i915_request *rq, int idx)
1393 {
1394 	struct intel_context * const ce = rq->context;
1395 	struct intel_engine_cs *old;
1396 
1397 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1398 	trace_i915_request_in(rq, idx);
1399 
1400 	old = READ_ONCE(ce->inflight);
1401 	do {
1402 		if (!old) {
1403 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1404 			break;
1405 		}
1406 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1407 
1408 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1409 	return i915_request_get(rq);
1410 }
1411 
1412 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1413 {
1414 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1415 	struct i915_request *next = READ_ONCE(ve->request);
1416 
1417 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1418 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1419 }
1420 
1421 static inline void
1422 __execlists_schedule_out(struct i915_request *rq,
1423 			 struct intel_engine_cs * const engine,
1424 			 unsigned int ccid)
1425 {
1426 	struct intel_context * const ce = rq->context;
1427 
1428 	/*
1429 	 * NB process_csb() is not under the engine->active.lock and hence
1430 	 * schedule_out can race with schedule_in meaning that we should
1431 	 * refrain from doing non-trivial work here.
1432 	 */
1433 
1434 	/*
1435 	 * If we have just completed this context, the engine may now be
1436 	 * idle and we want to re-enter powersaving.
1437 	 */
1438 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1439 	    i915_request_completed(rq))
1440 		intel_engine_add_retire(engine, ce->timeline);
1441 
1442 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1443 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1444 	if (ccid < BITS_PER_LONG) {
1445 		GEM_BUG_ON(ccid == 0);
1446 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1447 		set_bit(ccid - 1, &engine->context_tag);
1448 	}
1449 
1450 	intel_context_update_runtime(ce);
1451 	intel_engine_context_out(engine);
1452 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1453 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1454 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1455 	intel_gt_pm_put_async(engine->gt);
1456 
1457 	/*
1458 	 * If this is part of a virtual engine, its next request may
1459 	 * have been blocked waiting for access to the active context.
1460 	 * We have to kick all the siblings again in case we need to
1461 	 * switch (e.g. the next request is not runnable on this
1462 	 * engine). Hopefully, we will already have submitted the next
1463 	 * request before the tasklet runs and do not need to rebuild
1464 	 * each virtual tree and kick everyone again.
1465 	 */
1466 	if (ce->engine != engine)
1467 		kick_siblings(rq, ce);
1468 
1469 	intel_context_put(ce);
1470 }
1471 
1472 static inline void
1473 execlists_schedule_out(struct i915_request *rq)
1474 {
1475 	struct intel_context * const ce = rq->context;
1476 	struct intel_engine_cs *cur, *old;
1477 	u32 ccid;
1478 
1479 	trace_i915_request_out(rq);
1480 
1481 	ccid = rq->context->lrc.ccid;
1482 	old = READ_ONCE(ce->inflight);
1483 	do
1484 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1485 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1486 	if (!cur)
1487 		__execlists_schedule_out(rq, old, ccid);
1488 
1489 	i915_request_put(rq);
1490 }
1491 
1492 static u64 execlists_update_context(struct i915_request *rq)
1493 {
1494 	struct intel_context *ce = rq->context;
1495 	u64 desc = ce->lrc.desc;
1496 	u32 tail, prev;
1497 
1498 	/*
1499 	 * WaIdleLiteRestore:bdw,skl
1500 	 *
1501 	 * We should never submit the context with the same RING_TAIL twice
1502 	 * just in case we submit an empty ring, which confuses the HW.
1503 	 *
1504 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1505 	 * the normal request to be able to always advance the RING_TAIL on
1506 	 * subsequent resubmissions (for lite restore). Should that fail us,
1507 	 * and we try and submit the same tail again, force the context
1508 	 * reload.
1509 	 *
1510 	 * If we need to return to a preempted context, we need to skip the
1511 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1512 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1513 	 * an earlier request.
1514 	 */
1515 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1516 	prev = rq->ring->tail;
1517 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1518 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1519 		desc |= CTX_DESC_FORCE_RESTORE;
1520 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1521 	rq->tail = rq->wa_tail;
1522 
1523 	/*
1524 	 * Make sure the context image is complete before we submit it to HW.
1525 	 *
1526 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1527 	 * an uncached write such as our mmio register access; however, the
1528 	 * empirical evidence (esp. on Braswell) suggests that the WC write into memory
1529 	 * may not be visible to the HW prior to the completion of the UC
1530 	 * register write and that we may begin execution from the context
1531 	 * before its image is complete leading to invalid PD chasing.
1532 	 */
1533 	wmb();
1534 
1535 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1536 	return desc;
1537 }
1538 
1539 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1540 {
1541 	if (execlists->ctrl_reg) {
1542 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1543 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1544 	} else {
1545 		writel(upper_32_bits(desc), execlists->submit_reg);
1546 		writel(lower_32_bits(desc), execlists->submit_reg);
1547 	}
1548 }
1549 
1550 static __maybe_unused char *
1551 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1552 {
1553 	if (!rq)
1554 		return "";
1555 
1556 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1557 		 prefix,
1558 		 rq->context->lrc.ccid,
1559 		 rq->fence.context, rq->fence.seqno,
1560 		 i915_request_completed(rq) ? "!" :
1561 		 i915_request_started(rq) ? "*" :
1562 		 "",
1563 		 rq_prio(rq));
1564 
1565 	return buf;
1566 }
1567 
1568 static __maybe_unused void
1569 trace_ports(const struct intel_engine_execlists *execlists,
1570 	    const char *msg,
1571 	    struct i915_request * const *ports)
1572 {
1573 	const struct intel_engine_cs *engine =
1574 		container_of(execlists, typeof(*engine), execlists);
1575 	char __maybe_unused p0[40], p1[40];
1576 
1577 	if (!ports[0])
1578 		return;
1579 
1580 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1581 		     dump_port(p0, sizeof(p0), "", ports[0]),
1582 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1583 }
1584 
1585 static inline bool
1586 reset_in_progress(const struct intel_engine_execlists *execlists)
1587 {
1588 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1589 }
1590 
1591 static __maybe_unused bool
1592 assert_pending_valid(const struct intel_engine_execlists *execlists,
1593 		     const char *msg)
1594 {
1595 	struct intel_engine_cs *engine =
1596 		container_of(execlists, typeof(*engine), execlists);
1597 	struct i915_request * const *port, *rq;
1598 	struct intel_context *ce = NULL;
1599 	bool sentinel = false;
1600 	u32 ccid = -1;
1601 
1602 	trace_ports(execlists, msg, execlists->pending);
1603 
1604 	/* We may be messing around with the lists during reset, lalala */
1605 	if (reset_in_progress(execlists))
1606 		return true;
1607 
1608 	if (!execlists->pending[0]) {
1609 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1610 			      engine->name);
1611 		return false;
1612 	}
1613 
1614 	if (execlists->pending[execlists_num_ports(execlists)]) {
1615 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1616 			      engine->name, execlists_num_ports(execlists));
1617 		return false;
1618 	}
1619 
1620 	for (port = execlists->pending; (rq = *port); port++) {
1621 		unsigned long flags;
1622 		bool ok = true;
1623 
1624 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1625 		GEM_BUG_ON(!i915_request_is_active(rq));
1626 
1627 		if (ce == rq->context) {
1628 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1629 				      engine->name,
1630 				      ce->timeline->fence_context,
1631 				      port - execlists->pending);
1632 			return false;
1633 		}
1634 		ce = rq->context;
1635 
1636 		if (ccid == ce->lrc.ccid) {
1637 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1638 				      engine->name,
1639 				      ccid, ce->timeline->fence_context,
1640 				      port - execlists->pending);
1641 			return false;
1642 		}
1643 		ccid = ce->lrc.ccid;
1644 
1645 		/*
1646 		 * Sentinels are supposed to be the last request so they flush
1647 		 * the current execution off the HW. Check that they are the only
1648 		 * request in the pending submission.
1649 		 */
1650 		if (sentinel) {
1651 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1652 				      engine->name,
1653 				      ce->timeline->fence_context,
1654 				      port - execlists->pending);
1655 			return false;
1656 		}
1657 		sentinel = i915_request_has_sentinel(rq);
1658 
1659 		/* Hold tightly onto the lock to prevent concurrent retires! */
1660 		if (!spin_trylock_irqsave(&rq->lock, flags))
1661 			continue;
1662 
1663 		if (i915_request_completed(rq))
1664 			goto unlock;
1665 
1666 		if (i915_active_is_idle(&ce->active) &&
1667 		    !intel_context_is_barrier(ce)) {
1668 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1669 				      engine->name,
1670 				      ce->timeline->fence_context,
1671 				      port - execlists->pending);
1672 			ok = false;
1673 			goto unlock;
1674 		}
1675 
1676 		if (!i915_vma_is_pinned(ce->state)) {
1677 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1678 				      engine->name,
1679 				      ce->timeline->fence_context,
1680 				      port - execlists->pending);
1681 			ok = false;
1682 			goto unlock;
1683 		}
1684 
1685 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1686 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1687 				      engine->name,
1688 				      ce->timeline->fence_context,
1689 				      port - execlists->pending);
1690 			ok = false;
1691 			goto unlock;
1692 		}
1693 
1694 unlock:
1695 		spin_unlock_irqrestore(&rq->lock, flags);
1696 		if (!ok)
1697 			return false;
1698 	}
1699 
1700 	return ce;
1701 }
1702 
1703 static void execlists_submit_ports(struct intel_engine_cs *engine)
1704 {
1705 	struct intel_engine_execlists *execlists = &engine->execlists;
1706 	unsigned int n;
1707 
1708 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1709 
1710 	/*
1711 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1712 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1713 	 * not be relinquished until the device is idle (see
1714 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1715 	 * that all ELSP are drained i.e. we have processed the CSB,
1716 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1717 	 */
1718 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1719 
1720 	/*
1721 	 * ELSQ note: the submit queue is not cleared after being submitted
1722 	 * to the HW so we need to make sure we always clean it up. This is
1723 	 * currently ensured by the fact that we always write the same number
1724 	 * of elsq entries, keep this in mind before changing the loop below.
1725 	 */
1726 	for (n = execlists_num_ports(execlists); n--; ) {
1727 		struct i915_request *rq = execlists->pending[n];
1728 
1729 		write_desc(execlists,
1730 			   rq ? execlists_update_context(rq) : 0,
1731 			   n);
1732 	}
1733 
1734 	/* we need to manually load the submit queue */
1735 	if (execlists->ctrl_reg)
1736 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1737 }
1738 
1739 static bool ctx_single_port_submission(const struct intel_context *ce)
1740 {
1741 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1742 		intel_context_force_single_submission(ce));
1743 }
1744 
1745 static bool can_merge_ctx(const struct intel_context *prev,
1746 			  const struct intel_context *next)
1747 {
1748 	if (prev != next)
1749 		return false;
1750 
1751 	if (ctx_single_port_submission(prev))
1752 		return false;
1753 
1754 	return true;
1755 }
1756 
1757 static unsigned long i915_request_flags(const struct i915_request *rq)
1758 {
1759 	return READ_ONCE(rq->fence.flags);
1760 }
1761 
1762 static bool can_merge_rq(const struct i915_request *prev,
1763 			 const struct i915_request *next)
1764 {
1765 	GEM_BUG_ON(prev == next);
1766 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1767 
1768 	/*
1769 	 * We do not submit known completed requests. Therefore if the next
1770 	 * request is already completed, we can pretend to merge it in
1771 	 * with the previous context (and we will skip updating the ELSP
1772 	 * and tracking). Thus hopefully keeping the ELSP full with active
1773 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1774 	 * us.
1775 	 */
1776 	if (i915_request_completed(next))
1777 		return true;
1778 
1779 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1780 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1781 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1782 		return false;
1783 
1784 	if (!can_merge_ctx(prev->context, next->context))
1785 		return false;
1786 
1787 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1788 	return true;
1789 }
1790 
1791 static void virtual_update_register_offsets(u32 *regs,
1792 					    struct intel_engine_cs *engine)
1793 {
1794 	set_offsets(regs, reg_offsets(engine), engine, false);
1795 }
1796 
1797 static bool virtual_matches(const struct virtual_engine *ve,
1798 			    const struct i915_request *rq,
1799 			    const struct intel_engine_cs *engine)
1800 {
1801 	const struct intel_engine_cs *inflight;
1802 
1803 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1804 		return false;
1805 
1806 	/*
1807 	 * We track when the HW has completed saving the context image
1808 	 * (i.e. when we have seen the final CS event switching out of
1809 	 * the context) and must not overwrite the context image before
1810 	 * then. This restricts us to only using the active engine
1811 	 * while the previous virtualized request is inflight (so
1812 	 * we reuse the register offsets). This is a very small
1813 	 * hysteresis on the greedy selection algorithm.
1814 	 */
1815 	inflight = intel_context_inflight(&ve->context);
1816 	if (inflight && inflight != engine)
1817 		return false;
1818 
1819 	return true;
1820 }
1821 
1822 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1823 {
1824 	/*
1825 	 * All the outstanding signals on ve->siblings[0] must have
1826 	 * been completed, just pending the interrupt handler. As those
1827 	 * signals still refer to the old sibling (via rq->engine), we must
1828 	 * transfer those to the old irq_worker to keep our locking
1829 	 * consistent.
1830 	 */
1831 	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1832 }
1833 
1834 #define for_each_waiter(p__, rq__) \
1835 	list_for_each_entry_lockless(p__, \
1836 				     &(rq__)->sched.waiters_list, \
1837 				     wait_link)
1838 
1839 #define for_each_signaler(p__, rq__) \
1840 	list_for_each_entry_rcu(p__, \
1841 				&(rq__)->sched.signalers_list, \
1842 				signal_link)
1843 
1844 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1845 {
1846 	LIST_HEAD(list);
1847 
1848 	/*
1849 	 * We want to move the interrupted request to the back of
1850 	 * the round-robin list (i.e. its priority level), but
1851 	 * in doing so, we must then move all requests that were in
1852 	 * flight and were waiting for the interrupted request to
1853 	 * be run after it again.
1854 	 */
1855 	do {
1856 		struct i915_dependency *p;
1857 
1858 		GEM_BUG_ON(i915_request_is_active(rq));
1859 		list_move_tail(&rq->sched.link, pl);
1860 
1861 		for_each_waiter(p, rq) {
1862 			struct i915_request *w =
1863 				container_of(p->waiter, typeof(*w), sched);
1864 
1865 			if (p->flags & I915_DEPENDENCY_WEAK)
1866 				continue;
1867 
1868 			/* Leave semaphores spinning on the other engines */
1869 			if (w->engine != rq->engine)
1870 				continue;
1871 
1872 			/* No waiter should start before its signaler */
1873 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1874 				   i915_request_started(w) &&
1875 				   !i915_request_completed(rq));
1876 
1877 			GEM_BUG_ON(i915_request_is_active(w));
1878 			if (!i915_request_is_ready(w))
1879 				continue;
1880 
1881 			if (rq_prio(w) < rq_prio(rq))
1882 				continue;
1883 
1884 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1885 			list_move_tail(&w->sched.link, &list);
1886 		}
1887 
1888 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1889 	} while (rq);
1890 }
1891 
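/*
 * Used when a timeslice expires: unwind whatever is currently in flight
 * and push it (together with any of its waiters on this engine) to the
 * back of its priority level, so the next dequeue can pick a different
 * context.
 */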
1892 static void defer_active(struct intel_engine_cs *engine)
1893 {
1894 	struct i915_request *rq;
1895 
1896 	rq = __unwind_incomplete_requests(engine);
1897 	if (!rq)
1898 		return;
1899 
1900 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1901 }
1902 
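/*
 * Only worth arming the timeslice timer if another runnable request
 * (queued, virtual, or already on this engine) has a priority at least
 * matching the active request; otherwise let it run uninterrupted.
 */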
1903 static bool
1904 need_timeslice(const struct intel_engine_cs *engine,
1905 	       const struct i915_request *rq,
1906 	       const struct rb_node *rb)
1907 {
1908 	int hint;
1909 
1910 	if (!intel_engine_has_timeslices(engine))
1911 		return false;
1912 
1913 	hint = engine->execlists.queue_priority_hint;
1914 
1915 	if (rb) {
1916 		const struct virtual_engine *ve =
1917 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1918 		const struct intel_engine_cs *inflight =
1919 			intel_context_inflight(&ve->context);
1920 
1921 		if (!inflight || inflight == engine) {
1922 			struct i915_request *next;
1923 
1924 			rcu_read_lock();
1925 			next = READ_ONCE(ve->request);
1926 			if (next)
1927 				hint = max(hint, rq_prio(next));
1928 			rcu_read_unlock();
1929 		}
1930 	}
1931 
1932 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1933 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1934 
1935 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1936 	return hint >= effective_prio(rq);
1937 }
1938 
1939 static bool
1940 timeslice_yield(const struct intel_engine_execlists *el,
1941 		const struct i915_request *rq)
1942 {
1943 	/*
1944 	 * Once bitten, forever smitten!
1945 	 *
1946 	 * If the active context ever busy-waited on a semaphore,
1947 	 * it will be treated as a hog until the end of its timeslice (i.e.
1948 	 * until it is scheduled out and replaced by a new submission,
1949 	 * possibly even its own lite-restore). The HW only sends an interrupt
1950 	 * on the first miss, and we do know if that semaphore has been
1951 	 * on the first miss, and we do not know if that semaphore has been
1952 	 * safe, yield if it might be stuck -- it will be given a fresh
1953 	 * timeslice in the near future.
1954 	 */
1955 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1956 }
1957 
1958 static bool
1959 timeslice_expired(const struct intel_engine_execlists *el,
1960 		  const struct i915_request *rq)
1961 {
1962 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1963 }
1964 
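/* Priority of the request due to run after @rq, or the queue hint if @rq is last. */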
1965 static int
1966 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1967 {
1968 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1969 		return engine->execlists.queue_priority_hint;
1970 
1971 	return rq_prio(list_next_entry(rq, sched.link));
1972 }
1973 
1974 static inline unsigned long
1975 timeslice(const struct intel_engine_cs *engine)
1976 {
1977 	return READ_ONCE(engine->props.timeslice_duration_ms);
1978 }
1979 
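/*
 * Timeslice to program for the currently active context: 0 (timer
 * disabled) if nothing is running or nothing is worth switching to,
 * otherwise the engine's configured interval.
 */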
1980 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1981 {
1982 	const struct intel_engine_execlists *execlists = &engine->execlists;
1983 	const struct i915_request *rq = *execlists->active;
1984 
1985 	if (!rq || i915_request_completed(rq))
1986 		return 0;
1987 
1988 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1989 		return 0;
1990 
1991 	return timeslice(engine);
1992 }
1993 
1994 static void set_timeslice(struct intel_engine_cs *engine)
1995 {
1996 	unsigned long duration;
1997 
1998 	if (!intel_engine_has_timeslices(engine))
1999 		return;
2000 
2001 	duration = active_timeslice(engine);
2002 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2003 
2004 	set_timer_ms(&engine->execlists.timer, duration);
2005 }
2006 
2007 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2008 {
2009 	struct intel_engine_execlists *execlists = &engine->execlists;
2010 	unsigned long duration;
2011 
2012 	if (!intel_engine_has_timeslices(engine))
2013 		return;
2014 
2015 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2016 	if (prio == INT_MIN)
2017 		return;
2018 
2019 	if (timer_pending(&execlists->timer))
2020 		return;
2021 
2022 	duration = timeslice(engine);
2023 	ENGINE_TRACE(engine,
2024 		     "start timeslicing, prio:%d, interval:%lu",
2025 		     prio, duration);
2026 
2027 	set_timer_ms(&execlists->timer, duration);
2028 }
2029 
2030 static void record_preemption(struct intel_engine_execlists *execlists)
2031 {
2032 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2033 }
2034 
2035 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2036 					    const struct i915_request *rq)
2037 {
2038 	if (!rq)
2039 		return 0;
2040 
2041 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2042 	if (unlikely(intel_context_is_banned(rq->context)))
2043 		return 1;
2044 
2045 	return READ_ONCE(engine->props.preempt_timeout_ms);
2046 }
2047 
2048 static void set_preempt_timeout(struct intel_engine_cs *engine,
2049 				const struct i915_request *rq)
2050 {
2051 	if (!intel_engine_has_preempt_reset(engine))
2052 		return;
2053 
2054 	set_timer_ms(&engine->execlists.preempt,
2055 		     active_preempt_timeout(engine, rq));
2056 }
2057 
2058 static inline void clear_ports(struct i915_request **ports, int count)
2059 {
2060 	memset_p((void **)ports, NULL, count);
2061 }
2062 
2063 static void execlists_dequeue(struct intel_engine_cs *engine)
2064 {
2065 	struct intel_engine_execlists * const execlists = &engine->execlists;
2066 	struct i915_request **port = execlists->pending;
2067 	struct i915_request ** const last_port = port + execlists->port_mask;
2068 	struct i915_request * const *active;
2069 	struct i915_request *last;
2070 	struct rb_node *rb;
2071 	bool submit = false;
2072 
2073 	/*
2074 	 * Hardware submission is through 2 ports. Conceptually each port
2075 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2076 	 * static for a context, and unique to each, so we only execute
2077 	 * requests belonging to a single context from each ring. RING_HEAD
2078 	 * is maintained by the CS in the context image, it marks the place
2079 	 * where it got up to last time, and through RING_TAIL we tell the CS
2080 	 * where we want to execute up to this time.
2081 	 *
2082 	 * In this list the requests are in order of execution. Consecutive
2083 	 * requests from the same context are adjacent in the ringbuffer. We
2084 	 * can combine these requests into a single RING_TAIL update:
2085 	 *
2086 	 *              RING_HEAD...req1...req2
2087 	 *                                    ^- RING_TAIL
2088 	 * since to execute req2 the CS must first execute req1.
2089 	 *
2090 	 * Our goal then is to point each port to the end of a consecutive
2091 	 * sequence of requests, as that is the optimal (fewest wake ups
2092 	 * and context switches) submission.
2093 	 */
2094 
2095 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2096 		struct virtual_engine *ve =
2097 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2098 		struct i915_request *rq = READ_ONCE(ve->request);
2099 
2100 		if (!rq) { /* lazily cleanup after another engine handled rq */
2101 			rb_erase_cached(rb, &execlists->virtual);
2102 			RB_CLEAR_NODE(rb);
2103 			rb = rb_first_cached(&execlists->virtual);
2104 			continue;
2105 		}
2106 
2107 		if (!virtual_matches(ve, rq, engine)) {
2108 			rb = rb_next(rb);
2109 			continue;
2110 		}
2111 
2112 		break;
2113 	}
2114 
2115 	/*
2116 	 * If the queue is higher priority than the last
2117 	 * request in the currently active context, submit afresh.
2118 	 * We will resubmit again afterwards in case we need to split
2119 	 * the active context to interject the preemption request,
2120 	 * i.e. we will retrigger preemption following the ack in case
2121 	 * of trouble.
2122 	 */
2123 	active = READ_ONCE(execlists->active);
2124 
2125 	/*
2126 	 * In theory we can skip over completed contexts that have not
2127 	 * yet been processed by events (as those events are in flight):
2128 	 *
2129 	 * while ((last = *active) && i915_request_completed(last))
2130 	 *	active++;
2131 	 *
2132 	 * However, the GPU cannot handle this as it will ultimately
2133 	 * find itself trying to jump back into a context it has just
2134 	 * completed and barf.
2135 	 */
2136 
2137 	if ((last = *active)) {
2138 		if (need_preempt(engine, last, rb)) {
2139 			if (i915_request_completed(last)) {
2140 				tasklet_hi_schedule(&execlists->tasklet);
2141 				return;
2142 			}
2143 
2144 			ENGINE_TRACE(engine,
2145 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2146 				     last->fence.context,
2147 				     last->fence.seqno,
2148 				     last->sched.attr.priority,
2149 				     execlists->queue_priority_hint);
2150 			record_preemption(execlists);
2151 
2152 			/*
2153 			 * Don't let the RING_HEAD advance past the breadcrumb
2154 			 * as we unwind (and until we resubmit) so that we do
2155 			 * not accidentally tell it to go backwards.
2156 			 */
2157 			ring_set_paused(engine, 1);
2158 
2159 			/*
2160 			 * Note that we have not stopped the GPU at this point,
2161 			 * so we are unwinding the incomplete requests as they
2162 			 * remain inflight and so by the time we do complete
2163 			 * the preemption, some of the unwound requests may
2164 			 * complete!
2165 			 */
2166 			__unwind_incomplete_requests(engine);
2167 
2168 			last = NULL;
2169 		} else if (need_timeslice(engine, last, rb) &&
2170 			   timeslice_expired(execlists, last)) {
2171 			if (i915_request_completed(last)) {
2172 				tasklet_hi_schedule(&execlists->tasklet);
2173 				return;
2174 			}
2175 
2176 			ENGINE_TRACE(engine,
2177 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2178 				     last->fence.context,
2179 				     last->fence.seqno,
2180 				     last->sched.attr.priority,
2181 				     execlists->queue_priority_hint,
2182 				     yesno(timeslice_yield(execlists, last)));
2183 
2184 			ring_set_paused(engine, 1);
2185 			defer_active(engine);
2186 
2187 			/*
2188 			 * Unlike for preemption, if we rewind and continue
2189 			 * executing the same context as previously active,
2190 			 * the order of execution will remain the same and
2191 			 * the tail will only advance. We do not need to
2192 			 * force a full context restore, as a lite-restore
2193 			 * is sufficient to resample the monotonic TAIL.
2194 			 *
2195 			 * If we switch to any other context, similarly we
2196 			 * will not rewind TAIL of current context, and
2197 			 * normal save/restore will preserve state and allow
2198 			 * us to later continue executing the same request.
2199 			 */
2200 			last = NULL;
2201 		} else {
2202 			/*
2203 			 * Otherwise if we already have a request pending
2204 			 * for execution after the current one, we can
2205 			 * just wait until the next CS event before
2206 			 * queuing more. In either case we will force a
2207 			 * lite-restore preemption event, but if we wait
2208 			 * we hopefully coalesce several updates into a single
2209 			 * submission.
2210 			 */
2211 			if (!list_is_last(&last->sched.link,
2212 					  &engine->active.requests)) {
2213 				/*
2214 				 * Even if ELSP[1] is occupied and not worthy
2215 				 * of timeslices, our queue might be.
2216 				 */
2217 				start_timeslice(engine, queue_prio(execlists));
2218 				return;
2219 			}
2220 		}
2221 	}
2222 
2223 	while (rb) { /* XXX virtual is always taking precedence */
2224 		struct virtual_engine *ve =
2225 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2226 		struct i915_request *rq;
2227 
2228 		spin_lock(&ve->base.active.lock);
2229 
2230 		rq = ve->request;
2231 		if (unlikely(!rq)) { /* lost the race to a sibling */
2232 			spin_unlock(&ve->base.active.lock);
2233 			rb_erase_cached(rb, &execlists->virtual);
2234 			RB_CLEAR_NODE(rb);
2235 			rb = rb_first_cached(&execlists->virtual);
2236 			continue;
2237 		}
2238 
2239 		GEM_BUG_ON(rq != ve->request);
2240 		GEM_BUG_ON(rq->engine != &ve->base);
2241 		GEM_BUG_ON(rq->context != &ve->context);
2242 
2243 		if (rq_prio(rq) >= queue_prio(execlists)) {
2244 			if (!virtual_matches(ve, rq, engine)) {
2245 				spin_unlock(&ve->base.active.lock);
2246 				rb = rb_next(rb);
2247 				continue;
2248 			}
2249 
2250 			if (last && !can_merge_rq(last, rq)) {
2251 				spin_unlock(&ve->base.active.lock);
2252 				start_timeslice(engine, rq_prio(rq));
2253 				return; /* leave this for another sibling */
2254 			}
2255 
2256 			ENGINE_TRACE(engine,
2257 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2258 				     rq->fence.context,
2259 				     rq->fence.seqno,
2260 				     i915_request_completed(rq) ? "!" :
2261 				     i915_request_started(rq) ? "*" :
2262 				     "",
2263 				     yesno(engine != ve->siblings[0]));
2264 
2265 			WRITE_ONCE(ve->request, NULL);
2266 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2267 				   INT_MIN);
2268 			rb_erase_cached(rb, &execlists->virtual);
2269 			RB_CLEAR_NODE(rb);
2270 
2271 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2272 			WRITE_ONCE(rq->engine, engine);
2273 
2274 			if (engine != ve->siblings[0]) {
2275 				u32 *regs = ve->context.lrc_reg_state;
2276 				unsigned int n;
2277 
2278 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2279 
2280 				if (!intel_engine_has_relative_mmio(engine))
2281 					virtual_update_register_offsets(regs,
2282 									engine);
2283 
2284 				if (!list_empty(&ve->context.signals))
2285 					virtual_xfer_breadcrumbs(ve);
2286 
2287 				/*
2288 				 * Move the bound engine to the top of the list
2289 				 * for future execution. We then kick this
2290 				 * tasklet first before checking others, so that
2291 				 * we preferentially reuse this set of bound
2292 				 * registers.
2293 				 */
2294 				for (n = 1; n < ve->num_siblings; n++) {
2295 					if (ve->siblings[n] == engine) {
2296 						swap(ve->siblings[n],
2297 						     ve->siblings[0]);
2298 						break;
2299 					}
2300 				}
2301 
2302 				GEM_BUG_ON(ve->siblings[0] != engine);
2303 			}
2304 
2305 			if (__i915_request_submit(rq)) {
2306 				submit = true;
2307 				last = rq;
2308 			}
2309 			i915_request_put(rq);
2310 
2311 			/*
2312 			 * Hmm, we have a bunch of virtual engine requests,
2313 			 * but the first one was already completed (thanks
2314 			 * preempt-to-busy!). Keep looking at the veng queue
2315 			 * until we have no more relevant requests (i.e.
2316 			 * the normal submit queue has higher priority).
2317 			 */
2318 			if (!submit) {
2319 				spin_unlock(&ve->base.active.lock);
2320 				rb = rb_first_cached(&execlists->virtual);
2321 				continue;
2322 			}
2323 		}
2324 
2325 		spin_unlock(&ve->base.active.lock);
2326 		break;
2327 	}
2328 
2329 	while ((rb = rb_first_cached(&execlists->queue))) {
2330 		struct i915_priolist *p = to_priolist(rb);
2331 		struct i915_request *rq, *rn;
2332 		int i;
2333 
2334 		priolist_for_each_request_consume(rq, rn, p, i) {
2335 			bool merge = true;
2336 
2337 			/*
2338 			 * Can we combine this request with the current port?
2339 			 * It has to be the same context/ringbuffer and not
2340 			 * have any exceptions (e.g. GVT saying never to
2341 			 * combine contexts).
2342 			 *
2343 			 * If we can combine the requests, we can execute both
2344 			 * by updating the RING_TAIL to point to the end of the
2345 			 * second request, and so we never need to tell the
2346 			 * hardware about the first.
2347 			 */
2348 			if (last && !can_merge_rq(last, rq)) {
2349 				/*
2350 				 * If we are on the second port and cannot
2351 				 * combine this request with the last, then we
2352 				 * are done.
2353 				 */
2354 				if (port == last_port)
2355 					goto done;
2356 
2357 				/*
2358 				 * We must not populate both ELSP[] with the
2359 				 * same LRCA, i.e. we must submit 2 different
2360 				 * contexts if we submit 2 ELSP.
2361 				 */
2362 				if (last->context == rq->context)
2363 					goto done;
2364 
2365 				if (i915_request_has_sentinel(last))
2366 					goto done;
2367 
2368 				/*
2369 				 * If GVT overrides us we only ever submit
2370 				 * port[0], leaving port[1] empty. Note that we
2371 				 * also have to be careful that we don't queue
2372 				 * the same context (even though a different
2373 				 * request) to the second port.
2374 				 */
2375 				if (ctx_single_port_submission(last->context) ||
2376 				    ctx_single_port_submission(rq->context))
2377 					goto done;
2378 
2379 				merge = false;
2380 			}
2381 
2382 			if (__i915_request_submit(rq)) {
2383 				if (!merge) {
2384 					*port = execlists_schedule_in(last, port - execlists->pending);
2385 					port++;
2386 					last = NULL;
2387 				}
2388 
2389 				GEM_BUG_ON(last &&
2390 					   !can_merge_ctx(last->context,
2391 							  rq->context));
2392 				GEM_BUG_ON(last &&
2393 					   i915_seqno_passed(last->fence.seqno,
2394 							     rq->fence.seqno));
2395 
2396 				submit = true;
2397 				last = rq;
2398 			}
2399 		}
2400 
2401 		rb_erase_cached(&p->node, &execlists->queue);
2402 		i915_priolist_free(p);
2403 	}
2404 
2405 done:
2406 	/*
2407 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2408 	 *
2409 	 * We choose the priority hint such that if we add a request of greater
2410 	 * priority than this, we kick the submission tasklet to decide on
2411 	 * the right order of submitting the requests to hardware. We must
2412 	 * also be prepared to reorder requests as they are in-flight on the
2413 	 * HW. We derive the priority hint then as the first "hole" in
2414 	 * the HW submission ports and if there are no available slots,
2415 	 * the priority of the lowest executing request, i.e. last.
2416 	 *
2417 	 * When we do receive a higher priority request ready to run from the
2418 	 * user, see queue_request(), the priority hint is bumped to that
2419 	 * request triggering preemption on the next dequeue (or subsequent
2420 	 * interrupt for secondary ports).
2421 	 */
2422 	execlists->queue_priority_hint = queue_prio(execlists);
2423 
2424 	if (submit) {
2425 		*port = execlists_schedule_in(last, port - execlists->pending);
2426 		execlists->switch_priority_hint =
2427 			switch_prio(engine, *execlists->pending);
2428 
2429 		/*
2430 		 * Skip if we ended up with exactly the same set of requests,
2431 		 * e.g. trying to timeslice a pair of ordered contexts
2432 		 */
2433 		if (!memcmp(active, execlists->pending,
2434 			    (port - execlists->pending + 1) * sizeof(*port))) {
2435 			do
2436 				execlists_schedule_out(fetch_and_zero(port));
2437 			while (port-- != execlists->pending);
2438 
2439 			goto skip_submit;
2440 		}
2441 		clear_ports(port + 1, last_port - port);
2442 
2443 		WRITE_ONCE(execlists->yield, -1);
2444 		set_preempt_timeout(engine, *active);
2445 		execlists_submit_ports(engine);
2446 	} else {
2447 		start_timeslice(engine, execlists->queue_priority_hint);
2448 skip_submit:
2449 		ring_set_paused(engine, 0);
2450 	}
2451 }
2452 
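/*
 * Release every request still sitting in pending[] and inflight[]
 * (used from the reset paths), leaving execlists->active pointing at
 * the emptied inflight[] array.
 */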
2453 static void
2454 cancel_port_requests(struct intel_engine_execlists * const execlists)
2455 {
2456 	struct i915_request * const *port;
2457 
2458 	for (port = execlists->pending; *port; port++)
2459 		execlists_schedule_out(*port);
2460 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2461 
2462 	/* Mark the end of active before we overwrite *active */
2463 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2464 		execlists_schedule_out(*port);
2465 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2466 
2467 	smp_wmb(); /* complete the seqlock for execlists_active() */
2468 	WRITE_ONCE(execlists->active, execlists->inflight);
2469 }
2470 
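/*
 * Flush the cachelines covering the CSB so that the next read observes
 * the HW write rather than stale data (see the gen11 note at the end of
 * process_csb()).
 */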
2471 static inline void
2472 invalidate_csb_entries(const u32 *first, const u32 *last)
2473 {
2474 	clflush((void *)first);
2475 	clflush((void *)last);
2476 }
2477 
2478 /*
2479  * Starting with Gen12, the status has a new format:
2480  *
2481  *     bit  0:     switched to new queue
2482  *     bit  1:     reserved
2483  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2484  *                 switch detail is set to "wait on semaphore"
2485  *     bits 3-5:   engine class
2486  *     bits 6-11:  engine instance
2487  *     bits 12-14: reserved
2488  *     bits 15-25: sw context id of the lrc the GT switched to
2489  *     bits 26-31: sw counter of the lrc the GT switched to
2490  *     bits 32-35: context switch detail
2491  *                  - 0: ctx complete
2492  *                  - 1: wait on sync flip
2493  *                  - 2: wait on vblank
2494  *                  - 3: wait on scanline
2495  *                  - 4: wait on semaphore
2496  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2497  *                       WAIT_FOR_EVENT)
2498  *     bit  36:    reserved
2499  *     bits 37-43: wait detail (for switch detail 1 to 4)
2500  *     bits 44-46: reserved
2501  *     bits 47-57: sw context id of the lrc the GT switched away from
2502  *     bits 58-63: sw counter of the lrc the GT switched away from
2503  */
2504 static inline bool
2505 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2506 {
2507 	u32 lower_dw = csb[0];
2508 	u32 upper_dw = csb[1];
2509 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2510 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2511 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2512 
2513 	/*
2514 	 * The context switch detail is not guaranteed to be 5 when a preemption
2515 	 * occurs, so we can't just check for that. The check below works for
2516 	 * all the cases we care about, including preemptions of WAIT
2517 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2518 	 * would require some extra handling, but we don't support that.
2519 	 */
2520 	if (!ctx_away_valid || new_queue) {
2521 		GEM_BUG_ON(!ctx_to_valid);
2522 		return true;
2523 	}
2524 
2525 	/*
2526 	 * switch detail = 5 is covered by the case above and we do not expect a
2527 	 * context switch on an unsuccessful wait instruction since we always
2528 	 * use polling mode.
2529 	 */
2530 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2531 	return false;
2532 }
2533 
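/*
 * Pre-gen12 CSB: promote pending[] to inflight[] when the event reports
 * either an idle->active transition or a preemption.
 */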
2534 static inline bool
2535 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2536 {
2537 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2538 }
2539 
2540 static void process_csb(struct intel_engine_cs *engine)
2541 {
2542 	struct intel_engine_execlists * const execlists = &engine->execlists;
2543 	const u32 * const buf = execlists->csb_status;
2544 	const u8 num_entries = execlists->csb_size;
2545 	u8 head, tail;
2546 
2547 	/*
2548 	 * As we modify our execlists state tracking we require exclusive
2549 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2550 	 * and we assume that is only inside the reset paths and so serialised.
2551 	 */
2552 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2553 		   !reset_in_progress(execlists));
2554 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2555 
2556 	/*
2557 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2558 	 * When reading from the csb_write mmio register, we have to be
2559 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2560 	 * the low 4bits. As it happens we know the next 4bits are always
2561 	 * zero and so we can simply mask off the low u8 of the register
2562 	 * and treat it identically to reading from the HWSP (without having
2563 	 * to use explicit shifting and masking, and probably bifurcating
2564 	 * the code to handle the legacy mmio read).
2565 	 */
2566 	head = execlists->csb_head;
2567 	tail = READ_ONCE(*execlists->csb_write);
2568 	if (unlikely(head == tail))
2569 		return;
2570 
2571 	/*
2572 	 * Hopefully paired with a wmb() in HW!
2573 	 *
2574 	 * We must complete the read of the write pointer before any reads
2575 	 * from the CSB, so that we do not see stale values. Without an rmb
2576 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2577 	 * we perform the READ_ONCE(*csb_write).
2578 	 */
2579 	rmb();
2580 
2581 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2582 	do {
2583 		bool promote;
2584 
2585 		if (++head == num_entries)
2586 			head = 0;
2587 
2588 		/*
2589 		 * We are flying near dragons again.
2590 		 *
2591 		 * We hold a reference to the request in execlist_port[]
2592 		 * but no more than that. We are operating in softirq
2593 		 * context and so cannot hold any mutex or sleep. That
2594 		 * prevents us from stopping the requests we are processing
2595 		 * in port[] from being retired simultaneously (the
2596 		 * breadcrumb will be complete before we see the
2597 		 * context-switch). As we only hold the reference to the
2598 		 * request, any pointer chasing underneath the request
2599 		 * is subject to a potential use-after-free. Thus we
2600 		 * store all of the bookkeeping within port[] as
2601 		 * required, and avoid using unguarded pointers beneath
2602 		 * request itself. The same applies to the atomic
2603 		 * status notifier.
2604 		 */
2605 
2606 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2607 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2608 
2609 		if (INTEL_GEN(engine->i915) >= 12)
2610 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2611 		else
2612 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2613 		if (promote) {
2614 			struct i915_request * const *old = execlists->active;
2615 
2616 			ring_set_paused(engine, 0);
2617 
2618 			/* Point active to the new ELSP; prevent overwriting */
2619 			WRITE_ONCE(execlists->active, execlists->pending);
2620 			smp_wmb(); /* notify execlists_active() */
2621 
2622 			/* cancel old inflight, prepare for switch */
2623 			trace_ports(execlists, "preempted", old);
2624 			while (*old)
2625 				execlists_schedule_out(*old++);
2626 
2627 			/* switch pending to inflight */
2628 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2629 			memcpy(execlists->inflight,
2630 			       execlists->pending,
2631 			       execlists_num_ports(execlists) *
2632 			       sizeof(*execlists->pending));
2633 			smp_wmb(); /* complete the seqlock */
2634 			WRITE_ONCE(execlists->active, execlists->inflight);
2635 
2636 			WRITE_ONCE(execlists->pending[0], NULL);
2637 		} else {
2638 			GEM_BUG_ON(!*execlists->active);
2639 
2640 			/* port0 completed, advanced to port1 */
2641 			trace_ports(execlists, "completed", execlists->active);
2642 
2643 			/*
2644 			 * We rely on the hardware being strongly
2645 			 * ordered, that the breadcrumb write is
2646 			 * coherent (visible from the CPU) before the
2647 			 * user interrupt is processed. One might assume
2648 			 * that, since the breadcrumb write lands before both
2649 			 * the user interrupt and the CS event for the context
2650 			 * switch, it would therefore be visible before the
2651 			 * CS event itself...
2652 			 */
2653 			if (GEM_SHOW_DEBUG() &&
2654 			    !i915_request_completed(*execlists->active)) {
2655 				struct i915_request *rq = *execlists->active;
2656 				const u32 *regs __maybe_unused =
2657 					rq->context->lrc_reg_state;
2658 
2659 				ENGINE_TRACE(engine,
2660 					     "context completed before request!\n");
2661 				ENGINE_TRACE(engine,
2662 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2663 					     ENGINE_READ(engine, RING_START),
2664 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2665 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2666 					     ENGINE_READ(engine, RING_CTL),
2667 					     ENGINE_READ(engine, RING_MI_MODE));
2668 				ENGINE_TRACE(engine,
2669 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2670 					     i915_ggtt_offset(rq->ring->vma),
2671 					     rq->head, rq->tail,
2672 					     rq->fence.context,
2673 					     lower_32_bits(rq->fence.seqno),
2674 					     hwsp_seqno(rq));
2675 				ENGINE_TRACE(engine,
2676 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2677 					     regs[CTX_RING_START],
2678 					     regs[CTX_RING_HEAD],
2679 					     regs[CTX_RING_TAIL]);
2680 			}
2681 
2682 			execlists_schedule_out(*execlists->active++);
2683 
2684 			GEM_BUG_ON(execlists->active - execlists->inflight >
2685 				   execlists_num_ports(execlists));
2686 		}
2687 	} while (head != tail);
2688 
2689 	execlists->csb_head = head;
2690 	set_timeslice(engine);
2691 
2692 	/*
2693 	 * Gen11 has proven to fail wrt global observation point between
2694 	 * entry and tail update, failing on the ordering and thus
2695 	 * we see an old entry in the context status buffer.
2696 	 *
2697 	 * Forcibly evict the entries before the next gpu csb update,
2698 	 * to increase the odds that we get fresh entries even with
2699 	 * non-working hardware. The cost of doing so mostly comes out
2700 	 * in the wash, as the hardware, working or not, will need to do
2701 	 * the invalidation beforehand.
2702 	 */
2703 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2704 }
2705 
2706 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2707 {
2708 	lockdep_assert_held(&engine->active.lock);
2709 	if (!READ_ONCE(engine->execlists.pending[0])) {
2710 		rcu_read_lock(); /* protect peeking at execlists->active */
2711 		execlists_dequeue(engine);
2712 		rcu_read_unlock();
2713 	}
2714 }
2715 
2716 static void __execlists_hold(struct i915_request *rq)
2717 {
2718 	LIST_HEAD(list);
2719 
2720 	do {
2721 		struct i915_dependency *p;
2722 
2723 		if (i915_request_is_active(rq))
2724 			__i915_request_unsubmit(rq);
2725 
2726 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2727 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2728 		i915_request_set_hold(rq);
2729 		RQ_TRACE(rq, "on hold\n");
2730 
2731 		for_each_waiter(p, rq) {
2732 			struct i915_request *w =
2733 				container_of(p->waiter, typeof(*w), sched);
2734 
2735 			/* Leave semaphores spinning on the other engines */
2736 			if (w->engine != rq->engine)
2737 				continue;
2738 
2739 			if (!i915_request_is_ready(w))
2740 				continue;
2741 
2742 			if (i915_request_completed(w))
2743 				continue;
2744 
2745 			if (i915_request_on_hold(w))
2746 				continue;
2747 
2748 			list_move_tail(&w->sched.link, &list);
2749 		}
2750 
2751 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2752 	} while (rq);
2753 }
2754 
2755 static bool execlists_hold(struct intel_engine_cs *engine,
2756 			   struct i915_request *rq)
2757 {
2758 	spin_lock_irq(&engine->active.lock);
2759 
2760 	if (i915_request_completed(rq)) { /* too late! */
2761 		rq = NULL;
2762 		goto unlock;
2763 	}
2764 
2765 	if (rq->engine != engine) { /* preempted virtual engine */
2766 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2767 
2768 		/*
2769 		 * intel_context_inflight() is only protected by virtue
2770 		 * of process_csb() being called only by the tasklet (or
2771 		 * directly from inside reset while the tasklet is suspended).
2772 		 * Assert that neither of those are allowed to run while we
2773 		 * poke at the request queues.
2774 		 */
2775 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2776 
2777 		/*
2778 		 * An unsubmitted request along a virtual engine will
2779 		 * remain on the active (this) engine until we are able
2780 		 * to process the context switch away (and so mark the
2781 		 * context as no longer in flight). That cannot have happened
2782 		 * yet, otherwise we would not be hanging!
2783 		 */
2784 		spin_lock(&ve->base.active.lock);
2785 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2786 		GEM_BUG_ON(ve->request != rq);
2787 		ve->request = NULL;
2788 		spin_unlock(&ve->base.active.lock);
2789 		i915_request_put(rq);
2790 
2791 		rq->engine = engine;
2792 	}
2793 
2794 	/*
2795 	 * Transfer this request onto the hold queue to prevent it
2796 	 * being resubmitted to HW (and potentially completed) before we have
2797 	 * released it. Since we may have already submitted following
2798 	 * requests, we need to remove those as well.
2799 	 */
2800 	GEM_BUG_ON(i915_request_on_hold(rq));
2801 	GEM_BUG_ON(rq->engine != engine);
2802 	__execlists_hold(rq);
2803 	GEM_BUG_ON(list_empty(&engine->active.hold));
2804 
2805 unlock:
2806 	spin_unlock_irq(&engine->active.lock);
2807 	return rq;
2808 }
2809 
2810 static bool hold_request(const struct i915_request *rq)
2811 {
2812 	struct i915_dependency *p;
2813 	bool result = false;
2814 
2815 	/*
2816 	 * If one of our ancestors is on hold, we must also be on hold,
2817 	 * otherwise we will bypass it and execute before it.
2818 	 */
2819 	rcu_read_lock();
2820 	for_each_signaler(p, rq) {
2821 		const struct i915_request *s =
2822 			container_of(p->signaler, typeof(*s), sched);
2823 
2824 		if (s->engine != rq->engine)
2825 			continue;
2826 
2827 		result = i915_request_on_hold(s);
2828 		if (result)
2829 			break;
2830 	}
2831 	rcu_read_unlock();
2832 
2833 	return result;
2834 }
2835 
2836 static void __execlists_unhold(struct i915_request *rq)
2837 {
2838 	LIST_HEAD(list);
2839 
2840 	do {
2841 		struct i915_dependency *p;
2842 
2843 		RQ_TRACE(rq, "hold release\n");
2844 
2845 		GEM_BUG_ON(!i915_request_on_hold(rq));
2846 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2847 
2848 		i915_request_clear_hold(rq);
2849 		list_move_tail(&rq->sched.link,
2850 			       i915_sched_lookup_priolist(rq->engine,
2851 							  rq_prio(rq)));
2852 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2853 
2854 		/* Also release any children on this engine that are ready */
2855 		for_each_waiter(p, rq) {
2856 			struct i915_request *w =
2857 				container_of(p->waiter, typeof(*w), sched);
2858 
2859 			/* Propagate any change in error status */
2860 			if (rq->fence.error)
2861 				i915_request_set_error_once(w, rq->fence.error);
2862 
2863 			if (w->engine != rq->engine)
2864 				continue;
2865 
2866 			if (!i915_request_on_hold(w))
2867 				continue;
2868 
2869 			/* Check that no other parents are also on hold */
2870 			if (hold_request(w))
2871 				continue;
2872 
2873 			list_move_tail(&w->sched.link, &list);
2874 		}
2875 
2876 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2877 	} while (rq);
2878 }
2879 
2880 static void execlists_unhold(struct intel_engine_cs *engine,
2881 			     struct i915_request *rq)
2882 {
2883 	spin_lock_irq(&engine->active.lock);
2884 
2885 	/*
2886 	 * Move this request back to the priority queue, and all of its
2887 	 * children and grandchildren that were suspended along with it.
2888 	 */
2889 	__execlists_unhold(rq);
2890 
2891 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2892 		engine->execlists.queue_priority_hint = rq_prio(rq);
2893 		tasklet_hi_schedule(&engine->execlists.tasklet);
2894 	}
2895 
2896 	spin_unlock_irq(&engine->active.lock);
2897 }
2898 
2899 struct execlists_capture {
2900 	struct work_struct work;
2901 	struct i915_request *rq;
2902 	struct i915_gpu_coredump *error;
2903 };
2904 
2905 static void execlists_capture_work(struct work_struct *work)
2906 {
2907 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2908 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2909 	struct intel_engine_cs *engine = cap->rq->engine;
2910 	struct intel_gt_coredump *gt = cap->error->gt;
2911 	struct intel_engine_capture_vma *vma;
2912 
2913 	/* Compress all the objects attached to the request, slow! */
2914 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2915 	if (vma) {
2916 		struct i915_vma_compress *compress =
2917 			i915_vma_capture_prepare(gt);
2918 
2919 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2920 		i915_vma_capture_finish(gt, compress);
2921 	}
2922 
2923 	gt->simulated = gt->engine->simulated;
2924 	cap->error->simulated = gt->simulated;
2925 
2926 	/* Publish the error state, and announce it to the world */
2927 	i915_error_state_store(cap->error);
2928 	i915_gpu_coredump_put(cap->error);
2929 
2930 	/* Return this request and all that depend upon it for signaling */
2931 	execlists_unhold(engine, cap->rq);
2932 	i915_request_put(cap->rq);
2933 
2934 	kfree(cap);
2935 }
2936 
2937 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2938 {
2939 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2940 	struct execlists_capture *cap;
2941 
2942 	cap = kmalloc(sizeof(*cap), gfp);
2943 	if (!cap)
2944 		return NULL;
2945 
2946 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2947 	if (!cap->error)
2948 		goto err_cap;
2949 
2950 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2951 	if (!cap->error->gt)
2952 		goto err_gpu;
2953 
2954 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2955 	if (!cap->error->gt->engine)
2956 		goto err_gt;
2957 
2958 	return cap;
2959 
2960 err_gt:
2961 	kfree(cap->error->gt);
2962 err_gpu:
2963 	kfree(cap->error);
2964 err_cap:
2965 	kfree(cap);
2966 	return NULL;
2967 }
2968 
2969 static struct i915_request *
2970 active_context(struct intel_engine_cs *engine, u32 ccid)
2971 {
2972 	const struct intel_engine_execlists * const el = &engine->execlists;
2973 	struct i915_request * const *port, *rq;
2974 
2975 	/*
2976 	 * Use the most recent result from process_csb(), but just in case
2977 	 * we trigger an error (via interrupt) before the first CS event has
2978 	 * been written, peek at the next submission.
2979 	 */
2980 
2981 	for (port = el->active; (rq = *port); port++) {
2982 		if (rq->context->lrc.ccid == ccid) {
2983 			ENGINE_TRACE(engine,
2984 				     "ccid found at active:%zd\n",
2985 				     port - el->active);
2986 			return rq;
2987 		}
2988 	}
2989 
2990 	for (port = el->pending; (rq = *port); port++) {
2991 		if (rq->context->lrc.ccid == ccid) {
2992 			ENGINE_TRACE(engine,
2993 				     "ccid found at pending:%zd\n",
2994 				     port - el->pending);
2995 			return rq;
2996 		}
2997 	}
2998 
2999 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3000 	return NULL;
3001 }
3002 
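/* Read the ccid of the context currently executing on the engine (EXECLIST_STATUS_HI). */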
3003 static u32 active_ccid(struct intel_engine_cs *engine)
3004 {
3005 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3006 }
3007 
3008 static bool execlists_capture(struct intel_engine_cs *engine)
3009 {
3010 	struct execlists_capture *cap;
3011 
3012 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3013 		return true;
3014 
3015 	/*
3016 	 * We need to _quickly_ capture the engine state before we reset.
3017 	 * We are inside an atomic section (softirq) here and we are delaying
3018 	 * the forced preemption event.
3019 	 */
3020 	cap = capture_regs(engine);
3021 	if (!cap)
3022 		return true;
3023 
3024 	spin_lock_irq(&engine->active.lock);
3025 	cap->rq = active_context(engine, active_ccid(engine));
3026 	if (cap->rq) {
3027 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3028 		cap->rq = i915_request_get_rcu(cap->rq);
3029 	}
3030 	spin_unlock_irq(&engine->active.lock);
3031 	if (!cap->rq)
3032 		goto err_free;
3033 
3034 	/*
3035 	 * Remove the request from the execlists queue, and take ownership
3036 	 * of the request. We pass it to our worker who will _slowly_ compress
3037 	 * all the pages the _user_ requested for debugging their batch, after
3038 	 * which we return it to the queue for signaling.
3039 	 *
3040 	 * By removing them from the execlists queue, we also remove the
3041 	 * requests from being processed by __unwind_incomplete_requests()
3042 	 * during the intel_engine_reset(), and so they will *not* be replayed
3043 	 * afterwards.
3044 	 *
3045 	 * Note that because we have not yet reset the engine at this point,
3046 	 * it is possible that the request we have identified as being
3047 	 * guilty did in fact complete, and we will then hit an arbitration
3048 	 * point allowing the outstanding preemption to succeed. The likelihood
3049 	 * of that is very low (as capturing of the engine registers should be
3050 	 * fast enough to run inside an irq-off atomic section!), so we will
3051 	 * simply hold that request accountable for being non-preemptible
3052 	 * long enough to force the reset.
3053 	 */
3054 	if (!execlists_hold(engine, cap->rq))
3055 		goto err_rq;
3056 
3057 	INIT_WORK(&cap->work, execlists_capture_work);
3058 	schedule_work(&cap->work);
3059 	return true;
3060 
3061 err_rq:
3062 	i915_request_put(cap->rq);
3063 err_free:
3064 	i915_gpu_coredump_put(cap->error);
3065 	kfree(cap);
3066 	return false;
3067 }
3068 
3069 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3070 {
3071 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3072 	unsigned long *lock = &engine->gt->reset.flags;
3073 
3074 	if (!intel_has_reset_engine(engine->gt))
3075 		return;
3076 
3077 	if (test_and_set_bit(bit, lock))
3078 		return;
3079 
3080 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3081 
3082 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3083 	tasklet_disable_nosync(&engine->execlists.tasklet);
3084 
3085 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3086 	if (execlists_capture(engine))
3087 		intel_engine_reset(engine, msg);
3088 	else
3089 		ring_set_paused(engine, 0);
3090 
3091 	tasklet_enable(&engine->execlists.tasklet);
3092 	clear_and_wake_up_bit(bit, lock);
3093 }
3094 
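/*
 * Returns true if the forced-preemption timer has fired while a
 * preemption (execlists->pending[0]) is still outstanding.
 */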
3095 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3096 {
3097 	const struct timer_list *t = &engine->execlists.preempt;
3098 
3099 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3100 		return false;
3101 
3102 	if (!timer_expired(t))
3103 		return false;
3104 
3105 	return READ_ONCE(engine->execlists.pending[0]);
3106 }
3107 
3108 /*
3109  * Check the unread Context Status Buffers and manage the submission of new
3110  * contexts to the ELSP accordingly.
3111  */
3112 static void execlists_submission_tasklet(unsigned long data)
3113 {
3114 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3115 	bool timeout = preempt_timeout(engine);
3116 
3117 	process_csb(engine);
3118 
3119 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3120 		engine->execlists.error_interrupt = 0;
3121 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
3122 			execlists_reset(engine, "CS error");
3123 	}
3124 
3125 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3126 		unsigned long flags;
3127 
3128 		spin_lock_irqsave(&engine->active.lock, flags);
3129 		__execlists_submission_tasklet(engine);
3130 		spin_unlock_irqrestore(&engine->active.lock, flags);
3131 
3132 		/* Recheck after serialising with direct-submission */
3133 		if (unlikely(timeout && preempt_timeout(engine)))
3134 			execlists_reset(engine, "preemption time out");
3135 	}
3136 }
3137 
3138 static void __execlists_kick(struct intel_engine_execlists *execlists)
3139 {
3140 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3141 	tasklet_hi_schedule(&execlists->tasklet);
3142 }
3143 
3144 #define execlists_kick(t, member) \
3145 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3146 
3147 static void execlists_timeslice(struct timer_list *timer)
3148 {
3149 	execlists_kick(timer, timer);
3150 }
3151 
3152 static void execlists_preempt(struct timer_list *timer)
3153 {
3154 	execlists_kick(timer, preempt);
3155 }
3156 
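/* Add the request to the engine's priority queue and mark it as queued. */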
3157 static void queue_request(struct intel_engine_cs *engine,
3158 			  struct i915_request *rq)
3159 {
3160 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3161 	list_add_tail(&rq->sched.link,
3162 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3163 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3164 }
3165 
3166 static void __submit_queue_imm(struct intel_engine_cs *engine)
3167 {
3168 	struct intel_engine_execlists * const execlists = &engine->execlists;
3169 
3170 	if (reset_in_progress(execlists))
3171 		return; /* defer until we restart the engine following reset */
3172 
3173 	__execlists_submission_tasklet(engine);
3174 }
3175 
3176 static void submit_queue(struct intel_engine_cs *engine,
3177 			 const struct i915_request *rq)
3178 {
3179 	struct intel_engine_execlists *execlists = &engine->execlists;
3180 
3181 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3182 		return;
3183 
3184 	execlists->queue_priority_hint = rq_prio(rq);
3185 	__submit_queue_imm(engine);
3186 }
3187 
3188 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3189 			     const struct i915_request *rq)
3190 {
3191 	GEM_BUG_ON(i915_request_on_hold(rq));
3192 	return !list_empty(&engine->active.hold) && hold_request(rq);
3193 }
3194 
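/*
 * Opportunistically process any outstanding CSB events before taking the
 * submission lock, in the hope of clearing execlists->pending[] so the
 * new request can be submitted directly.
 */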
3195 static void flush_csb(struct intel_engine_cs *engine)
3196 {
3197 	struct intel_engine_execlists *el = &engine->execlists;
3198 
3199 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3200 		if (!reset_in_progress(el))
3201 			process_csb(engine);
3202 		tasklet_unlock(&el->tasklet);
3203 	}
3204 }
3205 
3206 static void execlists_submit_request(struct i915_request *request)
3207 {
3208 	struct intel_engine_cs *engine = request->engine;
3209 	unsigned long flags;
3210 
3211 	/* Hopefully we clear execlists->pending[] to let us through */
3212 	flush_csb(engine);
3213 
3214 	/* Will be called from irq-context when using foreign fences. */
3215 	spin_lock_irqsave(&engine->active.lock, flags);
3216 
3217 	if (unlikely(ancestor_on_hold(engine, request))) {
3218 		RQ_TRACE(request, "ancestor on hold\n");
3219 		list_add_tail(&request->sched.link, &engine->active.hold);
3220 		i915_request_set_hold(request);
3221 	} else {
3222 		queue_request(engine, request);
3223 
3224 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3225 		GEM_BUG_ON(list_empty(&request->sched.link));
3226 
3227 		submit_queue(engine, request);
3228 	}
3229 
3230 	spin_unlock_irqrestore(&engine->active.lock, flags);
3231 }
3232 
3233 static void __execlists_context_fini(struct intel_context *ce)
3234 {
3235 	intel_ring_put(ce->ring);
3236 	i915_vma_put(ce->state);
3237 }
3238 
3239 static void execlists_context_destroy(struct kref *kref)
3240 {
3241 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3242 
3243 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3244 	GEM_BUG_ON(intel_context_is_pinned(ce));
3245 
3246 	if (ce->state)
3247 		__execlists_context_fini(ce);
3248 
3249 	intel_context_fini(ce);
3250 	intel_context_free(ce);
3251 }
3252 
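/*
 * With CONFIG_DRM_I915_DEBUG_GEM, poison the page after the context
 * image so that check_redzone() can detect writes past the end.
 */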
3253 static void
3254 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3255 {
3256 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3257 		return;
3258 
3259 	vaddr += engine->context_size;
3260 
3261 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3262 }
3263 
3264 static void
3265 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3266 {
3267 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3268 		return;
3269 
3270 	vaddr += engine->context_size;
3271 
3272 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3273 		drm_err_once(&engine->i915->drm,
3274 			     "%s context redzone overwritten!\n",
3275 			     engine->name);
3276 }
3277 
3278 static void execlists_context_unpin(struct intel_context *ce)
3279 {
3280 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3281 		      ce->engine);
3282 
3283 	i915_gem_object_unpin_map(ce->state->obj);
3284 }
3285 
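/*
 * Indirect context w/a: reload the saved CTX_TIMESTAMP from the context
 * image into GPR0 and copy it into RING_CTX_TIMESTAMP (written twice,
 * presumably to guarantee the value has landed before execution resumes).
 */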
3286 static u32 *
3287 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3288 {
3289 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3290 		MI_SRM_LRM_GLOBAL_GTT |
3291 		MI_LRI_LRM_CS_MMIO;
3292 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3293 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3294 		CTX_TIMESTAMP * sizeof(u32);
3295 	*cs++ = 0;
3296 
3297 	*cs++ = MI_LOAD_REGISTER_REG |
3298 		MI_LRR_SOURCE_CS_MMIO |
3299 		MI_LRI_LRM_CS_MMIO;
3300 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3301 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3302 
3303 	*cs++ = MI_LOAD_REGISTER_REG |
3304 		MI_LRR_SOURCE_CS_MMIO |
3305 		MI_LRI_LRM_CS_MMIO;
3306 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3307 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3308 
3309 	return cs;
3310 }
3311 
3312 static u32 *
3313 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3314 {
3315 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3316 
3317 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3318 		MI_SRM_LRM_GLOBAL_GTT |
3319 		MI_LRI_LRM_CS_MMIO;
3320 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3321 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3322 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3323 	*cs++ = 0;
3324 
3325 	return cs;
3326 }
3327 
3328 static u32 *
3329 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3330 {
3331 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3332 
3333 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3334 		MI_SRM_LRM_GLOBAL_GTT |
3335 		MI_LRI_LRM_CS_MMIO;
3336 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3337 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3338 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3339 	*cs++ = 0;
3340 
3341 	*cs++ = MI_LOAD_REGISTER_REG |
3342 		MI_LRR_SOURCE_CS_MMIO |
3343 		MI_LRI_LRM_CS_MMIO;
3344 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3345 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3346 
3347 	return cs;
3348 }
3349 
3350 static u32 *
3351 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3352 {
3353 	cs = gen12_emit_timestamp_wa(ce, cs);
3354 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3355 	cs = gen12_emit_restore_scratch(ce, cs);
3356 
3357 	return cs;
3358 }
3359 
3360 static u32 *
3361 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3362 {
3363 	cs = gen12_emit_timestamp_wa(ce, cs);
3364 	cs = gen12_emit_restore_scratch(ce, cs);
3365 
3366 	return cs;
3367 }
3368 
3369 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3370 {
3371 	return PAGE_SIZE * ce->wa_bb_page;
3372 }
3373 
3374 static u32 *context_indirect_bb(const struct intel_context *ce)
3375 {
3376 	void *ptr;
3377 
3378 	GEM_BUG_ON(!ce->wa_bb_page);
3379 
3380 	ptr = ce->lrc_reg_state;
3381 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3382 	ptr += context_wa_bb_offset(ce);
3383 
3384 	return ptr;
3385 }
3386 
3387 static void
3388 setup_indirect_ctx_bb(const struct intel_context *ce,
3389 		      const struct intel_engine_cs *engine,
3390 		      u32 *(*emit)(const struct intel_context *, u32 *))
3391 {
3392 	u32 * const start = context_indirect_bb(ce);
3393 	u32 *cs;
3394 
3395 	cs = emit(ce, start);
3396 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3397 	while ((unsigned long)cs % CACHELINE_BYTES)
3398 		*cs++ = MI_NOOP;
3399 
3400 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3401 				    i915_ggtt_offset(ce->state) +
3402 				    context_wa_bb_offset(ce),
3403 				    (cs - start) * sizeof(*cs));
3404 }
3405 
3406 static void
3407 __execlists_update_reg_state(const struct intel_context *ce,
3408 			     const struct intel_engine_cs *engine,
3409 			     u32 head)
3410 {
3411 	struct intel_ring *ring = ce->ring;
3412 	u32 *regs = ce->lrc_reg_state;
3413 
3414 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3415 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3416 
3417 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3418 	regs[CTX_RING_HEAD] = head;
3419 	regs[CTX_RING_TAIL] = ring->tail;
3420 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3421 
3422 	/* RPCS */
3423 	if (engine->class == RENDER_CLASS) {
3424 		regs[CTX_R_PWR_CLK_STATE] =
3425 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3426 
3427 		i915_oa_init_reg_state(ce, engine);
3428 	}
3429 
3430 	if (ce->wa_bb_page) {
3431 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3432 
3433 		fn = gen12_emit_indirect_ctx_xcs;
3434 		if (ce->engine->class == RENDER_CLASS)
3435 			fn = gen12_emit_indirect_ctx_rcs;
3436 
3437 		/* Mutually exclusive wrt to global indirect bb */
3438 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3439 		setup_indirect_ctx_bb(ce, engine, fn);
3440 	}
3441 }
3442 
3443 static int
3444 __execlists_context_pin(struct intel_context *ce,
3445 			struct intel_engine_cs *engine)
3446 {
3447 	void *vaddr;
3448 
3449 	GEM_BUG_ON(!ce->state);
3450 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3451 
3452 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3453 					i915_coherent_map_type(engine->i915) |
3454 					I915_MAP_OVERRIDE);
3455 	if (IS_ERR(vaddr))
3456 		return PTR_ERR(vaddr);
3457 
3458 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3459 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3460 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3461 
3462 	return 0;
3463 }
3464 
3465 static int execlists_context_pin(struct intel_context *ce)
3466 {
3467 	return __execlists_context_pin(ce, ce->engine);
3468 }
3469 
3470 static int execlists_context_alloc(struct intel_context *ce)
3471 {
3472 	return __execlists_context_alloc(ce, ce->engine);
3473 }
3474 
3475 static void execlists_context_reset(struct intel_context *ce)
3476 {
3477 	CE_TRACE(ce, "reset\n");
3478 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3479 
3480 	intel_ring_reset(ce->ring, ce->ring->emit);
3481 
3482 	/* Scrub away the garbage */
3483 	execlists_init_reg_state(ce->lrc_reg_state,
3484 				 ce, ce->engine, ce->ring, true);
3485 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3486 
3487 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3488 }
3489 
3490 static const struct intel_context_ops execlists_context_ops = {
3491 	.alloc = execlists_context_alloc,
3492 
3493 	.pin = execlists_context_pin,
3494 	.unpin = execlists_context_unpin,
3495 
3496 	.enter = intel_context_enter_engine,
3497 	.exit = intel_context_exit_engine,
3498 
3499 	.reset = execlists_context_reset,
3500 	.destroy = execlists_context_destroy,
3501 };
3502 
3503 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3504 {
3505 	u32 *cs;
3506 
3507 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3508 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3509 		return 0;
3510 
3511 	cs = intel_ring_begin(rq, 6);
3512 	if (IS_ERR(cs))
3513 		return PTR_ERR(cs);
3514 
3515 	/*
3516 	 * Check if we have been preempted before we even get started.
3517 	 *
3518 	 * After this point i915_request_started() reports true, even if
3519 	 * we get preempted and so are no longer running.
3520 	 */
3521 	*cs++ = MI_ARB_CHECK;
3522 	*cs++ = MI_NOOP;
3523 
3524 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3525 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3526 	*cs++ = 0;
3527 	*cs++ = rq->fence.seqno - 1;
3528 
3529 	intel_ring_advance(rq, cs);
3530 
3531 	/* Record the updated position of the request's payload */
3532 	rq->infix = intel_ring_offset(rq, cs);
3533 
3534 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3535 
3536 	return 0;
3537 }
3538 
3539 static int emit_pdps(struct i915_request *rq)
3540 {
3541 	const struct intel_engine_cs * const engine = rq->engine;
3542 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3543 	int err, i;
3544 	u32 *cs;
3545 
3546 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3547 
3548 	/*
3549 	 * Beware ye of the dragons, this sequence is magic!
3550 	 *
3551 	 * Small changes to this sequence can cause anything from
3552 	 * GPU hangs to forcewake errors and machine lockups!
3553 	 */
3554 
3555 	/* Flush any residual operations from the context load */
3556 	err = engine->emit_flush(rq, EMIT_FLUSH);
3557 	if (err)
3558 		return err;
3559 
3560 	/* Magic required to prevent forcewake errors! */
3561 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3562 	if (err)
3563 		return err;
3564 
3565 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3566 	if (IS_ERR(cs))
3567 		return PTR_ERR(cs);
3568 
3569 	/* Ensure the LRIs have landed before we invalidate & continue */
3570 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3571 	for (i = GEN8_3LVL_PDPES; i--; ) {
3572 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3573 		u32 base = engine->mmio_base;
3574 
3575 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3576 		*cs++ = upper_32_bits(pd_daddr);
3577 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3578 		*cs++ = lower_32_bits(pd_daddr);
3579 	}
3580 	*cs++ = MI_NOOP;
3581 
3582 	intel_ring_advance(rq, cs);
3583 
3584 	return 0;
3585 }
3586 
3587 static int execlists_request_alloc(struct i915_request *request)
3588 {
3589 	int ret;
3590 
3591 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3592 
3593 	/*
3594 	 * Flush enough space to reduce the likelihood of waiting after
3595 	 * we start building the request - in which case we will just
3596 	 * have to repeat work.
3597 	 */
3598 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3599 
3600 	/*
3601 	 * Note that after this point, we have committed to using
3602 	 * this request as it is being used to both track the
3603 	 * state of engine initialisation and liveness of the
3604 	 * golden renderstate above. Think twice before you try
3605 	 * to cancel/unwind this request now.
3606 	 */
3607 
3608 	if (!i915_vm_is_4lvl(request->context->vm)) {
3609 		ret = emit_pdps(request);
3610 		if (ret)
3611 			return ret;
3612 	}
3613 
3614 	/* Unconditionally invalidate GPU caches and TLBs. */
3615 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3616 	if (ret)
3617 		return ret;
3618 
3619 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3620 	return 0;
3621 }
3622 
3623 /*
3624  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3625  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3626  * but there is a slight complication as this is applied in WA batch where the
3627  * values are only initialized once so we cannot take register value at the
3628  * beginning and reuse it further; hence we save its value to memory, upload a
3629  * constant value with bit21 set and then we restore it back with the saved value.
3630  * To simplify the WA, a constant value is formed by using the default value
3631  * of this register. This shouldn't be a problem because we are only modifying
3632  * it for a short period and this batch is non-preemptible. We can of course
3633  * use additional instructions that read the actual value of the register
3634  * at that time and set our bit of interest but it makes the WA complicated.
3635  *
3636  * This WA is also required for Gen9 so extracting as a function avoids
3637  * code duplication.
3638  */
3639 static u32 *
3640 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3641 {
3642 	/* NB no one else is allowed to scribble over scratch + 256! */
3643 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3644 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3645 	*batch++ = intel_gt_scratch_offset(engine->gt,
3646 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3647 	*batch++ = 0;
3648 
3649 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3650 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3651 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3652 
3653 	batch = gen8_emit_pipe_control(batch,
3654 				       PIPE_CONTROL_CS_STALL |
3655 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3656 				       0);
3657 
3658 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3659 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3660 	*batch++ = intel_gt_scratch_offset(engine->gt,
3661 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3662 	*batch++ = 0;
3663 
3664 	return batch;
3665 }
3666 
3667 /*
3668  * Typically we have only one indirect_ctx and per_ctx batch buffer which are
3669  * initialized at the beginning and shared across all contexts, but this field
3670  * helps us to have multiple batches at different offsets and select them based
3671  * on certain criteria. At the moment this batch always starts at the beginning
3672  * of the page and we don't yet have multiple wa_ctx batch buffers.
3673  *
3674  * The number of WAs applied is not known at the beginning, so we use this field
3675  * to return the number of DWORDs written.
3676  *
3677  * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds NOOPs
3678  * as padding to make it cacheline aligned.
3679  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and together the two
3680  * make a complete batch buffer.
3681  */
3682 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3683 {
3684 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3685 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3686 
3687 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3688 	if (IS_BROADWELL(engine->i915))
3689 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3690 
3691 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3692 	/* Actual scratch location is at 128 bytes offset */
3693 	batch = gen8_emit_pipe_control(batch,
3694 				       PIPE_CONTROL_FLUSH_L3 |
3695 				       PIPE_CONTROL_STORE_DATA_INDEX |
3696 				       PIPE_CONTROL_CS_STALL |
3697 				       PIPE_CONTROL_QW_WRITE,
3698 				       LRC_PPHWSP_SCRATCH_ADDR);
3699 
3700 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3701 
3702 	/* Pad to end of cacheline */
3703 	while ((unsigned long)batch % CACHELINE_BYTES)
3704 		*batch++ = MI_NOOP;
3705 
3706 	/*
3707 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3708 	 * execution depends on the length specified in terms of cache lines
3709 	 * in the register CTX_RCS_INDIRECT_CTX
3710 	 */
3711 
3712 	return batch;
3713 }
3714 
3715 struct lri {
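/*
 * Small table-driven helper: emit_lri() below packs up to 63
 * (register, value) pairs into a single MI_LOAD_REGISTER_IMM block,
 * as asserted by the GEM_BUG_ON on the count.
 */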
3716 	i915_reg_t reg;
3717 	u32 value;
3718 };
3719 
3720 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3721 {
3722 	GEM_BUG_ON(!count || count > 63);
3723 
3724 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3725 	do {
3726 		*batch++ = i915_mmio_reg_offset(lri->reg);
3727 		*batch++ = lri->value;
3728 	} while (lri++, --count);
3729 	*batch++ = MI_NOOP;
3730 
3731 	return batch;
3732 }
3733 
3734 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3735 {
3736 	static const struct lri lri[] = {
3737 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3738 		{
3739 			COMMON_SLICE_CHICKEN2,
3740 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3741 				       0),
3742 		},
3743 
3744 		/* BSpec: 11391 */
3745 		{
3746 			FF_SLICE_CHICKEN,
3747 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3748 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3749 		},
3750 
3751 		/* BSpec: 11299 */
3752 		{
3753 			_3D_CHICKEN3,
3754 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3755 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3756 		}
3757 	};
3758 
3759 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3760 
3761 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3762 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3763 
3764 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3765 	batch = gen8_emit_pipe_control(batch,
3766 				       PIPE_CONTROL_FLUSH_L3 |
3767 				       PIPE_CONTROL_STORE_DATA_INDEX |
3768 				       PIPE_CONTROL_CS_STALL |
3769 				       PIPE_CONTROL_QW_WRITE,
3770 				       LRC_PPHWSP_SCRATCH_ADDR);
3771 
3772 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3773 
3774 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3775 	if (HAS_POOLED_EU(engine->i915)) {
3776 		/*
3777 		 * EU pool configuration is set up along with the golden context
3778 		 * during context initialization. This value depends on the
3779 		 * device type (2x6 or 3x6) and needs to be updated based
3780 		 * on which subslice is disabled, especially for 2x6
3781 		 * devices. However, it is safe to load the default
3782 		 * configuration of a 3x6 device instead of masking off the
3783 		 * corresponding bits, because the HW ignores the bits of a
3784 		 * disabled subslice and drops down to the appropriate config.
3785 		 * Please see render_state_setup() in i915_gem_render_state.c
3786 		 * for the possible configurations; to avoid duplication they
3787 		 * are not shown here again.
3788 		 */
3789 		*batch++ = GEN9_MEDIA_POOL_STATE;
3790 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3791 		*batch++ = 0x00777000;
3792 		*batch++ = 0;
3793 		*batch++ = 0;
3794 		*batch++ = 0;
3795 	}
3796 
3797 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3798 
3799 	/* Pad to end of cacheline */
3800 	while ((unsigned long)batch % CACHELINE_BYTES)
3801 		*batch++ = MI_NOOP;
3802 
3803 	return batch;
3804 }
3805 
3806 static u32 *
3807 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3808 {
3809 	int i;
3810 
3811 	/*
3812 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3813 	 *
3814 	 * Ensure the engine is idle prior to programming a
3815 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3816 	 */
3817 	batch = gen8_emit_pipe_control(batch,
3818 				       PIPE_CONTROL_CS_STALL,
3819 				       0);
3820 	/*
3821 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3822 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3823 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3824 	 * confusing. Since gen8_emit_pipe_control() already advances the
3825 	 * batch by 6 dwords, we advance the other 10 here, completing a
3826 	 * cacheline. It's not clear if the workaround requires this padding
3827 	 * before other commands, or if it's just the regular padding we would
3828 	 * already have for the workaround bb, so leave it here for now.
3829 	 */
3830 	for (i = 0; i < 10; i++)
3831 		*batch++ = MI_NOOP;
3832 
3833 	/* Pad to end of cacheline */
3834 	while ((unsigned long)batch % CACHELINE_BYTES)
3835 		*batch++ = MI_NOOP;
3836 
3837 	return batch;
3838 }
3839 
3840 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3841 
3842 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3843 {
3844 	struct drm_i915_gem_object *obj;
3845 	struct i915_vma *vma;
3846 	int err;
3847 
3848 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3849 	if (IS_ERR(obj))
3850 		return PTR_ERR(obj);
3851 
3852 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3853 	if (IS_ERR(vma)) {
3854 		err = PTR_ERR(vma);
3855 		goto err;
3856 	}
3857 
3858 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3859 	if (err)
3860 		goto err;
3861 
3862 	engine->wa_ctx.vma = vma;
3863 	return 0;
3864 
3865 err:
3866 	i915_gem_object_put(obj);
3867 	return err;
3868 }
3869 
3870 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3871 {
3872 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3873 }
3874 
3875 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3876 
3877 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3878 {
3879 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3880 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3881 					    &wa_ctx->per_ctx };
3882 	wa_bb_func_t wa_bb_fn[2];
3883 	struct page *page;
3884 	void *batch, *batch_ptr;
3885 	unsigned int i;
3886 	int ret;
3887 
3888 	if (engine->class != RENDER_CLASS)
3889 		return 0;
3890 
3891 	switch (INTEL_GEN(engine->i915)) {
3892 	case 12:
3893 	case 11:
3894 		return 0;
3895 	case 10:
3896 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3897 		wa_bb_fn[1] = NULL;
3898 		break;
3899 	case 9:
3900 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3901 		wa_bb_fn[1] = NULL;
3902 		break;
3903 	case 8:
3904 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3905 		wa_bb_fn[1] = NULL;
3906 		break;
3907 	default:
3908 		MISSING_CASE(INTEL_GEN(engine->i915));
3909 		return 0;
3910 	}
3911 
3912 	ret = lrc_setup_wa_ctx(engine);
3913 	if (ret) {
3914 		drm_dbg(&engine->i915->drm,
3915 			"Failed to setup context WA page: %d\n", ret);
3916 		return ret;
3917 	}
3918 
3919 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3920 	batch = batch_ptr = kmap_atomic(page);
3921 
3922 	/*
3923 	 * Emit the two workaround batch buffers, recording the offset from the
3924 	 * start of the workaround batch buffer object for each and their
3925 	 * respective sizes.
3926 	 */
3927 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3928 		wa_bb[i]->offset = batch_ptr - batch;
3929 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3930 						  CACHELINE_BYTES))) {
3931 			ret = -EINVAL;
3932 			break;
3933 		}
3934 		if (wa_bb_fn[i])
3935 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3936 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3937 	}
3938 
3939 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3940 
3941 	kunmap_atomic(batch);
3942 	if (ret)
3943 		lrc_destroy_wa_ctx(engine);
3944 
3945 	return ret;
3946 }
3947 
3948 static void reset_csb_pointers(struct intel_engine_cs *engine)
3949 {
3950 	struct intel_engine_execlists * const execlists = &engine->execlists;
3951 	const unsigned int reset_value = execlists->csb_size - 1;
3952 
3953 	ring_set_paused(engine, 0);
3954 
3955 	/*
3956 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3957 	 * Bludgeon them with a mmio update to be sure.
3958 	 */
3959 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3960 		     0xffff << 16 | reset_value << 8 | reset_value);
3961 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3962 
3963 	/*
3964 	 * After a reset, the HW starts writing into CSB entry [0]. We
3965 	 * therefore have to set our HEAD pointer back one entry so that
3966 	 * the *first* entry we check is entry 0. To complicate this further,
3967 	 * as we don't wait for the first interrupt after reset, we have to
3968 	 * fake the HW write to point back to the last entry so that our
3969 	 * inline comparison of our cached head position against the last HW
3970 	 * write works even before the first interrupt.
3971 	 */
3972 	execlists->csb_head = reset_value;
3973 	WRITE_ONCE(*execlists->csb_write, reset_value);
3974 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3975 
3976 	invalidate_csb_entries(&execlists->csb_status[0],
3977 			       &execlists->csb_status[reset_value]);
3978 
3979 	/* Once more for luck and our trusty paranoia */
3980 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3981 		     0xffff << 16 | reset_value << 8 | reset_value);
3982 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3983 
3984 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
3985 }
3986 
3987 static void execlists_sanitize(struct intel_engine_cs *engine)
3988 {
3989 	/*
3990 	 * Poison residual state on resume, in case the suspend didn't!
3991 	 *
3992 	 * We have to assume that across suspend/resume (or other loss
3993 	 * of control) the contents of our pinned buffers have been
3994 	 * lost, replaced by garbage. Since this doesn't always happen,
3995 	 * let's poison such state so that we more quickly spot when
3996 	 * we falsely assume it has been preserved.
3997 	 */
3998 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3999 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4000 
4001 	reset_csb_pointers(engine);
4002 
4003 	/*
4004 	 * The kernel_context HWSP is stored in the status_page. As above,
4005 	 * that may be lost on resume/initialisation, and so we need to
4006 	 * reset the value in the HWSP.
4007 	 */
4008 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4009 
4010 	/* And scrub the dirty cachelines for the HWSP */
4011 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4012 }
4013 
4014 static void enable_error_interrupt(struct intel_engine_cs *engine)
4015 {
4016 	u32 status;
4017 
4018 	engine->execlists.error_interrupt = 0;
4019 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4020 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4021 
4022 	status = ENGINE_READ(engine, RING_ESR);
4023 	if (unlikely(status)) {
4024 		drm_err(&engine->i915->drm,
4025 			"engine '%s' resumed still in error: %08x\n",
4026 			engine->name, status);
4027 		__intel_gt_reset(engine->gt, engine->mask);
4028 	}
4029 
4030 	/*
4031 	 * On current gen8+, we have 2 signals to play with
4032 	 *
4033 	 * - I915_ERROR_INSTRUCTION (bit 0)
4034 	 *
4035 	 *    Generate an error if the command parser encounters an invalid
4036 	 *    instruction
4037 	 *
4038 	 *    This is a fatal error.
4039 	 *
4040 	 * - CP_PRIV (bit 2)
4041 	 *
4042 	 *    Generate an error on privilege violation (where the CP replaces
4043 	 *    the instruction with a no-op). This also fires for writes into
4044 	 *    read-only scratch pages.
4045 	 *
4046 	 *    This is a non-fatal error, parsing continues.
4047 	 *
4048 	 * * there are a few others defined for odd HW that we do not use
4049 	 *
4050 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4051 	 * error (as the HW is validating and suppressing the mistakes), we
4052 	 * only unmask the instruction error bit.
4053 	 */
4054 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4055 }
4056 
4057 static void enable_execlists(struct intel_engine_cs *engine)
4058 {
4059 	u32 mode;
4060 
4061 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4062 
4063 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4064 
4065 	if (INTEL_GEN(engine->i915) >= 11)
4066 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4067 	else
4068 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4069 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4070 
4071 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4072 
4073 	ENGINE_WRITE_FW(engine,
4074 			RING_HWS_PGA,
4075 			i915_ggtt_offset(engine->status_page.vma));
4076 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4077 
4078 	enable_error_interrupt(engine);
4079 
4080 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4081 }
4082 
4083 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4084 {
4085 	bool unexpected = false;
4086 
4087 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4088 		drm_dbg(&engine->i915->drm,
4089 			"STOP_RING still set in RING_MI_MODE\n");
4090 		unexpected = true;
4091 	}
4092 
4093 	return unexpected;
4094 }
4095 
4096 static int execlists_resume(struct intel_engine_cs *engine)
4097 {
4098 	intel_mocs_init_engine(engine);
4099 
4100 	intel_engine_reset_breadcrumbs(engine);
4101 
4102 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4103 		struct drm_printer p = drm_debug_printer(__func__);
4104 
4105 		intel_engine_dump(engine, &p, NULL);
4106 	}
4107 
4108 	enable_execlists(engine);
4109 
4110 	return 0;
4111 }
4112 
4113 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4114 {
4115 	struct intel_engine_execlists * const execlists = &engine->execlists;
4116 	unsigned long flags;
4117 
4118 	ENGINE_TRACE(engine, "depth<-%d\n",
4119 		     atomic_read(&execlists->tasklet.count));
4120 
4121 	/*
4122 	 * Prevent request submission to the hardware until we have
4123 	 * completed the reset in i915_gem_reset_finish(). If a request
4124 	 * is completed by one engine, it may then queue a request
4125 	 * to a second via its execlists->tasklet *just* as we are
4126 	 * calling engine->resume() and also writing the ELSP.
4127 	 * Turning off the execlists->tasklet until the reset is over
4128 	 * prevents the race.
4129 	 */
4130 	__tasklet_disable_sync_once(&execlists->tasklet);
4131 	GEM_BUG_ON(!reset_in_progress(execlists));
4132 
4133 	/* And flush any current direct submission. */
4134 	spin_lock_irqsave(&engine->active.lock, flags);
4135 	spin_unlock_irqrestore(&engine->active.lock, flags);
4136 
4137 	/*
4138 	 * We stop the engines, otherwise we might get a failed reset and a
4139 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
4140 	 * from a system hang if a batchbuffer is progressing when
4141 	 * the reset is issued, regardless of the READY_TO_RESET ack.
4142 	 * Thus assume it is best to stop the engines on all gens
4143 	 * where we have a gpu reset.
4144 	 *
4145 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4146 	 *
4147 	 * FIXME: Wa for more modern gens needs to be validated
4148 	 */
4149 	ring_set_paused(engine, 1);
4150 	intel_engine_stop_cs(engine);
4151 
4152 	engine->execlists.reset_ccid = active_ccid(engine);
4153 }
4154 
4155 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4156 {
4157 	int x;
4158 
4159 	x = lrc_ring_mi_mode(engine);
4160 	if (x != -1) {
4161 		regs[x + 1] &= ~STOP_RING;
4162 		regs[x + 1] |= STOP_RING << 16;
4163 	}
4164 }
4165 
4166 static void __execlists_reset_reg_state(const struct intel_context *ce,
4167 					const struct intel_engine_cs *engine)
4168 {
4169 	u32 *regs = ce->lrc_reg_state;
4170 
4171 	__reset_stop_ring(regs, engine);
4172 }
4173 
4174 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4175 {
4176 	struct intel_engine_execlists * const execlists = &engine->execlists;
4177 	struct intel_context *ce;
4178 	struct i915_request *rq;
4179 	u32 head;
4180 
4181 	mb(); /* paranoia: read the CSB pointers from after the reset */
4182 	clflush(execlists->csb_write);
4183 	mb();
4184 
4185 	process_csb(engine); /* drain preemption events */
4186 
4187 	/* Following the reset, we need to reload the CSB read/write pointers */
4188 	reset_csb_pointers(engine);
4189 
4190 	/*
4191 	 * Save the currently executing context; even if we completed
4192 	 * its request, it was still running at the time of the
4193 	 * reset and will have been clobbered.
4194 	 */
4195 	rq = active_context(engine, engine->execlists.reset_ccid);
4196 	if (!rq)
4197 		goto unwind;
4198 
4199 	ce = rq->context;
4200 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4201 
4202 	if (i915_request_completed(rq)) {
4203 		/* Idle context; tidy up the ring so we can restart afresh */
4204 		head = intel_ring_wrap(ce->ring, rq->tail);
4205 		goto out_replay;
4206 	}
4207 
4208 	/* We still have requests in-flight; the engine should be active */
4209 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4210 
4211 	/* Context has requests still in-flight; it should not be idle! */
4212 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4213 
4214 	rq = active_request(ce->timeline, rq);
4215 	head = intel_ring_wrap(ce->ring, rq->head);
4216 	GEM_BUG_ON(head == ce->ring->tail);
4217 
4218 	/*
4219 	 * If this request hasn't started yet, e.g. it is waiting on a
4220 	 * semaphore, we need to avoid skipping the request or else we
4221 	 * break the signaling chain. However, if the context is corrupt
4222 	 * the request will not restart and we will be stuck with a wedged
4223 	 * device. It is quite often the case that if we issue a reset
4224 	 * while the GPU is loading the context image, that the context
4225 	 * image becomes corrupt.
4226 	 *
4227 	 * Otherwise, if we have not started yet, the request should replay
4228 	 * perfectly and we do not need to flag the result as being erroneous.
4229 	 */
4230 	if (!i915_request_started(rq))
4231 		goto out_replay;
4232 
4233 	/*
4234 	 * If the request was innocent, we leave the request in the ELSP
4235 	 * and will try to replay it on restarting. The context image may
4236 	 * have been corrupted by the reset, in which case we may have
4237 	 * to service a new GPU hang, but more likely we can continue on
4238 	 * without impact.
4239 	 *
4240 	 * If the request was guilty, we presume the context is corrupt
4241 	 * and have to at least restore the RING register in the context
4242 	 * image back to the expected values to skip over the guilty request.
4243 	 */
4244 	__i915_request_reset(rq, stalled);
4245 
4246 	/*
4247 	 * We want a simple context + ring to execute the breadcrumb update.
4248 	 * We cannot rely on the context being intact across the GPU hang,
4249 	 * so clear it and rebuild just what we need for the breadcrumb.
4250 	 * All pending requests for this context will be zapped, and any
4251 	 * future request will be after userspace has had the opportunity
4252 	 * to recreate its own state.
4253 	 */
4254 out_replay:
4255 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4256 		     head, ce->ring->tail);
4257 	__execlists_reset_reg_state(ce, engine);
4258 	__execlists_update_reg_state(ce, engine, head);
4259 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4260 
4261 unwind:
4262 	/* Push back any incomplete requests for replay after the reset. */
4263 	cancel_port_requests(execlists);
4264 	__unwind_incomplete_requests(engine);
4265 }
4266 
4267 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4268 {
4269 	unsigned long flags;
4270 
4271 	ENGINE_TRACE(engine, "\n");
4272 
4273 	spin_lock_irqsave(&engine->active.lock, flags);
4274 
4275 	__execlists_reset(engine, stalled);
4276 
4277 	spin_unlock_irqrestore(&engine->active.lock, flags);
4278 }
4279 
4280 static void nop_submission_tasklet(unsigned long data)
4281 {
4282 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4283 
4284 	/* The driver is wedged; don't process any more events. */
4285 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4286 }
4287 
4288 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4289 {
4290 	struct intel_engine_execlists * const execlists = &engine->execlists;
4291 	struct i915_request *rq, *rn;
4292 	struct rb_node *rb;
4293 	unsigned long flags;
4294 
4295 	ENGINE_TRACE(engine, "\n");
4296 
4297 	/*
4298 	 * Before we call engine->cancel_requests(), we should have exclusive
4299 	 * access to the submission state. This is arranged for us by the
4300 	 * caller disabling the interrupt generation, the tasklet and other
4301 	 * threads that may then access the same state, giving us a free hand
4302 	 * to reset state. However, we still need to let lockdep be aware that
4303 	 * we know this state may be accessed in hardirq context, so we
4304 	 * disable the irq around this manipulation and we want to keep
4305 	 * the spinlock focused on its duties and not accidentally conflate
4306 	 * coverage to the submission's irq state. (Similarly, although we
4307 	 * shouldn't need to disable irq around the manipulation of the
4308 	 * submission's irq state, we also wish to remind ourselves that
4309 	 * it is irq state.)
4310 	 */
4311 	spin_lock_irqsave(&engine->active.lock, flags);
4312 
4313 	__execlists_reset(engine, true);
4314 
4315 	/* Mark all executing requests as skipped. */
4316 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4317 		mark_eio(rq);
4318 
4319 	/* Flush the queued requests to the timeline list (for retiring). */
4320 	while ((rb = rb_first_cached(&execlists->queue))) {
4321 		struct i915_priolist *p = to_priolist(rb);
4322 		int i;
4323 
4324 		priolist_for_each_request_consume(rq, rn, p, i) {
4325 			mark_eio(rq);
4326 			__i915_request_submit(rq);
4327 		}
4328 
4329 		rb_erase_cached(&p->node, &execlists->queue);
4330 		i915_priolist_free(p);
4331 	}
4332 
4333 	/* On-hold requests will be flushed to timeline upon their release */
4334 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4335 		mark_eio(rq);
4336 
4337 	/* Cancel all attached virtual engines */
4338 	while ((rb = rb_first_cached(&execlists->virtual))) {
4339 		struct virtual_engine *ve =
4340 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4341 
4342 		rb_erase_cached(rb, &execlists->virtual);
4343 		RB_CLEAR_NODE(rb);
4344 
4345 		spin_lock(&ve->base.active.lock);
4346 		rq = fetch_and_zero(&ve->request);
4347 		if (rq) {
4348 			mark_eio(rq);
4349 
4350 			rq->engine = engine;
4351 			__i915_request_submit(rq);
4352 			i915_request_put(rq);
4353 
4354 			ve->base.execlists.queue_priority_hint = INT_MIN;
4355 		}
4356 		spin_unlock(&ve->base.active.lock);
4357 	}
4358 
4359 	/* Remaining _unready_ requests will be nop'ed when submitted */
4360 
4361 	execlists->queue_priority_hint = INT_MIN;
4362 	execlists->queue = RB_ROOT_CACHED;
4363 
4364 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4365 	execlists->tasklet.func = nop_submission_tasklet;
4366 
4367 	spin_unlock_irqrestore(&engine->active.lock, flags);
4368 }
4369 
4370 static void execlists_reset_finish(struct intel_engine_cs *engine)
4371 {
4372 	struct intel_engine_execlists * const execlists = &engine->execlists;
4373 
4374 	/*
4375 	 * After a GPU reset, we may have requests to replay. Do so now while
4376 	 * we still have the forcewake to be sure that the GPU is not allowed
4377 	 * to sleep before we restart and reload a context.
4378 	 */
4379 	GEM_BUG_ON(!reset_in_progress(execlists));
4380 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4381 		execlists->tasklet.func(execlists->tasklet.data);
4382 
4383 	if (__tasklet_enable(&execlists->tasklet))
4384 		/* And kick in case we missed a new request submission. */
4385 		tasklet_hi_schedule(&execlists->tasklet);
4386 	ENGINE_TRACE(engine, "depth->%d\n",
4387 		     atomic_read(&execlists->tasklet.count));
4388 }
4389 
4390 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4391 				    u64 offset, u32 len,
4392 				    const unsigned int flags)
4393 {
4394 	u32 *cs;
4395 
4396 	cs = intel_ring_begin(rq, 4);
4397 	if (IS_ERR(cs))
4398 		return PTR_ERR(cs);
4399 
4400 	/*
4401 	 * WaDisableCtxRestoreArbitration:bdw,chv
4402 	 *
4403 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4404 	 * particular on all the gens that do not need the w/a at all!); if we
4405 	 * took care to make sure that on every switch into this context
4406 	 * (both ordinary and for preemption) arbitration was enabled,
4407 	 * we would be fine.  However, for gen8 there is another w/a that
4408 	 * requires us to not preempt inside GPGPU execution, so we keep
4409 	 * arbitration disabled for gen8 batches. Arbitration will be
4410 	 * re-enabled before we close the request
4411 	 * (engine->emit_fini_breadcrumb).
4412 	 */
4413 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4414 
4415 	/* FIXME(BDW+): Address space and security selectors. */
4416 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4417 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4418 	*cs++ = lower_32_bits(offset);
4419 	*cs++ = upper_32_bits(offset);
4420 
4421 	intel_ring_advance(rq, cs);
4422 
4423 	return 0;
4424 }
4425 
4426 static int gen8_emit_bb_start(struct i915_request *rq,
4427 			      u64 offset, u32 len,
4428 			      const unsigned int flags)
4429 {
4430 	u32 *cs;
4431 
4432 	cs = intel_ring_begin(rq, 6);
4433 	if (IS_ERR(cs))
4434 		return PTR_ERR(cs);
4435 
4436 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4437 
4438 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4439 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4440 	*cs++ = lower_32_bits(offset);
4441 	*cs++ = upper_32_bits(offset);
4442 
4443 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4444 	*cs++ = MI_NOOP;
4445 
4446 	intel_ring_advance(rq, cs);
4447 
4448 	return 0;
4449 }
4450 
4451 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4452 {
4453 	ENGINE_WRITE(engine, RING_IMR,
4454 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4455 	ENGINE_POSTING_READ(engine, RING_IMR);
4456 }
4457 
4458 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4459 {
4460 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4461 }
4462 
4463 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4464 {
4465 	u32 cmd, *cs;
4466 
4467 	cs = intel_ring_begin(request, 4);
4468 	if (IS_ERR(cs))
4469 		return PTR_ERR(cs);
4470 
4471 	cmd = MI_FLUSH_DW + 1;
4472 
4473 	/* We always require a command barrier so that subsequent
4474 	 * commands, such as breadcrumb interrupts, are strictly ordered
4475 	 * wrt the contents of the write cache being flushed to memory
4476 	 * (and thus being coherent from the CPU).
4477 	 */
4478 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4479 
4480 	if (mode & EMIT_INVALIDATE) {
4481 		cmd |= MI_INVALIDATE_TLB;
4482 		if (request->engine->class == VIDEO_DECODE_CLASS)
4483 			cmd |= MI_INVALIDATE_BSD;
4484 	}
4485 
4486 	*cs++ = cmd;
4487 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4488 	*cs++ = 0; /* upper addr */
4489 	*cs++ = 0; /* value */
4490 	intel_ring_advance(request, cs);
4491 
4492 	return 0;
4493 }
4494 
4495 static int gen8_emit_flush_render(struct i915_request *request,
4496 				  u32 mode)
4497 {
4498 	bool vf_flush_wa = false, dc_flush_wa = false;
4499 	u32 *cs, flags = 0;
4500 	int len;
4501 
4502 	flags |= PIPE_CONTROL_CS_STALL;
4503 
4504 	if (mode & EMIT_FLUSH) {
4505 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4506 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4507 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4508 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4509 	}
4510 
4511 	if (mode & EMIT_INVALIDATE) {
4512 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4513 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4514 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4515 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4516 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4517 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4518 		flags |= PIPE_CONTROL_QW_WRITE;
4519 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4520 
4521 		/*
4522 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4523 		 * pipe control.
4524 		 */
4525 		if (IS_GEN(request->engine->i915, 9))
4526 			vf_flush_wa = true;
4527 
4528 		/* WaForGAMHang:kbl */
4529 		if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0))
4530 			dc_flush_wa = true;
4531 	}
4532 
4533 	len = 6;
4534 
4535 	if (vf_flush_wa)
4536 		len += 6;
4537 
4538 	if (dc_flush_wa)
4539 		len += 12;
4540 
4541 	cs = intel_ring_begin(request, len);
4542 	if (IS_ERR(cs))
4543 		return PTR_ERR(cs);
4544 
4545 	if (vf_flush_wa)
4546 		cs = gen8_emit_pipe_control(cs, 0, 0);
4547 
4548 	if (dc_flush_wa)
4549 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4550 					    0);
4551 
4552 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4553 
4554 	if (dc_flush_wa)
4555 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4556 
4557 	intel_ring_advance(request, cs);
4558 
4559 	return 0;
4560 }
4561 
4562 static int gen11_emit_flush_render(struct i915_request *request,
4563 				   u32 mode)
4564 {
4565 	if (mode & EMIT_FLUSH) {
4566 		u32 *cs;
4567 		u32 flags = 0;
4568 
4569 		flags |= PIPE_CONTROL_CS_STALL;
4570 
4571 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4572 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4573 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4574 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4575 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4576 		flags |= PIPE_CONTROL_QW_WRITE;
4577 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4578 
4579 		cs = intel_ring_begin(request, 6);
4580 		if (IS_ERR(cs))
4581 			return PTR_ERR(cs);
4582 
4583 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4584 		intel_ring_advance(request, cs);
4585 	}
4586 
4587 	if (mode & EMIT_INVALIDATE) {
4588 		u32 *cs;
4589 		u32 flags = 0;
4590 
4591 		flags |= PIPE_CONTROL_CS_STALL;
4592 
4593 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4594 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4595 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4596 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4597 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4598 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4599 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4600 		flags |= PIPE_CONTROL_QW_WRITE;
4601 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4602 
4603 		cs = intel_ring_begin(request, 6);
4604 		if (IS_ERR(cs))
4605 			return PTR_ERR(cs);
4606 
4607 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4608 		intel_ring_advance(request, cs);
4609 	}
4610 
4611 	return 0;
4612 }
4613 
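/*
 * On gen12, MI_ARB_CHECK doubles as the pre-parser (pre-fetch) toggle:
 * in the usual masked-bit style, bit 8 is the write enable for the
 * pre-fetch-disable value carried in bit 0. Callers bracket the span
 * they want to protect from pre-fetching with this dword.
 */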
4614 static u32 preparser_disable(bool state)
4615 {
4616 	return MI_ARB_CHECK | 1 << 8 | state;
4617 }
4618 
4619 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4620 {
4621 	static const i915_reg_t vd[] = {
4622 		GEN12_VD0_AUX_NV,
4623 		GEN12_VD1_AUX_NV,
4624 		GEN12_VD2_AUX_NV,
4625 		GEN12_VD3_AUX_NV,
4626 	};
4627 
4628 	static const i915_reg_t ve[] = {
4629 		GEN12_VE0_AUX_NV,
4630 		GEN12_VE1_AUX_NV,
4631 	};
4632 
4633 	if (engine->class == VIDEO_DECODE_CLASS)
4634 		return vd[engine->instance];
4635 
4636 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4637 		return ve[engine->instance];
4638 
4639 	GEM_BUG_ON("unknown aux_inv_reg\n");
4640 
4641 	return INVALID_MMIO_REG;
4642 }
4643 
4644 static u32 *
4645 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4646 {
4647 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4648 	*cs++ = i915_mmio_reg_offset(inv_reg);
4649 	*cs++ = AUX_INV;
4650 	*cs++ = MI_NOOP;
4651 
4652 	return cs;
4653 }
4654 
4655 static int gen12_emit_flush_render(struct i915_request *request,
4656 				   u32 mode)
4657 {
4658 	if (mode & EMIT_FLUSH) {
4659 		u32 flags = 0;
4660 		u32 *cs;
4661 
4662 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4663 		flags |= PIPE_CONTROL_FLUSH_L3;
4664 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4665 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4666 		/* Wa_1409600907:tgl */
4667 		flags |= PIPE_CONTROL_DEPTH_STALL;
4668 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4669 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4670 
4671 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4672 		flags |= PIPE_CONTROL_QW_WRITE;
4673 
4674 		flags |= PIPE_CONTROL_CS_STALL;
4675 
4676 		cs = intel_ring_begin(request, 6);
4677 		if (IS_ERR(cs))
4678 			return PTR_ERR(cs);
4679 
4680 		cs = gen12_emit_pipe_control(cs,
4681 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4682 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4683 		intel_ring_advance(request, cs);
4684 	}
4685 
4686 	if (mode & EMIT_INVALIDATE) {
4687 		u32 flags = 0;
4688 		u32 *cs;
4689 
4690 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4691 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4692 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4693 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4694 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4695 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4696 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4697 
4698 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4699 		flags |= PIPE_CONTROL_QW_WRITE;
4700 
4701 		flags |= PIPE_CONTROL_CS_STALL;
4702 
4703 		cs = intel_ring_begin(request, 8 + 4);
4704 		if (IS_ERR(cs))
4705 			return PTR_ERR(cs);
4706 
4707 		/*
4708 		 * Prevent the pre-parser from skipping past the TLB
4709 		 * invalidate and loading a stale page for the batch
4710 		 * buffer / request payload.
4711 		 */
4712 		*cs++ = preparser_disable(true);
4713 
4714 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4715 
4716 		/* hsdes: 1809175790 */
4717 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4718 
4719 		*cs++ = preparser_disable(false);
4720 		intel_ring_advance(request, cs);
4721 	}
4722 
4723 	return 0;
4724 }
4725 
4726 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4727 {
4728 	intel_engine_mask_t aux_inv = 0;
4729 	u32 cmd, *cs;
4730 
4731 	if (mode & EMIT_INVALIDATE)
4732 		aux_inv = request->engine->mask & ~BIT(BCS0);
4733 
4734 	cs = intel_ring_begin(request,
4735 			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4736 	if (IS_ERR(cs))
4737 		return PTR_ERR(cs);
4738 
4739 	cmd = MI_FLUSH_DW + 1;
4740 
4741 	/* We always require a command barrier so that subsequent
4742 	 * commands, such as breadcrumb interrupts, are strictly ordered
4743 	 * wrt the contents of the write cache being flushed to memory
4744 	 * (and thus being coherent from the CPU).
4745 	 */
4746 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4747 
4748 	if (mode & EMIT_INVALIDATE) {
4749 		cmd |= MI_INVALIDATE_TLB;
4750 		if (request->engine->class == VIDEO_DECODE_CLASS)
4751 			cmd |= MI_INVALIDATE_BSD;
4752 	}
4753 
4754 	*cs++ = cmd;
4755 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4756 	*cs++ = 0; /* upper addr */
4757 	*cs++ = 0; /* value */
4758 
4759 	if (aux_inv) { /* hsdes: 1809175790 */
4760 		struct intel_engine_cs *engine;
4761 		unsigned int tmp;
4762 
4763 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4764 		for_each_engine_masked(engine, request->engine->gt,
4765 				       aux_inv, tmp) {
4766 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4767 			*cs++ = AUX_INV;
4768 		}
4769 		*cs++ = MI_NOOP;
4770 	}
4771 	intel_ring_advance(request, cs);
4772 
4773 	return 0;
4774 }
4775 
4776 static void assert_request_valid(struct i915_request *rq)
4777 {
4778 	struct intel_ring *ring __maybe_unused = rq->ring;
4779 
4780 	/* Can we unwind this request without appearing to go forwards? */
4781 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4782 }
4783 
4784 /*
4785  * Reserve space for 2 NOOPs at the end of each request to be
4786  * used as a workaround for not being allowed to do lite
4787  * restore with HEAD==TAIL (WaIdleLiteRestore).
4788  */
4789 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4790 {
4791 	/* Ensure there's always at least one preemption point per-request. */
4792 	*cs++ = MI_ARB_CHECK;
4793 	*cs++ = MI_NOOP;
4794 	request->wa_tail = intel_ring_offset(request, cs);
4795 
4796 	/* Check that entire request is less than half the ring */
4797 	assert_request_valid(request);
4798 
4799 	return cs;
4800 }
4801 
4802 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4803 {
4804 	*cs++ = MI_SEMAPHORE_WAIT |
4805 		MI_SEMAPHORE_GLOBAL_GTT |
4806 		MI_SEMAPHORE_POLL |
4807 		MI_SEMAPHORE_SAD_EQ_SDD;
4808 	*cs++ = 0;
4809 	*cs++ = intel_hws_preempt_address(request->engine);
4810 	*cs++ = 0;
4811 
4812 	return cs;
4813 }
4814 
4815 static __always_inline u32*
4816 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4817 {
4818 	*cs++ = MI_USER_INTERRUPT;
4819 
4820 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4821 	if (intel_engine_has_semaphores(request->engine))
4822 		cs = emit_preempt_busywait(request, cs);
4823 
4824 	request->tail = intel_ring_offset(request, cs);
4825 	assert_ring_tail_valid(request->ring, request->tail);
4826 
4827 	return gen8_emit_wa_tail(request, cs);
4828 }
4829 
4830 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4831 {
4832 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4833 
4834 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4835 }
4836 
4837 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4838 {
4839 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4840 }
4841 
4842 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4843 {
4844 	cs = gen8_emit_pipe_control(cs,
4845 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4846 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4847 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4848 				    0);
4849 
4850 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4851 	cs = gen8_emit_ggtt_write_rcs(cs,
4852 				      request->fence.seqno,
4853 				      i915_request_active_timeline(request)->hwsp_offset,
4854 				      PIPE_CONTROL_FLUSH_ENABLE |
4855 				      PIPE_CONTROL_CS_STALL);
4856 
4857 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4858 }
4859 
4860 static u32 *
4861 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4862 {
4863 	cs = gen8_emit_ggtt_write_rcs(cs,
4864 				      request->fence.seqno,
4865 				      i915_request_active_timeline(request)->hwsp_offset,
4866 				      PIPE_CONTROL_CS_STALL |
4867 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4868 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4869 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4870 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4871 				      PIPE_CONTROL_FLUSH_ENABLE);
4872 
4873 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4874 }
4875 
4876 /*
4877  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4878  * flush and will continue pre-fetching the instructions after it before the
4879  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4880  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4881  * of the next request before the memory has been flushed, we're guaranteed that
4882  * we won't access the batch itself too early.
4883  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4884  * so, if the current request is modifying an instruction in the next request on
4885  * the same intel_context, we might pre-fetch and then execute the pre-update
4886  * instruction. To avoid this, the users of self-modifying code should either
4887  * disable the parser around the code emitting the memory writes, via a new flag
4888  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4889  * the in-kernel use-cases we've opted to use a separate context, see
4890  * reloc_gpu() as an example.
4891  * All the above applies only to the instructions themselves. Non-inline data
4892  * used by the instructions is not pre-fetched.
4893  */
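
/*
 * A minimal, purely illustrative sketch of the first option above:
 * bracketing a self-modifying write with the gen12 pre-parser toggle,
 * reusing the preparser_disable() encoding emitted by
 * gen12_emit_flush_render(). The function and its parameters are
 * hypothetical and not part of the driver.
 */
static u32 *example_emit_guarded_store(u32 *cs, u32 ggtt_offset, u32 value)
{
	/* Stop the pre-parser from fetching ahead of the store */
	*cs++ = preparser_disable(true);

	/* GGTT store of the replacement dword (layout as in gen8_emit_init_breadcrumb()) */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = ggtt_offset;
	*cs++ = 0;
	*cs++ = value;

	/* Allow pre-fetching to resume once the store is posted */
	*cs++ = preparser_disable(false);

	return cs;
}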
4894 
4895 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4896 {
4897 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4898 		MI_SEMAPHORE_GLOBAL_GTT |
4899 		MI_SEMAPHORE_POLL |
4900 		MI_SEMAPHORE_SAD_EQ_SDD;
4901 	*cs++ = 0;
4902 	*cs++ = intel_hws_preempt_address(request->engine);
4903 	*cs++ = 0;
4904 	*cs++ = 0;
4905 	*cs++ = MI_NOOP;
4906 
4907 	return cs;
4908 }
4909 
4910 static __always_inline u32*
4911 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4912 {
4913 	*cs++ = MI_USER_INTERRUPT;
4914 
4915 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4916 	if (intel_engine_has_semaphores(request->engine))
4917 		cs = gen12_emit_preempt_busywait(request, cs);
4918 
4919 	request->tail = intel_ring_offset(request, cs);
4920 	assert_ring_tail_valid(request->ring, request->tail);
4921 
4922 	return gen8_emit_wa_tail(request, cs);
4923 }
4924 
4925 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4926 {
4927 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4928 }
4929 
4930 static u32 *
4931 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4932 {
4933 	cs = gen12_emit_ggtt_write_rcs(cs,
4934 				       request->fence.seqno,
4935 				       i915_request_active_timeline(request)->hwsp_offset,
4936 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4937 				       PIPE_CONTROL_CS_STALL |
4938 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4939 				       PIPE_CONTROL_FLUSH_L3 |
4940 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4941 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4942 				       /* Wa_1409600907:tgl */
4943 				       PIPE_CONTROL_DEPTH_STALL |
4944 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4945 				       PIPE_CONTROL_FLUSH_ENABLE);
4946 
4947 	return gen12_emit_fini_breadcrumb_tail(request, cs);
4948 }
4949 
4950 static void execlists_park(struct intel_engine_cs *engine)
4951 {
4952 	cancel_timer(&engine->execlists.timer);
4953 	cancel_timer(&engine->execlists.preempt);
4954 }
4955 
4956 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4957 {
4958 	engine->submit_request = execlists_submit_request;
4959 	engine->schedule = i915_schedule;
4960 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4961 
4962 	engine->reset.prepare = execlists_reset_prepare;
4963 	engine->reset.rewind = execlists_reset_rewind;
4964 	engine->reset.cancel = execlists_reset_cancel;
4965 	engine->reset.finish = execlists_reset_finish;
4966 
4967 	engine->park = execlists_park;
4968 	engine->unpark = NULL;
4969 
4970 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4971 	if (!intel_vgpu_active(engine->i915)) {
4972 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4973 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4974 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4975 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4976 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4977 		}
4978 	}
4979 
4980 	if (INTEL_GEN(engine->i915) >= 12)
4981 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4982 
4983 	if (intel_engine_has_preemption(engine))
4984 		engine->emit_bb_start = gen8_emit_bb_start;
4985 	else
4986 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4987 }
4988 
4989 static void execlists_shutdown(struct intel_engine_cs *engine)
4990 {
4991 	/* Synchronise with residual timers and any softirq they raise */
4992 	del_timer_sync(&engine->execlists.timer);
4993 	del_timer_sync(&engine->execlists.preempt);
4994 	tasklet_kill(&engine->execlists.tasklet);
4995 }
4996 
4997 static void execlists_release(struct intel_engine_cs *engine)
4998 {
4999 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5000 
5001 	execlists_shutdown(engine);
5002 
5003 	intel_engine_cleanup_common(engine);
5004 	lrc_destroy_wa_ctx(engine);
5005 }
5006 
5007 static void
5008 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5009 {
5010 	/* Default vfuncs which can be overriden by each engine. */
5011 	/* Default vfuncs which can be overridden by each engine. */
5012 	engine->resume = execlists_resume;
5013 
5014 	engine->cops = &execlists_context_ops;
5015 	engine->request_alloc = execlists_request_alloc;
5016 
5017 	engine->emit_flush = gen8_emit_flush;
5018 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5019 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5020 	if (INTEL_GEN(engine->i915) >= 12) {
5021 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5022 		engine->emit_flush = gen12_emit_flush;
5023 	}
5024 	engine->set_default_submission = intel_execlists_set_default_submission;
5025 
5026 	if (INTEL_GEN(engine->i915) < 11) {
5027 		engine->irq_enable = gen8_logical_ring_enable_irq;
5028 		engine->irq_disable = gen8_logical_ring_disable_irq;
5029 	} else {
5030 		/*
5031 		 * TODO: On Gen11 interrupt masks need to be clear
5032 		 * to allow C6 entry. Keep interrupts enabled
5033 		 * and take the hit of generating extra interrupts
5034 		 * until a more refined solution exists.
5035 		 */
5036 	}
5037 }
5038 
5039 static inline void
5040 logical_ring_default_irqs(struct intel_engine_cs *engine)
5041 {
5042 	unsigned int shift = 0;
5043 
5044 	if (INTEL_GEN(engine->i915) < 11) {
5045 		const u8 irq_shifts[] = {
5046 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5047 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5048 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5049 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5050 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5051 		};
5052 
5053 		shift = irq_shifts[engine->id];
5054 	}
5055 
5056 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5057 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5058 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5059 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5060 }
5061 
5062 static void rcs_submission_override(struct intel_engine_cs *engine)
5063 {
5064 	switch (INTEL_GEN(engine->i915)) {
5065 	case 12:
5066 		engine->emit_flush = gen12_emit_flush_render;
5067 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5068 		break;
5069 	case 11:
5070 		engine->emit_flush = gen11_emit_flush_render;
5071 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5072 		break;
5073 	default:
5074 		engine->emit_flush = gen8_emit_flush_render;
5075 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5076 		break;
5077 	}
5078 }
5079 
5080 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5081 {
5082 	struct intel_engine_execlists * const execlists = &engine->execlists;
5083 	struct drm_i915_private *i915 = engine->i915;
5084 	struct intel_uncore *uncore = engine->uncore;
5085 	u32 base = engine->mmio_base;
5086 
5087 	tasklet_init(&engine->execlists.tasklet,
5088 		     execlists_submission_tasklet, (unsigned long)engine);
5089 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5090 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5091 
5092 	logical_ring_default_vfuncs(engine);
5093 	logical_ring_default_irqs(engine);
5094 
5095 	if (engine->class == RENDER_CLASS)
5096 		rcs_submission_override(engine);
5097 
5098 	if (intel_init_workaround_bb(engine))
5099 		/*
5100 		 * We continue even if we fail to initialize the WA batch
5101 		 * because we only expect rare glitches, nothing
5102 		 * critical that would prevent us from using the GPU.
5103 		 */
5104 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5105 
5106 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5107 		execlists->submit_reg = uncore->regs +
5108 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5109 		execlists->ctrl_reg = uncore->regs +
5110 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5111 	} else {
5112 		execlists->submit_reg = uncore->regs +
5113 			i915_mmio_reg_offset(RING_ELSP(base));
5114 	}
5115 
5116 	execlists->csb_status =
5117 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5118 
5119 	execlists->csb_write =
5120 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5121 
5122 	if (INTEL_GEN(i915) < 11)
5123 		execlists->csb_size = GEN8_CSB_ENTRIES;
5124 	else
5125 		execlists->csb_size = GEN11_CSB_ENTRIES;
5126 
5127 	if (INTEL_GEN(engine->i915) >= 11) {
5128 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5129 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5130 	}
5131 
5132 	/* Finally, take ownership and responsibility for cleanup! */
5133 	engine->sanitize = execlists_sanitize;
5134 	engine->release = execlists_release;
5135 
5136 	return 0;
5137 }
5138 
5139 static void init_common_reg_state(u32 * const regs,
5140 				  const struct intel_engine_cs *engine,
5141 				  const struct intel_ring *ring,
5142 				  bool inhibit)
5143 {
5144 	u32 ctl;
5145 
5146 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5147 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5148 	if (inhibit)
5149 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5150 	if (INTEL_GEN(engine->i915) < 11)
5151 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5152 					   CTX_CTRL_RS_CTX_ENABLE);
5153 	regs[CTX_CONTEXT_CONTROL] = ctl;
5154 
5155 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5156 	regs[CTX_TIMESTAMP] = 0;
5157 }
5158 
5159 static void init_wa_bb_reg_state(u32 * const regs,
5160 				 const struct intel_engine_cs *engine)
5161 {
5162 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5163 
5164 	if (wa_ctx->per_ctx.size) {
5165 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5166 
5167 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5168 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5169 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5170 	}
5171 
5172 	if (wa_ctx->indirect_ctx.size) {
5173 		lrc_ring_setup_indirect_ctx(regs, engine,
5174 					    i915_ggtt_offset(wa_ctx->vma) +
5175 					    wa_ctx->indirect_ctx.offset,
5176 					    wa_ctx->indirect_ctx.size);
5177 	}
5178 }
5179 
5180 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5181 {
5182 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5183 		/* 64b PPGTT (48bit canonical)
5184 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
5185 		 * other PDP Descriptors are ignored.
5186 		 */
5187 		ASSIGN_CTX_PML4(ppgtt, regs);
5188 	} else {
5189 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5190 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5191 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5192 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5193 	}
5194 }
5195 
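/*
 * A context bound to the GGTT uses the single aliasing PPGTT; otherwise the
 * context carries its own full PPGTT.
 */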
5196 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5197 {
5198 	if (i915_is_ggtt(vm))
5199 		return i915_vm_to_ggtt(vm)->alias;
5200 	else
5201 		return i915_vm_to_ppgtt(vm);
5202 }
5203 
5204 static void execlists_init_reg_state(u32 *regs,
5205 				     const struct intel_context *ce,
5206 				     const struct intel_engine_cs *engine,
5207 				     const struct intel_ring *ring,
5208 				     bool inhibit)
5209 {
5210 	/*
5211 	 * A context is actually a big batch buffer with several
5212 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5213 	 * values we are setting here are only for the first context restore:
5214 	 * on a subsequent save, the GPU will recreate this batch buffer with new
5215 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5216 	 * we are not initializing here).
5217 	 *
5218 	 * This must be kept consistent with virtual_update_register_offsets().
5219 	 */
5220 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5221 
5222 	init_common_reg_state(regs, engine, ring, inhibit);
5223 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5224 
5225 	init_wa_bb_reg_state(regs, engine);
5226 
5227 	__reset_stop_ring(regs, engine);
5228 }
5229 
5230 static int
5231 populate_lr_context(struct intel_context *ce,
5232 		    struct drm_i915_gem_object *ctx_obj,
5233 		    struct intel_engine_cs *engine,
5234 		    struct intel_ring *ring)
5235 {
5236 	bool inhibit = true;
5237 	void *vaddr;
5238 
5239 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5240 	if (IS_ERR(vaddr)) {
5241 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5242 		return PTR_ERR(vaddr);
5243 	}
5244 
5245 	set_redzone(vaddr, engine);
5246 
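	/*
	 * Start from the engine's default context image when one is available;
	 * otherwise keep 'inhibit' set so the very first restore does not load
	 * uninitialised register state.
	 */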
5247 	if (engine->default_state) {
5248 		shmem_read(engine->default_state, 0,
5249 			   vaddr, engine->context_size);
5250 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5251 		inhibit = false;
5252 	}
5253 
5254 	/* Clear the ppHWSP (inc. per-context counters) */
5255 	memset(vaddr, 0, PAGE_SIZE);
5256 
5257 	/*
5258 	 * The second page of the context object contains the register state
5259 	 * that must be set up prior to the first execution.
5260 	 */
5261 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5262 				 ce, engine, ring, inhibit);
5263 
5264 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5265 	i915_gem_object_unpin_map(ctx_obj);
5266 	return 0;
5267 }
5268 
5269 static int __execlists_context_alloc(struct intel_context *ce,
5270 				     struct intel_engine_cs *engine)
5271 {
5272 	struct drm_i915_gem_object *ctx_obj;
5273 	struct intel_ring *ring;
5274 	struct i915_vma *vma;
5275 	u32 context_size;
5276 	int ret;
5277 
5278 	GEM_BUG_ON(ce->state);
5279 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5280 
5281 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5282 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5283 
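	/*
	 * Gen12 appends an extra page to the context image for a per-context
	 * workaround batch buffer; note which page it landed on.
	 */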
5284 	if (INTEL_GEN(engine->i915) == 12) {
5285 		ce->wa_bb_page = context_size / PAGE_SIZE;
5286 		context_size += PAGE_SIZE;
5287 	}
5288 
5289 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5290 	if (IS_ERR(ctx_obj))
5291 		return PTR_ERR(ctx_obj);
5292 
5293 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5294 	if (IS_ERR(vma)) {
5295 		ret = PTR_ERR(vma);
5296 		goto error_deref_obj;
5297 	}
5298 
5299 	if (!ce->timeline) {
5300 		struct intel_timeline *tl;
5301 		struct i915_vma *hwsp;
5302 
5303 		/*
5304 		 * Use the static global HWSP for the kernel context, and
5305 		 * a dynamically allocated cacheline for everyone else.
5306 		 */
5307 		hwsp = NULL;
5308 		if (unlikely(intel_context_is_barrier(ce)))
5309 			hwsp = engine->status_page.vma;
5310 
5311 		tl = intel_timeline_create(engine->gt, hwsp);
5312 		if (IS_ERR(tl)) {
5313 			ret = PTR_ERR(tl);
5314 			goto error_deref_obj;
5315 		}
5316 
5317 		ce->timeline = tl;
5318 	}
5319 
5320 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5321 	if (IS_ERR(ring)) {
5322 		ret = PTR_ERR(ring);
5323 		goto error_deref_obj;
5324 	}
5325 
5326 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5327 	if (ret) {
5328 		drm_dbg(&engine->i915->drm,
5329 			"Failed to populate LRC: %d\n", ret);
5330 		goto error_ring_free;
5331 	}
5332 
5333 	ce->ring = ring;
5334 	ce->state = vma;
5335 
5336 	return 0;
5337 
5338 error_ring_free:
5339 	intel_ring_put(ring);
5340 error_deref_obj:
5341 	i915_gem_object_put(ctx_obj);
5342 	return ret;
5343 }
5344 
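/*
 * The virtual engine holds at most one pending request, parked on its
 * (otherwise unused) default priolist while it waits to be taken by one of
 * the siblings.
 */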
5345 static struct list_head *virtual_queue(struct virtual_engine *ve)
5346 {
5347 	return &ve->base.execlists.default_priolist.requests[0];
5348 }
5349 
5350 static void virtual_context_destroy(struct kref *kref)
5351 {
5352 	struct virtual_engine *ve =
5353 		container_of(kref, typeof(*ve), context.ref);
5354 	unsigned int n;
5355 
5356 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5357 	GEM_BUG_ON(ve->request);
5358 	GEM_BUG_ON(ve->context.inflight);
5359 
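	/*
	 * Unhook this virtual engine's nodes from every sibling's tree of
	 * pending virtual requests before the memory is released.
	 */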
5360 	for (n = 0; n < ve->num_siblings; n++) {
5361 		struct intel_engine_cs *sibling = ve->siblings[n];
5362 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5363 		unsigned long flags;
5364 
5365 		if (RB_EMPTY_NODE(node))
5366 			continue;
5367 
5368 		spin_lock_irqsave(&sibling->active.lock, flags);
5369 
5370 		/* Detachment is lazily performed in the execlists tasklet */
5371 		if (!RB_EMPTY_NODE(node))
5372 			rb_erase_cached(node, &sibling->execlists.virtual);
5373 
5374 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5375 	}
5376 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5377 
5378 	if (ve->context.state)
5379 		__execlists_context_fini(&ve->context);
5380 	intel_context_fini(&ve->context);
5381 
5382 	intel_engine_free_request_pool(&ve->base);
5383 
5384 	kfree(ve->bonds);
5385 	kfree(ve);
5386 }
5387 
5388 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5389 {
5390 	int swp;
5391 
5392 	/*
5393 	 * Pick a random sibling on starting to help spread the load around.
5394 	 *
5395 	 * New contexts are typically created with exactly the same order
5396 	 * of siblings, and often started in batches. Due to the way we iterate
5397 	 * the array of siblings when submitting requests, sibling[0] is
5398 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5399 	 * randomised across the system, we also help spread the load by the
5400 	 * first engine we inspect being different each time.
5401 	 *
5402 	 * NB: this does not force us to execute on this engine; it will just
5403 	 * typically be the first one we inspect for submission.
5404 	 */
5405 	swp = prandom_u32_max(ve->num_siblings);
5406 	if (!swp)
5407 		return;
5408 
5409 	swap(ve->siblings[swp], ve->siblings[0]);
5410 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
5411 		virtual_update_register_offsets(ve->context.lrc_reg_state,
5412 						ve->siblings[0]);
5413 }
5414 
5415 static int virtual_context_alloc(struct intel_context *ce)
5416 {
5417 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5418 
5419 	return __execlists_context_alloc(ce, ve->siblings[0]);
5420 }
5421 
5422 static int virtual_context_pin(struct intel_context *ce)
5423 {
5424 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5425 	int err;
5426 
5427 	/* Note: we must use a real engine class for setting up reg state */
5428 	err = __execlists_context_pin(ce, ve->siblings[0]);
5429 	if (err)
5430 		return err;
5431 
5432 	virtual_engine_initial_hint(ve);
5433 	return 0;
5434 }
5435 
5436 static void virtual_context_enter(struct intel_context *ce)
5437 {
5438 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5439 	unsigned int n;
5440 
5441 	for (n = 0; n < ve->num_siblings; n++)
5442 		intel_engine_pm_get(ve->siblings[n]);
5443 
5444 	intel_timeline_enter(ce->timeline);
5445 }
5446 
5447 static void virtual_context_exit(struct intel_context *ce)
5448 {
5449 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5450 	unsigned int n;
5451 
5452 	intel_timeline_exit(ce->timeline);
5453 
5454 	for (n = 0; n < ve->num_siblings; n++)
5455 		intel_engine_pm_put(ve->siblings[n]);
5456 }
5457 
5458 static const struct intel_context_ops virtual_context_ops = {
5459 	.alloc = virtual_context_alloc,
5460 
5461 	.pin = virtual_context_pin,
5462 	.unpin = execlists_context_unpin,
5463 
5464 	.enter = virtual_context_enter,
5465 	.exit = virtual_context_exit,
5466 
5467 	.destroy = virtual_context_destroy,
5468 };
5469 
5470 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5471 {
5472 	struct i915_request *rq;
5473 	intel_engine_mask_t mask;
5474 
5475 	rq = READ_ONCE(ve->request);
5476 	if (!rq)
5477 		return 0;
5478 
5479 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5480 	mask = rq->execution_mask;
5481 	if (unlikely(!mask)) {
5482 		/* Invalid selection: flag the error and fall back to the first sibling */
5483 		i915_request_set_error_once(rq, -ENODEV);
5484 		mask = ve->siblings[0]->mask;
5485 	}
5486 
5487 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5488 		     rq->fence.context, rq->fence.seqno,
5489 		     mask, ve->base.execlists.queue_priority_hint);
5490 
5491 	return mask;
5492 }
5493 
5494 static void virtual_submission_tasklet(unsigned long data)
5495 {
5496 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5497 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5498 	intel_engine_mask_t mask;
5499 	unsigned int n;
5500 
5501 	rcu_read_lock();
5502 	mask = virtual_submission_mask(ve);
5503 	rcu_read_unlock();
5504 	if (unlikely(!mask))
5505 		return;
5506 
5507 	local_irq_disable();
5508 	for (n = 0; n < ve->num_siblings; n++) {
5509 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5510 		struct ve_node * const node = &ve->nodes[sibling->id];
5511 		struct rb_node **parent, *rb;
5512 		bool first;
5513 
5514 		if (!READ_ONCE(ve->request))
5515 			break; /* already handled by a sibling's tasklet */
5516 
5517 		if (unlikely(!(mask & sibling->mask))) {
5518 			if (!RB_EMPTY_NODE(&node->rb)) {
5519 				spin_lock(&sibling->active.lock);
5520 				rb_erase_cached(&node->rb,
5521 						&sibling->execlists.virtual);
5522 				RB_CLEAR_NODE(&node->rb);
5523 				spin_unlock(&sibling->active.lock);
5524 			}
5525 			continue;
5526 		}
5527 
5528 		spin_lock(&sibling->active.lock);
5529 
5530 		if (!RB_EMPTY_NODE(&node->rb)) {
5531 			/*
5532 			 * Cheat and avoid rebalancing the tree if we can
5533 			 * reuse this node in situ.
5534 			 */
5535 			first = rb_first_cached(&sibling->execlists.virtual) ==
5536 				&node->rb;
5537 			if (prio == node->prio || (prio > node->prio && first))
5538 				goto submit_engine;
5539 
5540 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5541 		}
5542 
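		/*
		 * Insert (or reinsert) our node into this sibling's tree of
		 * waiting virtual engines, keyed by priority with the highest
		 * priority kept leftmost.
		 */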
5543 		rb = NULL;
5544 		first = true;
5545 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5546 		while (*parent) {
5547 			struct ve_node *other;
5548 
5549 			rb = *parent;
5550 			other = rb_entry(rb, typeof(*other), rb);
5551 			if (prio > other->prio) {
5552 				parent = &rb->rb_left;
5553 			} else {
5554 				parent = &rb->rb_right;
5555 				first = false;
5556 			}
5557 		}
5558 
5559 		rb_link_node(&node->rb, rb, parent);
5560 		rb_insert_color_cached(&node->rb,
5561 				       &sibling->execlists.virtual,
5562 				       first);
5563 
5564 submit_engine:
5565 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5566 		node->prio = prio;
5567 		if (first && prio > sibling->execlists.queue_priority_hint)
5568 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5569 
5570 		spin_unlock(&sibling->active.lock);
5571 	}
5572 	local_irq_enable();
5573 }
5574 
5575 static void virtual_submit_request(struct i915_request *rq)
5576 {
5577 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5578 	struct i915_request *old;
5579 	unsigned long flags;
5580 
5581 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5582 		     rq->fence.context,
5583 		     rq->fence.seqno);
5584 
5585 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5586 
5587 	spin_lock_irqsave(&ve->base.active.lock, flags);
5588 
5589 	old = ve->request;
5590 	if (old) { /* background completion event from preempt-to-busy */
5591 		GEM_BUG_ON(!i915_request_completed(old));
5592 		__i915_request_submit(old);
5593 		i915_request_put(old);
5594 	}
5595 
5596 	if (i915_request_completed(rq)) {
5597 		__i915_request_submit(rq);
5598 
5599 		ve->base.execlists.queue_priority_hint = INT_MIN;
5600 		ve->request = NULL;
5601 	} else {
5602 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5603 		ve->request = i915_request_get(rq);
5604 
5605 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5606 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5607 
5608 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5609 	}
5610 
5611 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5612 }
5613 
5614 static struct ve_bond *
5615 virtual_find_bond(struct virtual_engine *ve,
5616 		  const struct intel_engine_cs *master)
5617 {
5618 	int i;
5619 
5620 	for (i = 0; i < ve->num_bonds; i++) {
5621 		if (ve->bonds[i].master == master)
5622 			return &ve->bonds[i];
5623 	}
5624 
5625 	return NULL;
5626 }
5627 
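/*
 * A bond couples a bonded request to the engine its master request runs on:
 * look up the bond for the master's engine and narrow the execution mask of
 * the bonded request accordingly.
 */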
5628 static void
5629 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5630 {
5631 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5632 	intel_engine_mask_t allowed, exec;
5633 	struct ve_bond *bond;
5634 
5635 	allowed = ~to_request(signal)->engine->mask;
5636 
5637 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5638 	if (bond)
5639 		allowed &= bond->sibling_mask;
5640 
5641 	/* Restrict the bonded request to run on only the available engines */
5642 	exec = READ_ONCE(rq->execution_mask);
5643 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5644 		;
5645 
5646 	/* Prevent the master from being re-run on the bonded engines */
5647 	to_request(signal)->execution_mask &= ~allowed;
5648 }
5649 
5650 struct intel_context *
5651 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5652 			       unsigned int count)
5653 {
5654 	struct virtual_engine *ve;
5655 	unsigned int n;
5656 	int err;
5657 
5658 	if (count == 0)
5659 		return ERR_PTR(-EINVAL);
5660 
5661 	if (count == 1)
5662 		return intel_context_create(siblings[0]);
5663 
5664 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5665 	if (!ve)
5666 		return ERR_PTR(-ENOMEM);
5667 
5668 	ve->base.i915 = siblings[0]->i915;
5669 	ve->base.gt = siblings[0]->gt;
5670 	ve->base.uncore = siblings[0]->uncore;
5671 	ve->base.id = -1;
5672 
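	/*
	 * Start out classless: the engine class and uabi class are inherited
	 * from the first sibling below, while the instance remains marked as
	 * invalid/virtual.
	 */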
5673 	ve->base.class = OTHER_CLASS;
5674 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5675 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5676 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5677 
5678 	/*
5679 	 * The decision on whether to submit a request using semaphores
5680 	 * depends on the saturated state of the engine. We only compute
5681 	 * this during HW submission of the request, and we need this
5682 	 * state to be applied globally to all requests being submitted
5683 	 * to this engine. Virtual engines encompass more than one physical
5684 	 * engine and so we cannot accurately tell in advance if one of those
5685 	 * engines is already saturated and so cannot afford to use a semaphore
5686 	 * and be pessimized in priority for doing so -- if we are the only
5687 	 * context using semaphores after all other clients have stopped, we
5688 	 * will be starved on the saturated system. Such a global switch for
5689 	 * semaphores is less than ideal, but alas is the current compromise.
5690 	 */
5691 	ve->base.saturated = ALL_ENGINES;
5692 
5693 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5694 
5695 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5696 	intel_engine_init_breadcrumbs(&ve->base);
5697 	intel_engine_init_execlists(&ve->base);
5698 
5699 	ve->base.cops = &virtual_context_ops;
5700 	ve->base.request_alloc = execlists_request_alloc;
5701 
5702 	ve->base.schedule = i915_schedule;
5703 	ve->base.submit_request = virtual_submit_request;
5704 	ve->base.bond_execute = virtual_bond_execute;
5705 
5706 	INIT_LIST_HEAD(virtual_queue(ve));
5707 	ve->base.execlists.queue_priority_hint = INT_MIN;
5708 	tasklet_init(&ve->base.execlists.tasklet,
5709 		     virtual_submission_tasklet,
5710 		     (unsigned long)ve);
5711 
5712 	intel_context_init(&ve->context, &ve->base);
5713 
5714 	for (n = 0; n < count; n++) {
5715 		struct intel_engine_cs *sibling = siblings[n];
5716 
5717 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5718 		if (sibling->mask & ve->base.mask) {
5719 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5720 				  sibling->name);
5721 			err = -EINVAL;
5722 			goto err_put;
5723 		}
5724 
5725 		/*
5726 		 * The virtual engine implementation is tightly coupled to
5727 		 * the execlists backend -- we push requests directly
5728 		 * into a tree inside each physical engine. We could support
5729 		 * layering if we handle cloning of the requests and
5730 		 * submitting a copy into each backend.
5731 		 */
5732 		if (sibling->execlists.tasklet.func !=
5733 		    execlists_submission_tasklet) {
5734 			err = -ENODEV;
5735 			goto err_put;
5736 		}
5737 
5738 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5739 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5740 
5741 		ve->siblings[ve->num_siblings++] = sibling;
5742 		ve->base.mask |= sibling->mask;
5743 
5744 		/*
5745 		 * All physical engines must be compatible for their emission
5746 		 * functions (as we build the instructions during request
5747 		 * construction and do not alter them before submission
5748 		 * on the physical engine). We use the engine class as a guide
5749 		 * here, although that could be refined.
5750 		 */
5751 		if (ve->base.class != OTHER_CLASS) {
5752 			if (ve->base.class != sibling->class) {
5753 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5754 					  sibling->class, ve->base.class);
5755 				err = -EINVAL;
5756 				goto err_put;
5757 			}
5758 			continue;
5759 		}
5760 
5761 		ve->base.class = sibling->class;
5762 		ve->base.uabi_class = sibling->uabi_class;
5763 		snprintf(ve->base.name, sizeof(ve->base.name),
5764 			 "v%dx%d", ve->base.class, count);
5765 		ve->base.context_size = sibling->context_size;
5766 
5767 		ve->base.emit_bb_start = sibling->emit_bb_start;
5768 		ve->base.emit_flush = sibling->emit_flush;
5769 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5770 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5771 		ve->base.emit_fini_breadcrumb_dw =
5772 			sibling->emit_fini_breadcrumb_dw;
5773 
5774 		ve->base.flags = sibling->flags;
5775 	}
5776 
5777 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5778 
5779 	return &ve->context;
5780 
5781 err_put:
5782 	intel_context_put(&ve->context);
5783 	return ERR_PTR(err);
5784 }
5785 
5786 struct intel_context *
5787 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5788 {
5789 	struct virtual_engine *se = to_virtual_engine(src);
5790 	struct intel_context *dst;
5791 
5792 	dst = intel_execlists_create_virtual(se->siblings,
5793 					     se->num_siblings);
5794 	if (IS_ERR(dst))
5795 		return dst;
5796 
5797 	if (se->num_bonds) {
5798 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5799 
5800 		de->bonds = kmemdup(se->bonds,
5801 				    sizeof(*se->bonds) * se->num_bonds,
5802 				    GFP_KERNEL);
5803 		if (!de->bonds) {
5804 			intel_context_put(dst);
5805 			return ERR_PTR(-ENOMEM);
5806 		}
5807 
5808 		de->num_bonds = se->num_bonds;
5809 	}
5810 
5811 	return dst;
5812 }
5813 
5814 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5815 				     const struct intel_engine_cs *master,
5816 				     const struct intel_engine_cs *sibling)
5817 {
5818 	struct virtual_engine *ve = to_virtual_engine(engine);
5819 	struct ve_bond *bond;
5820 	int n;
5821 
5822 	/* Sanity check the sibling is part of the virtual engine */
5823 	for (n = 0; n < ve->num_siblings; n++)
5824 		if (sibling == ve->siblings[n])
5825 			break;
5826 	if (n == ve->num_siblings)
5827 		return -EINVAL;
5828 
5829 	bond = virtual_find_bond(ve, master);
5830 	if (bond) {
5831 		bond->sibling_mask |= sibling->mask;
5832 		return 0;
5833 	}
5834 
5835 	bond = krealloc(ve->bonds,
5836 			sizeof(*bond) * (ve->num_bonds + 1),
5837 			GFP_KERNEL);
5838 	if (!bond)
5839 		return -ENOMEM;
5840 
5841 	bond[ve->num_bonds].master = master;
5842 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5843 
5844 	ve->bonds = bond;
5845 	ve->num_bonds++;
5846 
5847 	return 0;
5848 }
5849 
5850 struct intel_engine_cs *
5851 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5852 				 unsigned int sibling)
5853 {
5854 	struct virtual_engine *ve = to_virtual_engine(engine);
5855 
5856 	if (sibling >= ve->num_siblings)
5857 		return NULL;
5858 
5859 	return ve->siblings[sibling];
5860 }
5861 
5862 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5863 				   struct drm_printer *m,
5864 				   void (*show_request)(struct drm_printer *m,
5865 							struct i915_request *rq,
5866 							const char *prefix),
5867 				   unsigned int max)
5868 {
5869 	const struct intel_engine_execlists *execlists = &engine->execlists;
5870 	struct i915_request *rq, *last;
5871 	unsigned long flags;
5872 	unsigned int count;
5873 	struct rb_node *rb;
5874 
5875 	spin_lock_irqsave(&engine->active.lock, flags);
5876 
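	/*
	 * Dump up to 'max' requests from each bucket: executing (E), queued
	 * by priority (Q) and pending on virtual engines (V), eliding the
	 * middle of a list when it is too long.
	 */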
5877 	last = NULL;
5878 	count = 0;
5879 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5880 		if (count++ < max - 1)
5881 			show_request(m, rq, "\t\tE ");
5882 		else
5883 			last = rq;
5884 	}
5885 	if (last) {
5886 		if (count > max) {
5887 			drm_printf(m,
5888 				   "\t\t...skipping %d executing requests...\n",
5889 				   count - max);
5890 		}
5891 		show_request(m, last, "\t\tE ");
5892 	}
5893 
5894 	if (execlists->switch_priority_hint != INT_MIN)
5895 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5896 			   READ_ONCE(execlists->switch_priority_hint));
5897 	if (execlists->queue_priority_hint != INT_MIN)
5898 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5899 			   READ_ONCE(execlists->queue_priority_hint));
5900 
5901 	last = NULL;
5902 	count = 0;
5903 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5904 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5905 		int i;
5906 
5907 		priolist_for_each_request(rq, p, i) {
5908 			if (count++ < max - 1)
5909 				show_request(m, rq, "\t\tQ ");
5910 			else
5911 				last = rq;
5912 		}
5913 	}
5914 	if (last) {
5915 		if (count > max) {
5916 			drm_printf(m,
5917 				   "\t\t...skipping %d queued requests...\n",
5918 				   count - max);
5919 		}
5920 		show_request(m, last, "\t\tQ ");
5921 	}
5922 
5923 	last = NULL;
5924 	count = 0;
5925 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5926 		struct virtual_engine *ve =
5927 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5928 		struct i915_request *rq = READ_ONCE(ve->request);
5929 
5930 		if (rq) {
5931 			if (count++ < max - 1)
5932 				show_request(m, rq, "\t\tV ");
5933 			else
5934 				last = rq;
5935 		}
5936 	}
5937 	if (last) {
5938 		if (count > max) {
5939 			drm_printf(m,
5940 				   "\t\t...skipping %d virtual requests...\n",
5941 				   count - max);
5942 		}
5943 		show_request(m, last, "\t\tV ");
5944 	}
5945 
5946 	spin_unlock_irqrestore(&engine->active.lock, flags);
5947 }
5948 
5949 void intel_lr_context_reset(struct intel_engine_cs *engine,
5950 			    struct intel_context *ce,
5951 			    u32 head,
5952 			    bool scrub)
5953 {
5954 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5955 
5956 	/*
5957 	 * We want a simple context + ring to execute the breadcrumb update.
5958 	 * We cannot rely on the context being intact across the GPU hang,
5959 	 * so clear it and rebuild just what we need for the breadcrumb.
5960 	 * All pending requests for this context will be zapped, and any
5961 	 * future request will be submitted only after userspace has had the
5962 	 * opportunity to recreate its own state.
5963 	 */
5964 	if (scrub)
5965 		restore_default_state(ce, engine);
5966 
5967 	/* Rerun the request; its payload has been neutered (if guilty). */
5968 	__execlists_update_reg_state(ce, engine, head);
5969 }
5970 
5971 bool
5972 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5973 {
5974 	return engine->set_default_submission ==
5975 	       intel_execlists_set_default_submission;
5976 }
5977 
5978 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5979 #include "selftest_lrc.c"
5980 #endif
5981