xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision e8ec0493)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers are per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
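/*
 * A minimal sketch of the pairing rule described above (illustration only:
 * queue_head(), queue_next(), same_context() and elsp_write() are
 * hypothetical helpers; the real logic lives in execlists_dequeue() later
 * in this file). Requests for the same context are coalesced into the first
 * slot, so a context never appears twice in one execution list:
 *
 *	first = queue_head(queue);
 *	second = NULL;
 *	while ((rq = queue_next(queue)) && same_context(rq, first))
 *		first = rq;
 *	if (rq)
 *		second = rq;
 *	elsp_write(engine, first, second);
 *
 * where second may legitimately be NULL if only one context is runnable.
 */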
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150 #include "shmem_utils.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
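/*
 * Worked example of the GEN12 CSB helpers above (numbers are illustrative):
 * for a csb dword of 0x00008000, FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, ...)
 * extracts bits 25:15 = 0x1, which differs from GEN12_IDLE_CTX_ID (0x7FF),
 * so GEN12_CSB_CTX_VALID() reports that the slot carries a real context.
 * An idle slot carries 0x7FF in that field and is reported as not valid.
 */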
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 
181 struct virtual_engine {
182 	struct intel_engine_cs base;
183 	struct intel_context context;
184 
185 	/*
186 	 * We allow only a single request through the virtual engine at a time
187 	 * (each request in the timeline waits for the completion fence of
188 	 * the previous before being submitted). By restricting ourselves to
189 	 * only submitting a single request, each request is placed on to a
190 	 * physical engine to maximise load spreading (by virtue of the late greedy
191 	 * scheduling -- each real engine takes the next available request
192 	 * upon idling).
193 	 */
194 	struct i915_request *request;
195 
196 	/*
197 	 * We keep a rbtree of available virtual engines inside each physical
198 	 * engine, sorted by priority. Here we preallocate the nodes we need
199 	 * for the virtual engine, indexed by physical_engine->id.
200 	 */
201 	struct ve_node {
202 		struct rb_node rb;
203 		int prio;
204 	} nodes[I915_NUM_ENGINES];
205 
206 	/*
207 	 * Keep track of bonded pairs -- restrictions upon our selection
208 	 * of physical engines any particular request may be submitted to.
209 	 * If we receive a submit-fence from a master engine, we will only
210 	 * use one of sibling_mask physical engines.
211 	 */
212 	struct ve_bond {
213 		const struct intel_engine_cs *master;
214 		intel_engine_mask_t sibling_mask;
215 	} *bonds;
216 	unsigned int num_bonds;
217 
218 	/* And finally, which physical engines this virtual engine maps onto. */
219 	unsigned int num_siblings;
220 	struct intel_engine_cs *siblings[];
221 };
222 
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 	return container_of(engine, struct virtual_engine, base);
227 }
228 
229 static int __execlists_context_alloc(struct intel_context *ce,
230 				     struct intel_engine_cs *engine);
231 
232 static void execlists_init_reg_state(u32 *reg_state,
233 				     const struct intel_context *ce,
234 				     const struct intel_engine_cs *engine,
235 				     const struct intel_ring *ring,
236 				     bool close);
237 static void
238 __execlists_update_reg_state(const struct intel_context *ce,
239 			     const struct intel_engine_cs *engine,
240 			     u32 head);
241 
242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
243 {
244 	if (INTEL_GEN(engine->i915) >= 12)
245 		return 0x60;
246 	else if (INTEL_GEN(engine->i915) >= 9)
247 		return 0x54;
248 	else if (engine->class == RENDER_CLASS)
249 		return 0x58;
250 	else
251 		return -1;
252 }
253 
254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
255 {
256 	if (INTEL_GEN(engine->i915) >= 12)
257 		return 0x74;
258 	else if (INTEL_GEN(engine->i915) >= 9)
259 		return 0x68;
260 	else if (engine->class == RENDER_CLASS)
261 		return 0xd8;
262 	else
263 		return -1;
264 }
265 
266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
267 {
268 	if (INTEL_GEN(engine->i915) >= 12)
269 		return 0x12;
270 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
271 		return 0x18;
272 	else
273 		return -1;
274 }
275 
276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
277 {
278 	int x;
279 
280 	x = lrc_ring_wa_bb_per_ctx(engine);
281 	if (x < 0)
282 		return x;
283 
284 	return x + 2;
285 }
286 
287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
288 {
289 	int x;
290 
291 	x = lrc_ring_indirect_ptr(engine);
292 	if (x < 0)
293 		return x;
294 
295 	return x + 2;
296 }
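/*
 * These three lookups describe consecutive register/value pairs in the
 * context image: on Gen12, for example, lrc_ring_wa_bb_per_ctx() returns
 * 0x12, lrc_ring_indirect_ptr() 0x14 and lrc_ring_indirect_offset() 0x16.
 * Callers add 1 to index the value dword of the pair they want to program.
 */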
297 
298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
299 {
300 	if (engine->class != RENDER_CLASS)
301 		return -1;
302 
303 	if (INTEL_GEN(engine->i915) >= 12)
304 		return 0xb6;
305 	else if (INTEL_GEN(engine->i915) >= 11)
306 		return 0xaa;
307 	else
308 		return -1;
309 }
310 
311 static u32
312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
313 {
314 	switch (INTEL_GEN(engine->i915)) {
315 	default:
316 		MISSING_CASE(INTEL_GEN(engine->i915));
317 		fallthrough;
318 	case 12:
319 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
320 	case 11:
321 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 	case 10:
323 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 	case 9:
325 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 	case 8:
327 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 	}
329 }
330 
331 static void
332 lrc_ring_setup_indirect_ctx(u32 *regs,
333 			    const struct intel_engine_cs *engine,
334 			    u32 ctx_bb_ggtt_addr,
335 			    u32 size)
336 {
337 	GEM_BUG_ON(!size);
338 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
339 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
340 	regs[lrc_ring_indirect_ptr(engine) + 1] =
341 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
342 
343 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
344 	regs[lrc_ring_indirect_offset(engine) + 1] =
345 		lrc_ring_indirect_offset_default(engine) << 6;
346 }
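/*
 * Example of the packing above (illustrative values): a 64-byte indirect
 * context batch at GGTT address 0x1000 yields an indirect-ctx pointer of
 * 0x1000 | (64 / CACHELINE_BYTES) = 0x1001, i.e. the cacheline-aligned
 * address with the size in cachelines folded into the low bits, while the
 * companion offset register takes the per-gen default shifted left by 6.
 */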
347 
348 static u32 intel_context_get_runtime(const struct intel_context *ce)
349 {
350 	/*
351 	 * We can use either ppHWSP[16] which is recorded before the context
352 	 * switch (and so excludes the cost of context switches) or use the
353 	 * value from the context image itself, which is saved/restored earlier
354 	 * and so includes the cost of the save.
355 	 */
356 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
357 }
358 
359 static void mark_eio(struct i915_request *rq)
360 {
361 	if (i915_request_completed(rq))
362 		return;
363 
364 	GEM_BUG_ON(i915_request_signaled(rq));
365 
366 	i915_request_set_error_once(rq, -EIO);
367 	i915_request_mark_complete(rq);
368 }
369 
370 static struct i915_request *
371 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
372 {
373 	struct i915_request *active = rq;
374 
375 	rcu_read_lock();
376 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
377 		if (i915_request_completed(rq))
378 			break;
379 
380 		active = rq;
381 	}
382 	rcu_read_unlock();
383 
384 	return active;
385 }
386 
387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
388 {
389 	return (i915_ggtt_offset(engine->status_page.vma) +
390 		I915_GEM_HWS_PREEMPT_ADDR);
391 }
392 
393 static inline void
394 ring_set_paused(const struct intel_engine_cs *engine, int state)
395 {
396 	/*
397 	 * We inspect HWS_PREEMPT with a semaphore inside
398 	 * engine->emit_fini_breadcrumb. If the dword is true,
399 	 * the ring is paused as the semaphore will busywait
400 	 * until the dword is false.
401 	 */
402 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
403 	if (state)
404 		wmb();
405 }
406 
407 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
408 {
409 	return rb_entry(rb, struct i915_priolist, node);
410 }
411 
412 static inline int rq_prio(const struct i915_request *rq)
413 {
414 	return READ_ONCE(rq->sched.attr.priority);
415 }
416 
417 static int effective_prio(const struct i915_request *rq)
418 {
419 	int prio = rq_prio(rq);
420 
421 	/*
422 	 * If this request is special and must not be interrupted at any
423 	 * cost, so be it. Note we are only checking the most recent request
424 	 * in the context and so may be masking an earlier vip request. It
425 	 * is hoped that under the conditions where nopreempt is used, this
426 	 * will not matter (i.e. all requests to that context will be
427 	 * nopreempt for as long as desired).
428 	 */
429 	if (i915_request_has_nopreempt(rq))
430 		prio = I915_PRIORITY_UNPREEMPTABLE;
431 
432 	return prio;
433 }
434 
435 static int queue_prio(const struct intel_engine_execlists *execlists)
436 {
437 	struct i915_priolist *p;
438 	struct rb_node *rb;
439 
440 	rb = rb_first_cached(&execlists->queue);
441 	if (!rb)
442 		return INT_MIN;
443 
444 	/*
445 	 * As the priolist[] are inverted, with the highest priority in [0],
446 	 * we have to flip the index value to become priority.
447 	 */
448 	p = to_priolist(rb);
449 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
450 }
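/*
 * Worked example of the flip above: with p->used == BIT(0), ffs() returns 1
 * and queue_prio() yields ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - 1,
 * i.e. the highest sub-level of that user priority; a later used bit (a
 * lower sub-level) gives a larger ffs() and hence a smaller effective value.
 */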
451 
452 static inline bool need_preempt(const struct intel_engine_cs *engine,
453 				const struct i915_request *rq,
454 				struct rb_node *rb)
455 {
456 	int last_prio;
457 
458 	if (!intel_engine_has_semaphores(engine))
459 		return false;
460 
461 	/*
462 	 * Check if the current priority hint merits a preemption attempt.
463 	 *
464 	 * We record the highest value priority we saw during rescheduling
465 	 * prior to this dequeue, therefore we know that if it is strictly
466 	 * less than the current tail of ELSP[0], we do not need to force
467 	 * a preempt-to-idle cycle.
468 	 *
469 	 * However, the priority hint is a mere hint that we may need to
470 	 * preempt. If that hint is stale or we may be trying to preempt
471 	 * ourselves, ignore the request.
472 	 *
473 	 * More naturally we would write
474 	 *      prio >= max(0, last);
475 	 * except that we wish to prevent triggering preemption at the same
476 	 * priority level: the task that is running should remain running
477 	 * to preserve FIFO ordering of dependencies.
478 	 */
479 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
480 	if (engine->execlists.queue_priority_hint <= last_prio)
481 		return false;
482 
483 	/*
484 	 * Check against the first request in ELSP[1], it will, thanks to the
485 	 * power of PI, be the highest priority of that context.
486 	 */
487 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
488 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
489 		return true;
490 
491 	if (rb) {
492 		struct virtual_engine *ve =
493 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
494 		bool preempt = false;
495 
496 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
497 			struct i915_request *next;
498 
499 			rcu_read_lock();
500 			next = READ_ONCE(ve->request);
501 			if (next)
502 				preempt = rq_prio(next) > last_prio;
503 			rcu_read_unlock();
504 		}
505 
506 		if (preempt)
507 			return preempt;
508 	}
509 
510 	/*
511 	 * If the inflight context did not trigger the preemption, then maybe
512 	 * it was the set of queued requests? Pick the highest priority in
513 	 * the queue (the first active priolist) and see if it deserves to be
514 	 * running instead of ELSP[0].
515 	 *
516 	 * The highest priority request in the queue can not be either
517 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
518 	 * context, its priority would not exceed ELSP[0] aka last_prio.
519 	 */
520 	return queue_prio(&engine->execlists) > last_prio;
521 }
522 
523 __maybe_unused static inline bool
524 assert_priority_queue(const struct i915_request *prev,
525 		      const struct i915_request *next)
526 {
527 	/*
528 	 * Without preemption, the prev may refer to the still active element
529 	 * which we refuse to let go.
530 	 *
531 	 * Even with preemption, there are times when we think it is better not
532 	 * to preempt and leave an ostensibly lower priority request in flight.
533 	 */
534 	if (i915_request_is_active(prev))
535 		return true;
536 
537 	return rq_prio(prev) >= rq_prio(next);
538 }
539 
540 /*
541  * The context descriptor encodes various attributes of a context,
542  * including its GTT address and some flags. Because it's fairly
543  * expensive to calculate, we'll just do it once and cache the result,
544  * which remains valid until the context is unpinned.
545  *
546  * This is what a descriptor looks like, from LSB to MSB::
547  *
548  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
549  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
550  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
551  *      bits 53-54:    mbz, reserved for use by hardware
552  *      bits 55-63:    group ID, currently unused and set to 0
553  *
554  * Starting from Gen11, the upper dword of the descriptor has a new format:
555  *
556  *      bits 32-36:    reserved
557  *      bits 37-47:    SW context ID
558  *      bits 48-53:    engine instance
559  *      bit 54:        mbz, reserved for use by hardware
560  *      bits 55-60:    SW counter
561  *      bits 61-63:    engine class
562  *
563  * engine info, SW context ID and SW counter need to form a unique number
564  * (Context ID) per lrc.
565  */
566 static u32
567 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
568 {
569 	u32 desc;
570 
571 	desc = INTEL_LEGACY_32B_CONTEXT;
572 	if (i915_vm_is_4lvl(ce->vm))
573 		desc = INTEL_LEGACY_64B_CONTEXT;
574 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
575 
576 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
577 	if (IS_GEN(engine->i915, 8))
578 		desc |= GEN8_CTX_L3LLC_COHERENT;
579 
580 	return i915_ggtt_offset(ce->state) | desc;
581 }
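/*
 * Illustrative decomposition of the descriptor built above (made-up GGTT
 * offset): for context state at 0x10000 with a 4-level ppGTT, the low dword
 * is 0x10000 | (INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT) |
 * GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE (plus GEN8_CTX_L3LLC_COHERENT on Gen8).
 * The SW context ID / ccid half described in the comment is filled in later,
 * at schedule-in time (see __execlists_schedule_in()).
 */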
582 
583 static inline unsigned int dword_in_page(void *addr)
584 {
585 	return offset_in_page(addr) / sizeof(u32);
586 }
587 
588 static void set_offsets(u32 *regs,
589 			const u8 *data,
590 			const struct intel_engine_cs *engine,
591 			bool clear)
592 #define NOP(x) (BIT(7) | (x))
593 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
594 #define POSTED BIT(0)
595 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
596 #define REG16(x) \
597 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
598 	(((x) >> 2) & 0x7f)
599 #define END(total_state_size) 0, (total_state_size)
600 {
601 	const u32 base = engine->mmio_base;
602 
603 	while (*data) {
604 		u8 count, flags;
605 
606 		if (*data & BIT(7)) { /* skip */
607 			count = *data++ & ~BIT(7);
608 			if (clear)
609 				memset32(regs, MI_NOOP, count);
610 			regs += count;
611 			continue;
612 		}
613 
614 		count = *data & 0x3f;
615 		flags = *data >> 6;
616 		data++;
617 
618 		*regs = MI_LOAD_REGISTER_IMM(count);
619 		if (flags & POSTED)
620 			*regs |= MI_LRI_FORCE_POSTED;
621 		if (INTEL_GEN(engine->i915) >= 11)
622 			*regs |= MI_LRI_LRM_CS_MMIO;
623 		regs++;
624 
625 		GEM_BUG_ON(!count);
626 		do {
627 			u32 offset = 0;
628 			u8 v;
629 
630 			do {
631 				v = *data++;
632 				offset <<= 7;
633 				offset |= v & ~BIT(7);
634 			} while (v & BIT(7));
635 
636 			regs[0] = base + (offset << 2);
637 			if (clear)
638 				regs[1] = 0;
639 			regs += 2;
640 		} while (--count);
641 	}
642 
643 	if (clear) {
644 		u8 count = *++data;
645 
646 		/* Clear past the tail for HW access */
647 		GEM_BUG_ON(dword_in_page(regs) > count);
648 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
649 
650 		/* Close the batch; used mainly by live_lrc_layout() */
651 		*regs = MI_BATCH_BUFFER_END;
652 		if (INTEL_GEN(engine->i915) >= 10)
653 			*regs |= BIT(0);
654 	}
655 }
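/*
 * Decoded example of the table format consumed above, taken from
 * gen8_xcs_offsets below: NOP(1) skips one dword (written as MI_NOOP when
 * clearing), LRI(11, 0) becomes MI_LOAD_REGISTER_IMM(11), REG(0x034) is the
 * single byte 0x0d and decodes to mmio_base + 0x34, and REG16(0x244) is the
 * byte pair { 0x81, 0x11 } which reassembles to ((0x01 << 7) | 0x11) << 2 =
 * 0x244 above the mmio base. The END(total) trailer supplies the number of
 * state dwords so that the clear path can pad up to it with MI_NOOP.
 */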
656 
657 static const u8 gen8_xcs_offsets[] = {
658 	NOP(1),
659 	LRI(11, 0),
660 	REG16(0x244),
661 	REG(0x034),
662 	REG(0x030),
663 	REG(0x038),
664 	REG(0x03c),
665 	REG(0x168),
666 	REG(0x140),
667 	REG(0x110),
668 	REG(0x11c),
669 	REG(0x114),
670 	REG(0x118),
671 
672 	NOP(9),
673 	LRI(9, 0),
674 	REG16(0x3a8),
675 	REG16(0x28c),
676 	REG16(0x288),
677 	REG16(0x284),
678 	REG16(0x280),
679 	REG16(0x27c),
680 	REG16(0x278),
681 	REG16(0x274),
682 	REG16(0x270),
683 
684 	NOP(13),
685 	LRI(2, 0),
686 	REG16(0x200),
687 	REG(0x028),
688 
689 	END(80)
690 };
691 
692 static const u8 gen9_xcs_offsets[] = {
693 	NOP(1),
694 	LRI(14, POSTED),
695 	REG16(0x244),
696 	REG(0x034),
697 	REG(0x030),
698 	REG(0x038),
699 	REG(0x03c),
700 	REG(0x168),
701 	REG(0x140),
702 	REG(0x110),
703 	REG(0x11c),
704 	REG(0x114),
705 	REG(0x118),
706 	REG(0x1c0),
707 	REG(0x1c4),
708 	REG(0x1c8),
709 
710 	NOP(3),
711 	LRI(9, POSTED),
712 	REG16(0x3a8),
713 	REG16(0x28c),
714 	REG16(0x288),
715 	REG16(0x284),
716 	REG16(0x280),
717 	REG16(0x27c),
718 	REG16(0x278),
719 	REG16(0x274),
720 	REG16(0x270),
721 
722 	NOP(13),
723 	LRI(1, POSTED),
724 	REG16(0x200),
725 
726 	NOP(13),
727 	LRI(44, POSTED),
728 	REG(0x028),
729 	REG(0x09c),
730 	REG(0x0c0),
731 	REG(0x178),
732 	REG(0x17c),
733 	REG16(0x358),
734 	REG(0x170),
735 	REG(0x150),
736 	REG(0x154),
737 	REG(0x158),
738 	REG16(0x41c),
739 	REG16(0x600),
740 	REG16(0x604),
741 	REG16(0x608),
742 	REG16(0x60c),
743 	REG16(0x610),
744 	REG16(0x614),
745 	REG16(0x618),
746 	REG16(0x61c),
747 	REG16(0x620),
748 	REG16(0x624),
749 	REG16(0x628),
750 	REG16(0x62c),
751 	REG16(0x630),
752 	REG16(0x634),
753 	REG16(0x638),
754 	REG16(0x63c),
755 	REG16(0x640),
756 	REG16(0x644),
757 	REG16(0x648),
758 	REG16(0x64c),
759 	REG16(0x650),
760 	REG16(0x654),
761 	REG16(0x658),
762 	REG16(0x65c),
763 	REG16(0x660),
764 	REG16(0x664),
765 	REG16(0x668),
766 	REG16(0x66c),
767 	REG16(0x670),
768 	REG16(0x674),
769 	REG16(0x678),
770 	REG16(0x67c),
771 	REG(0x068),
772 
773 	END(176)
774 };
775 
776 static const u8 gen12_xcs_offsets[] = {
777 	NOP(1),
778 	LRI(13, POSTED),
779 	REG16(0x244),
780 	REG(0x034),
781 	REG(0x030),
782 	REG(0x038),
783 	REG(0x03c),
784 	REG(0x168),
785 	REG(0x140),
786 	REG(0x110),
787 	REG(0x1c0),
788 	REG(0x1c4),
789 	REG(0x1c8),
790 	REG(0x180),
791 	REG16(0x2b4),
792 
793 	NOP(5),
794 	LRI(9, POSTED),
795 	REG16(0x3a8),
796 	REG16(0x28c),
797 	REG16(0x288),
798 	REG16(0x284),
799 	REG16(0x280),
800 	REG16(0x27c),
801 	REG16(0x278),
802 	REG16(0x274),
803 	REG16(0x270),
804 
805 	END(80)
806 };
807 
808 static const u8 gen8_rcs_offsets[] = {
809 	NOP(1),
810 	LRI(14, POSTED),
811 	REG16(0x244),
812 	REG(0x034),
813 	REG(0x030),
814 	REG(0x038),
815 	REG(0x03c),
816 	REG(0x168),
817 	REG(0x140),
818 	REG(0x110),
819 	REG(0x11c),
820 	REG(0x114),
821 	REG(0x118),
822 	REG(0x1c0),
823 	REG(0x1c4),
824 	REG(0x1c8),
825 
826 	NOP(3),
827 	LRI(9, POSTED),
828 	REG16(0x3a8),
829 	REG16(0x28c),
830 	REG16(0x288),
831 	REG16(0x284),
832 	REG16(0x280),
833 	REG16(0x27c),
834 	REG16(0x278),
835 	REG16(0x274),
836 	REG16(0x270),
837 
838 	NOP(13),
839 	LRI(1, 0),
840 	REG(0x0c8),
841 
842 	END(80)
843 };
844 
845 static const u8 gen9_rcs_offsets[] = {
846 	NOP(1),
847 	LRI(14, POSTED),
848 	REG16(0x244),
849 	REG(0x34),
850 	REG(0x30),
851 	REG(0x38),
852 	REG(0x3c),
853 	REG(0x168),
854 	REG(0x140),
855 	REG(0x110),
856 	REG(0x11c),
857 	REG(0x114),
858 	REG(0x118),
859 	REG(0x1c0),
860 	REG(0x1c4),
861 	REG(0x1c8),
862 
863 	NOP(3),
864 	LRI(9, POSTED),
865 	REG16(0x3a8),
866 	REG16(0x28c),
867 	REG16(0x288),
868 	REG16(0x284),
869 	REG16(0x280),
870 	REG16(0x27c),
871 	REG16(0x278),
872 	REG16(0x274),
873 	REG16(0x270),
874 
875 	NOP(13),
876 	LRI(1, 0),
877 	REG(0xc8),
878 
879 	NOP(13),
880 	LRI(44, POSTED),
881 	REG(0x28),
882 	REG(0x9c),
883 	REG(0xc0),
884 	REG(0x178),
885 	REG(0x17c),
886 	REG16(0x358),
887 	REG(0x170),
888 	REG(0x150),
889 	REG(0x154),
890 	REG(0x158),
891 	REG16(0x41c),
892 	REG16(0x600),
893 	REG16(0x604),
894 	REG16(0x608),
895 	REG16(0x60c),
896 	REG16(0x610),
897 	REG16(0x614),
898 	REG16(0x618),
899 	REG16(0x61c),
900 	REG16(0x620),
901 	REG16(0x624),
902 	REG16(0x628),
903 	REG16(0x62c),
904 	REG16(0x630),
905 	REG16(0x634),
906 	REG16(0x638),
907 	REG16(0x63c),
908 	REG16(0x640),
909 	REG16(0x644),
910 	REG16(0x648),
911 	REG16(0x64c),
912 	REG16(0x650),
913 	REG16(0x654),
914 	REG16(0x658),
915 	REG16(0x65c),
916 	REG16(0x660),
917 	REG16(0x664),
918 	REG16(0x668),
919 	REG16(0x66c),
920 	REG16(0x670),
921 	REG16(0x674),
922 	REG16(0x678),
923 	REG16(0x67c),
924 	REG(0x68),
925 
926 	END(176)
927 };
928 
929 static const u8 gen11_rcs_offsets[] = {
930 	NOP(1),
931 	LRI(15, POSTED),
932 	REG16(0x244),
933 	REG(0x034),
934 	REG(0x030),
935 	REG(0x038),
936 	REG(0x03c),
937 	REG(0x168),
938 	REG(0x140),
939 	REG(0x110),
940 	REG(0x11c),
941 	REG(0x114),
942 	REG(0x118),
943 	REG(0x1c0),
944 	REG(0x1c4),
945 	REG(0x1c8),
946 	REG(0x180),
947 
948 	NOP(1),
949 	LRI(9, POSTED),
950 	REG16(0x3a8),
951 	REG16(0x28c),
952 	REG16(0x288),
953 	REG16(0x284),
954 	REG16(0x280),
955 	REG16(0x27c),
956 	REG16(0x278),
957 	REG16(0x274),
958 	REG16(0x270),
959 
960 	LRI(1, POSTED),
961 	REG(0x1b0),
962 
963 	NOP(10),
964 	LRI(1, 0),
965 	REG(0x0c8),
966 
967 	END(80)
968 };
969 
970 static const u8 gen12_rcs_offsets[] = {
971 	NOP(1),
972 	LRI(13, POSTED),
973 	REG16(0x244),
974 	REG(0x034),
975 	REG(0x030),
976 	REG(0x038),
977 	REG(0x03c),
978 	REG(0x168),
979 	REG(0x140),
980 	REG(0x110),
981 	REG(0x1c0),
982 	REG(0x1c4),
983 	REG(0x1c8),
984 	REG(0x180),
985 	REG16(0x2b4),
986 
987 	NOP(5),
988 	LRI(9, POSTED),
989 	REG16(0x3a8),
990 	REG16(0x28c),
991 	REG16(0x288),
992 	REG16(0x284),
993 	REG16(0x280),
994 	REG16(0x27c),
995 	REG16(0x278),
996 	REG16(0x274),
997 	REG16(0x270),
998 
999 	LRI(3, POSTED),
1000 	REG(0x1b0),
1001 	REG16(0x5a8),
1002 	REG16(0x5ac),
1003 
1004 	NOP(6),
1005 	LRI(1, 0),
1006 	REG(0x0c8),
1007 	NOP(3 + 9 + 1),
1008 
1009 	LRI(51, POSTED),
1010 	REG16(0x588),
1011 	REG16(0x588),
1012 	REG16(0x588),
1013 	REG16(0x588),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG(0x028),
1017 	REG(0x09c),
1018 	REG(0x0c0),
1019 	REG(0x178),
1020 	REG(0x17c),
1021 	REG16(0x358),
1022 	REG(0x170),
1023 	REG(0x150),
1024 	REG(0x154),
1025 	REG(0x158),
1026 	REG16(0x41c),
1027 	REG16(0x600),
1028 	REG16(0x604),
1029 	REG16(0x608),
1030 	REG16(0x60c),
1031 	REG16(0x610),
1032 	REG16(0x614),
1033 	REG16(0x618),
1034 	REG16(0x61c),
1035 	REG16(0x620),
1036 	REG16(0x624),
1037 	REG16(0x628),
1038 	REG16(0x62c),
1039 	REG16(0x630),
1040 	REG16(0x634),
1041 	REG16(0x638),
1042 	REG16(0x63c),
1043 	REG16(0x640),
1044 	REG16(0x644),
1045 	REG16(0x648),
1046 	REG16(0x64c),
1047 	REG16(0x650),
1048 	REG16(0x654),
1049 	REG16(0x658),
1050 	REG16(0x65c),
1051 	REG16(0x660),
1052 	REG16(0x664),
1053 	REG16(0x668),
1054 	REG16(0x66c),
1055 	REG16(0x670),
1056 	REG16(0x674),
1057 	REG16(0x678),
1058 	REG16(0x67c),
1059 	REG(0x068),
1060 	REG(0x084),
1061 	NOP(1),
1062 
1063 	END(192)
1064 };
1065 
1066 #undef END
1067 #undef REG16
1068 #undef REG
1069 #undef LRI
1070 #undef NOP
1071 
1072 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1073 {
1074 	/*
1075 	 * The gen12+ lists only have the registers we program in the basic
1076 	 * default state. We rely on the context image using relative
1077 	 * addressing to automatically fix up the register state between the
1078 	 * physical engines for the virtual engine.
1079 	 */
1080 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1081 		   !intel_engine_has_relative_mmio(engine));
1082 
1083 	if (engine->class == RENDER_CLASS) {
1084 		if (INTEL_GEN(engine->i915) >= 12)
1085 			return gen12_rcs_offsets;
1086 		else if (INTEL_GEN(engine->i915) >= 11)
1087 			return gen11_rcs_offsets;
1088 		else if (INTEL_GEN(engine->i915) >= 9)
1089 			return gen9_rcs_offsets;
1090 		else
1091 			return gen8_rcs_offsets;
1092 	} else {
1093 		if (INTEL_GEN(engine->i915) >= 12)
1094 			return gen12_xcs_offsets;
1095 		else if (INTEL_GEN(engine->i915) >= 9)
1096 			return gen9_xcs_offsets;
1097 		else
1098 			return gen8_xcs_offsets;
1099 	}
1100 }
1101 
1102 static struct i915_request *
1103 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1104 {
1105 	struct i915_request *rq, *rn, *active = NULL;
1106 	struct list_head *uninitialized_var(pl);
1107 	int prio = I915_PRIORITY_INVALID;
1108 
1109 	lockdep_assert_held(&engine->active.lock);
1110 
1111 	list_for_each_entry_safe_reverse(rq, rn,
1112 					 &engine->active.requests,
1113 					 sched.link) {
1114 		if (i915_request_completed(rq))
1115 			continue; /* XXX */
1116 
1117 		__i915_request_unsubmit(rq);
1118 
1119 		/*
1120 		 * Push the request back into the queue for later resubmission.
1121 		 * If this request is not native to this physical engine (i.e.
1122 		 * it came from a virtual source), push it back onto the virtual
1123 		 * engine so that it can be moved across onto another physical
1124 		 * engine as load dictates.
1125 		 */
1126 		if (likely(rq->execution_mask == engine->mask)) {
1127 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1128 			if (rq_prio(rq) != prio) {
1129 				prio = rq_prio(rq);
1130 				pl = i915_sched_lookup_priolist(engine, prio);
1131 			}
1132 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1133 
1134 			list_move(&rq->sched.link, pl);
1135 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1136 
1137 			active = rq;
1138 		} else {
1139 			struct intel_engine_cs *owner = rq->context->engine;
1140 
1141 			/*
1142 			 * Decouple the virtual breadcrumb before moving it
1143 			 * back to the virtual engine -- we don't want the
1144 			 * request to complete in the background and try
1145 			 * and cancel the breadcrumb on the virtual engine
1146 			 * (instead of the old engine where it is linked)!
1147 			 */
1148 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1149 				     &rq->fence.flags)) {
1150 				spin_lock_nested(&rq->lock,
1151 						 SINGLE_DEPTH_NESTING);
1152 				i915_request_cancel_breadcrumb(rq);
1153 				spin_unlock(&rq->lock);
1154 			}
1155 			WRITE_ONCE(rq->engine, owner);
1156 			owner->submit_request(rq);
1157 			active = NULL;
1158 		}
1159 	}
1160 
1161 	return active;
1162 }
1163 
1164 struct i915_request *
1165 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1166 {
1167 	struct intel_engine_cs *engine =
1168 		container_of(execlists, typeof(*engine), execlists);
1169 
1170 	return __unwind_incomplete_requests(engine);
1171 }
1172 
1173 static inline void
1174 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1175 {
1176 	/*
1177 	 * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1178 	 * the compiler should eliminate this function as dead code.
1179 	 */
1180 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1181 		return;
1182 
1183 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1184 				   status, rq);
1185 }
1186 
1187 static void intel_engine_context_in(struct intel_engine_cs *engine)
1188 {
1189 	unsigned long flags;
1190 
1191 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1192 		return;
1193 
1194 	write_seqlock_irqsave(&engine->stats.lock, flags);
1195 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1196 		engine->stats.start = ktime_get();
1197 		atomic_inc(&engine->stats.active);
1198 	}
1199 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1200 }
1201 
1202 static void intel_engine_context_out(struct intel_engine_cs *engine)
1203 {
1204 	unsigned long flags;
1205 
1206 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1207 
1208 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1209 		return;
1210 
1211 	write_seqlock_irqsave(&engine->stats.lock, flags);
1212 	if (atomic_dec_and_test(&engine->stats.active)) {
1213 		engine->stats.total =
1214 			ktime_add(engine->stats.total,
1215 				  ktime_sub(ktime_get(), engine->stats.start));
1216 	}
1217 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1218 }
1219 
1220 static void
1221 execlists_check_context(const struct intel_context *ce,
1222 			const struct intel_engine_cs *engine)
1223 {
1224 	const struct intel_ring *ring = ce->ring;
1225 	u32 *regs = ce->lrc_reg_state;
1226 	bool valid = true;
1227 	int x;
1228 
1229 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1230 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1231 		       engine->name,
1232 		       regs[CTX_RING_START],
1233 		       i915_ggtt_offset(ring->vma));
1234 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1235 		valid = false;
1236 	}
1237 
1238 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1239 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1240 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1241 		       engine->name,
1242 		       regs[CTX_RING_CTL],
1243 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1244 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1245 		valid = false;
1246 	}
1247 
1248 	x = lrc_ring_mi_mode(engine);
1249 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1250 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1251 		       engine->name, regs[x + 1]);
1252 		regs[x + 1] &= ~STOP_RING;
1253 		regs[x + 1] |= STOP_RING << 16;
1254 		valid = false;
1255 	}
1256 
1257 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1258 }
1259 
1260 static void restore_default_state(struct intel_context *ce,
1261 				  struct intel_engine_cs *engine)
1262 {
1263 	u32 *regs;
1264 
1265 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1266 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1267 
1268 	ce->runtime.last = intel_context_get_runtime(ce);
1269 }
1270 
1271 static void reset_active(struct i915_request *rq,
1272 			 struct intel_engine_cs *engine)
1273 {
1274 	struct intel_context * const ce = rq->context;
1275 	u32 head;
1276 
1277 	/*
1278 	 * The executing context has been cancelled. We want to prevent
1279 	 * further execution along this context and propagate the error on
1280 	 * to anything depending on its results.
1281 	 *
1282 	 * In __i915_request_submit(), we apply the -EIO and remove the
1283 	 * requests' payloads for any banned requests. But first, we must
1284 	 * rewind the context back to the start of the incomplete request so
1285 	 * that we do not jump back into the middle of the batch.
1286 	 *
1287 	 * We preserve the breadcrumbs and semaphores of the incomplete
1288 	 * requests so that inter-timeline dependencies (i.e other timelines)
1289 	 * remain correctly ordered. And we defer to __i915_request_submit()
1290 	 * so that all asynchronous waits are correctly handled.
1291 	 */
1292 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1293 		     rq->fence.context, rq->fence.seqno);
1294 
1295 	/* On resubmission of the active request, payload will be scrubbed */
1296 	if (i915_request_completed(rq))
1297 		head = rq->tail;
1298 	else
1299 		head = active_request(ce->timeline, rq)->head;
1300 	head = intel_ring_wrap(ce->ring, head);
1301 
1302 	/* Scrub the context image to prevent replaying the previous batch */
1303 	restore_default_state(ce, engine);
1304 	__execlists_update_reg_state(ce, engine, head);
1305 
1306 	/* We've switched away, so this should be a no-op, but intent matters */
1307 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1308 }
1309 
1310 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1311 {
1312 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1313 	ce->runtime.num_underflow += dt < 0;
1314 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1315 #endif
1316 }
1317 
1318 static void intel_context_update_runtime(struct intel_context *ce)
1319 {
1320 	u32 old;
1321 	s32 dt;
1322 
1323 	if (intel_context_is_barrier(ce))
1324 		return;
1325 
1326 	old = ce->runtime.last;
1327 	ce->runtime.last = intel_context_get_runtime(ce);
1328 	dt = ce->runtime.last - old;
1329 
1330 	if (unlikely(dt <= 0)) {
1331 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1332 			 old, ce->runtime.last, dt);
1333 		st_update_runtime_underflow(ce, dt);
1334 		return;
1335 	}
1336 
1337 	ewma_runtime_add(&ce->runtime.avg, dt);
1338 	ce->runtime.total += dt;
1339 }
1340 
1341 static inline struct intel_engine_cs *
1342 __execlists_schedule_in(struct i915_request *rq)
1343 {
1344 	struct intel_engine_cs * const engine = rq->engine;
1345 	struct intel_context * const ce = rq->context;
1346 
1347 	intel_context_get(ce);
1348 
1349 	if (unlikely(intel_context_is_banned(ce)))
1350 		reset_active(rq, engine);
1351 
1352 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1353 		execlists_check_context(ce, engine);
1354 
1355 	if (ce->tag) {
1356 		/* Use a fixed tag for OA and friends */
1357 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1358 		ce->lrc.ccid = ce->tag;
1359 	} else {
1360 		/* We don't need a strict matching tag, just different values */
1361 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1362 
1363 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1364 		clear_bit(tag - 1, &engine->context_tag);
1365 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1366 
1367 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1368 	}
1369 
1370 	ce->lrc.ccid |= engine->execlists.ccid;
1371 
1372 	__intel_gt_pm_get(engine->gt);
1373 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1374 	intel_engine_context_in(engine);
1375 
1376 	return engine;
1377 }
1378 
1379 static inline struct i915_request *
1380 execlists_schedule_in(struct i915_request *rq, int idx)
1381 {
1382 	struct intel_context * const ce = rq->context;
1383 	struct intel_engine_cs *old;
1384 
1385 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1386 	trace_i915_request_in(rq, idx);
1387 
1388 	old = READ_ONCE(ce->inflight);
1389 	do {
1390 		if (!old) {
1391 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1392 			break;
1393 		}
1394 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1395 
1396 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1397 	return i915_request_get(rq);
1398 }
1399 
1400 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1401 {
1402 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1403 	struct i915_request *next = READ_ONCE(ve->request);
1404 
1405 	if (next && next->execution_mask & ~rq->execution_mask)
1406 		tasklet_schedule(&ve->base.execlists.tasklet);
1407 }
1408 
1409 static inline void
1410 __execlists_schedule_out(struct i915_request *rq,
1411 			 struct intel_engine_cs * const engine,
1412 			 unsigned int ccid)
1413 {
1414 	struct intel_context * const ce = rq->context;
1415 
1416 	/*
1417 	 * NB process_csb() is not under the engine->active.lock and hence
1418 	 * schedule_out can race with schedule_in meaning that we should
1419 	 * refrain from doing non-trivial work here.
1420 	 */
1421 
1422 	/*
1423 	 * If we have just completed this context, the engine may now be
1424 	 * idle and we want to re-enter powersaving.
1425 	 */
1426 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1427 	    i915_request_completed(rq))
1428 		intel_engine_add_retire(engine, ce->timeline);
1429 
1430 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1431 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1432 	if (ccid < BITS_PER_LONG) {
1433 		GEM_BUG_ON(ccid == 0);
1434 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1435 		set_bit(ccid - 1, &engine->context_tag);
1436 	}
1437 
1438 	intel_context_update_runtime(ce);
1439 	intel_engine_context_out(engine);
1440 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1441 	intel_gt_pm_put_async(engine->gt);
1442 
1443 	/*
1444 	 * If this is part of a virtual engine, its next request may
1445 	 * have been blocked waiting for access to the active context.
1446 	 * We have to kick all the siblings again in case we need to
1447 	 * switch (e.g. the next request is not runnable on this
1448 	 * engine). Hopefully, we will already have submitted the next
1449 	 * request before the tasklet runs and do not need to rebuild
1450 	 * each virtual tree and kick everyone again.
1451 	 */
1452 	if (ce->engine != engine)
1453 		kick_siblings(rq, ce);
1454 
1455 	intel_context_put(ce);
1456 }
1457 
1458 static inline void
1459 execlists_schedule_out(struct i915_request *rq)
1460 {
1461 	struct intel_context * const ce = rq->context;
1462 	struct intel_engine_cs *cur, *old;
1463 	u32 ccid;
1464 
1465 	trace_i915_request_out(rq);
1466 
1467 	ccid = rq->context->lrc.ccid;
1468 	old = READ_ONCE(ce->inflight);
1469 	do
1470 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1471 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1472 	if (!cur)
1473 		__execlists_schedule_out(rq, old, ccid);
1474 
1475 	i915_request_put(rq);
1476 }
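/*
 * Note on the ce->inflight juggling above: the low bits of the pointer act
 * as a count of how many submitted ports currently reference this context
 * (ptr_inc()/ptr_dec()), so only the first schedule-in and the last
 * schedule-out do the real work; intermediate lite-restores merely bump or
 * drop the count.
 */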
1477 
1478 static u64 execlists_update_context(struct i915_request *rq)
1479 {
1480 	struct intel_context *ce = rq->context;
1481 	u64 desc = ce->lrc.desc;
1482 	u32 tail, prev;
1483 
1484 	/*
1485 	 * WaIdleLiteRestore:bdw,skl
1486 	 *
1487 	 * We should never submit the context with the same RING_TAIL twice
1488 	 * just in case we submit an empty ring, which confuses the HW.
1489 	 *
1490 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1491 	 * the normal request to be able to always advance the RING_TAIL on
1492 	 * subsequent resubmissions (for lite restore). Should that fail us,
1493 	 * and we try and submit the same tail again, force the context
1494 	 * reload.
1495 	 *
1496 	 * If we need to return to a preempted context, we need to skip the
1497 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1498 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1499 	 * an earlier request.
1500 	 */
1501 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1502 	prev = ce->lrc_reg_state[CTX_RING_TAIL];
1503 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1504 		desc |= CTX_DESC_FORCE_RESTORE;
1505 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1506 	rq->tail = rq->wa_tail;
1507 
1508 	/*
1509 	 * Make sure the context image is complete before we submit it to HW.
1510 	 *
1511 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1512 	 * an uncached write such as our mmio register access, the empirical
1513 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1514 	 * may not be visible to the HW prior to the completion of the UC
1515 	 * register write and that we may begin execution from the context
1516 	 * before its image is complete leading to invalid PD chasing.
1517 	 */
1518 	wmb();
1519 
1520 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1521 	return desc;
1522 }
1523 
1524 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1525 {
1526 	if (execlists->ctrl_reg) {
1527 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1528 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1529 	} else {
1530 		writel(upper_32_bits(desc), execlists->submit_reg);
1531 		writel(lower_32_bits(desc), execlists->submit_reg);
1532 	}
1533 }
1534 
1535 static __maybe_unused char *
1536 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1537 {
1538 	if (!rq)
1539 		return "";
1540 
1541 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1542 		 prefix,
1543 		 rq->context->lrc.ccid,
1544 		 rq->fence.context, rq->fence.seqno,
1545 		 i915_request_completed(rq) ? "!" :
1546 		 i915_request_started(rq) ? "*" :
1547 		 "",
1548 		 rq_prio(rq));
1549 
1550 	return buf;
1551 }
1552 
1553 static __maybe_unused void
1554 trace_ports(const struct intel_engine_execlists *execlists,
1555 	    const char *msg,
1556 	    struct i915_request * const *ports)
1557 {
1558 	const struct intel_engine_cs *engine =
1559 		container_of(execlists, typeof(*engine), execlists);
1560 	char __maybe_unused p0[40], p1[40];
1561 
1562 	if (!ports[0])
1563 		return;
1564 
1565 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1566 		     dump_port(p0, sizeof(p0), "", ports[0]),
1567 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1568 }
1569 
1570 static inline bool
1571 reset_in_progress(const struct intel_engine_execlists *execlists)
1572 {
1573 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1574 }
1575 
1576 static __maybe_unused bool
1577 assert_pending_valid(const struct intel_engine_execlists *execlists,
1578 		     const char *msg)
1579 {
1580 	struct intel_engine_cs *engine =
1581 		container_of(execlists, typeof(*engine), execlists);
1582 	struct i915_request * const *port, *rq;
1583 	struct intel_context *ce = NULL;
1584 	bool sentinel = false;
1585 	u32 ccid = -1;
1586 
1587 	trace_ports(execlists, msg, execlists->pending);
1588 
1589 	/* We may be messing around with the lists during reset, lalala */
1590 	if (reset_in_progress(execlists))
1591 		return true;
1592 
1593 	if (!execlists->pending[0]) {
1594 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1595 			      engine->name);
1596 		return false;
1597 	}
1598 
1599 	if (execlists->pending[execlists_num_ports(execlists)]) {
1600 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1601 			      engine->name, execlists_num_ports(execlists));
1602 		return false;
1603 	}
1604 
1605 	for (port = execlists->pending; (rq = *port); port++) {
1606 		unsigned long flags;
1607 		bool ok = true;
1608 
1609 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1610 		GEM_BUG_ON(!i915_request_is_active(rq));
1611 
1612 		if (ce == rq->context) {
1613 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1614 				      engine->name,
1615 				      ce->timeline->fence_context,
1616 				      port - execlists->pending);
1617 			return false;
1618 		}
1619 		ce = rq->context;
1620 
1621 		if (ccid == ce->lrc.ccid) {
1622 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1623 				      engine->name,
1624 				      ccid, ce->timeline->fence_context,
1625 				      port - execlists->pending);
1626 			return false;
1627 		}
1628 		ccid = ce->lrc.ccid;
1629 
1630 		/*
1631 		 * Sentinels are supposed to be lonely so they flush the
1632 		 * current execution off the HW. Check that they are the
1633 		 * only request in the pending submission.
1634 		 */
1635 		if (sentinel) {
1636 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1637 				      engine->name,
1638 				      ce->timeline->fence_context,
1639 				      port - execlists->pending);
1640 			return false;
1641 		}
1642 
1643 		sentinel = i915_request_has_sentinel(rq);
1644 		if (sentinel && port != execlists->pending) {
1645 			GEM_TRACE_ERR("%s: sentinel context:%llx not in prime position[%zd]\n",
1646 				      engine->name,
1647 				      ce->timeline->fence_context,
1648 				      port - execlists->pending);
1649 			return false;
1650 		}
1651 
1652 		/* Hold tightly onto the lock to prevent concurrent retires! */
1653 		if (!spin_trylock_irqsave(&rq->lock, flags))
1654 			continue;
1655 
1656 		if (i915_request_completed(rq))
1657 			goto unlock;
1658 
1659 		if (i915_active_is_idle(&ce->active) &&
1660 		    !intel_context_is_barrier(ce)) {
1661 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1662 				      engine->name,
1663 				      ce->timeline->fence_context,
1664 				      port - execlists->pending);
1665 			ok = false;
1666 			goto unlock;
1667 		}
1668 
1669 		if (!i915_vma_is_pinned(ce->state)) {
1670 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1671 				      engine->name,
1672 				      ce->timeline->fence_context,
1673 				      port - execlists->pending);
1674 			ok = false;
1675 			goto unlock;
1676 		}
1677 
1678 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1679 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1680 				      engine->name,
1681 				      ce->timeline->fence_context,
1682 				      port - execlists->pending);
1683 			ok = false;
1684 			goto unlock;
1685 		}
1686 
1687 unlock:
1688 		spin_unlock_irqrestore(&rq->lock, flags);
1689 		if (!ok)
1690 			return false;
1691 	}
1692 
1693 	return ce;
1694 }
1695 
1696 static void execlists_submit_ports(struct intel_engine_cs *engine)
1697 {
1698 	struct intel_engine_execlists *execlists = &engine->execlists;
1699 	unsigned int n;
1700 
1701 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1702 
1703 	/*
1704 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1705 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1706 	 * not be relinquished until the device is idle (see
1707 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1708 	 * that all ELSP are drained i.e. we have processed the CSB,
1709 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1710 	 */
1711 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1712 
1713 	/*
1714 	 * ELSQ note: the submit queue is not cleared after being submitted
1715 	 * to the HW so we need to make sure we always clean it up. This is
1716 	 * currently ensured by the fact that we always write the same number
1717 	 * of elsq entries, keep this in mind before changing the loop below.
1718 	 */
1719 	for (n = execlists_num_ports(execlists); n--; ) {
1720 		struct i915_request *rq = execlists->pending[n];
1721 
1722 		write_desc(execlists,
1723 			   rq ? execlists_update_context(rq) : 0,
1724 			   n);
1725 	}
1726 
1727 	/* we need to manually load the submit queue */
1728 	if (execlists->ctrl_reg)
1729 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1730 }
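/*
 * The ports above are written in descending order so that port 0 goes out
 * last: on the legacy ELSP path that final lower-dword write is (presumably)
 * what latches the new execution list, whereas the ELSQ path relies on the
 * explicit EL_CTRL_LOAD write instead.
 */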
1731 
1732 static bool ctx_single_port_submission(const struct intel_context *ce)
1733 {
1734 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1735 		intel_context_force_single_submission(ce));
1736 }
1737 
1738 static bool can_merge_ctx(const struct intel_context *prev,
1739 			  const struct intel_context *next)
1740 {
1741 	if (prev != next)
1742 		return false;
1743 
1744 	if (ctx_single_port_submission(prev))
1745 		return false;
1746 
1747 	return true;
1748 }
1749 
1750 static unsigned long i915_request_flags(const struct i915_request *rq)
1751 {
1752 	return READ_ONCE(rq->fence.flags);
1753 }
1754 
1755 static bool can_merge_rq(const struct i915_request *prev,
1756 			 const struct i915_request *next)
1757 {
1758 	GEM_BUG_ON(prev == next);
1759 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1760 
1761 	/*
1762 	 * We do not submit known completed requests. Therefore if the next
1763 	 * request is already completed, we can pretend to merge it in
1764 	 * with the previous context (and we will skip updating the ELSP
1765 	 * and tracking). Thus hopefully keeping the ELSP full with active
1766 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1767 	 * us.
1768 	 */
1769 	if (i915_request_completed(next))
1770 		return true;
1771 
1772 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1773 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1774 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1775 		return false;
1776 
1777 	if (!can_merge_ctx(prev->context, next->context))
1778 		return false;
1779 
1780 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1781 	return true;
1782 }
1783 
1784 static void virtual_update_register_offsets(u32 *regs,
1785 					    struct intel_engine_cs *engine)
1786 {
1787 	set_offsets(regs, reg_offsets(engine), engine, false);
1788 }
1789 
1790 static bool virtual_matches(const struct virtual_engine *ve,
1791 			    const struct i915_request *rq,
1792 			    const struct intel_engine_cs *engine)
1793 {
1794 	const struct intel_engine_cs *inflight;
1795 
1796 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1797 		return false;
1798 
1799 	/*
1800 	 * We track when the HW has completed saving the context image
1801 	 * (i.e. when we have seen the final CS event switching out of
1802 	 * the context) and must not overwrite the context image before
1803 	 * then. This restricts us to only using the active engine
1804 	 * while the previous virtualized request is inflight (so
1805 	 * we reuse the register offsets). This is a very small
1806 	 * hysteresis on the greedy selection algorithm.
1807 	 */
1808 	inflight = intel_context_inflight(&ve->context);
1809 	if (inflight && inflight != engine)
1810 		return false;
1811 
1812 	return true;
1813 }
1814 
1815 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1816 {
1817 	/*
1818 	 * All the outstanding signals on ve->siblings[0] must have
1819 	 * been completed, just pending the interrupt handler. As those
1820 	 * signals still refer to the old sibling (via rq->engine), we must
1821 	 * transfer those to the old irq_worker to keep our locking
1822 	 * consistent.
1823 	 */
1824 	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1825 }
1826 
1827 #define for_each_waiter(p__, rq__) \
1828 	list_for_each_entry_lockless(p__, \
1829 				     &(rq__)->sched.waiters_list, \
1830 				     wait_link)
1831 
1832 #define for_each_signaler(p__, rq__) \
1833 	list_for_each_entry_rcu(p__, \
1834 				&(rq__)->sched.signalers_list, \
1835 				signal_link)
1836 
1837 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1838 {
1839 	LIST_HEAD(list);
1840 
1841 	/*
1842 	 * We want to move the interrupted request to the back of
1843 	 * the round-robin list (i.e. its priority level), but
1844 	 * in doing so, we must then move all requests that were in
1845 	 * flight and were waiting for the interrupted request to
1846 	 * be run after it again.
1847 	 */
1848 	do {
1849 		struct i915_dependency *p;
1850 
1851 		GEM_BUG_ON(i915_request_is_active(rq));
1852 		list_move_tail(&rq->sched.link, pl);
1853 
1854 		for_each_waiter(p, rq) {
1855 			struct i915_request *w =
1856 				container_of(p->waiter, typeof(*w), sched);
1857 
1858 			if (p->flags & I915_DEPENDENCY_WEAK)
1859 				continue;
1860 
1861 			/* Leave semaphores spinning on the other engines */
1862 			if (w->engine != rq->engine)
1863 				continue;
1864 
1865 			/* No waiter should start before its signaler */
1866 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1867 				   i915_request_started(w) &&
1868 				   !i915_request_completed(rq));
1869 
1870 			GEM_BUG_ON(i915_request_is_active(w));
1871 			if (!i915_request_is_ready(w))
1872 				continue;
1873 
1874 			if (rq_prio(w) < rq_prio(rq))
1875 				continue;
1876 
1877 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1878 			list_move_tail(&w->sched.link, &list);
1879 		}
1880 
1881 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1882 	} while (rq);
1883 }
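/*
 * By way of example: if request B was submitted on this engine waiting on
 * request A, and both were unwound when A's timeslice expired, then B
 * (sharing A's priority) is moved to the back of the priority list together
 * with A, so B still executes after its signaler on the next dequeue.
 */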
1884 
1885 static void defer_active(struct intel_engine_cs *engine)
1886 {
1887 	struct i915_request *rq;
1888 
1889 	rq = __unwind_incomplete_requests(engine);
1890 	if (!rq)
1891 		return;
1892 
1893 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1894 }
1895 
1896 static bool
1897 need_timeslice(const struct intel_engine_cs *engine,
1898 	       const struct i915_request *rq)
1899 {
1900 	int hint;
1901 
1902 	if (!intel_engine_has_timeslices(engine))
1903 		return false;
1904 
1905 	hint = engine->execlists.queue_priority_hint;
1906 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1907 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1908 
1909 	return hint >= effective_prio(rq);
1910 }
1911 
1912 static bool
1913 timeslice_yield(const struct intel_engine_execlists *el,
1914 		const struct i915_request *rq)
1915 {
1916 	/*
1917 	 * Once bitten, forever smitten!
1918 	 *
1919 	 * If the active context ever busy-waited on a semaphore,
1920 	 * it will be treated as a hog until the end of its timeslice (i.e.
1921 	 * until it is scheduled out and replaced by a new submission,
1922 	 * possibly even its own lite-restore). The HW only sends an interrupt
1923 	 * on the first miss, and we do not know if that semaphore has been
1924 	 * signaled, or even if it is now stuck on another semaphore. Play
1925 	 * safe, yield if it might be stuck -- it will be given a fresh
1926 	 * timeslice in the near future.
1927 	 */
1928 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1929 }
1930 
1931 static bool
1932 timeslice_expired(const struct intel_engine_execlists *el,
1933 		  const struct i915_request *rq)
1934 {
1935 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1936 }
1937 
1938 static int
1939 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1940 {
1941 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1942 		return INT_MIN;
1943 
1944 	return rq_prio(list_next_entry(rq, sched.link));
1945 }
1946 
1947 static inline unsigned long
1948 timeslice(const struct intel_engine_cs *engine)
1949 {
1950 	return READ_ONCE(engine->props.timeslice_duration_ms);
1951 }
1952 
1953 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1954 {
1955 	const struct intel_engine_execlists *execlists = &engine->execlists;
1956 	const struct i915_request *rq = *execlists->active;
1957 
1958 	if (!rq || i915_request_completed(rq))
1959 		return 0;
1960 
1961 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1962 		return 0;
1963 
1964 	return timeslice(engine);
1965 }
1966 
1967 static void set_timeslice(struct intel_engine_cs *engine)
1968 {
1969 	unsigned long duration;
1970 
1971 	if (!intel_engine_has_timeslices(engine))
1972 		return;
1973 
1974 	duration = active_timeslice(engine);
1975 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
1976 
1977 	set_timer_ms(&engine->execlists.timer, duration);
1978 }
1979 
1980 static void start_timeslice(struct intel_engine_cs *engine)
1981 {
1982 	struct intel_engine_execlists *execlists = &engine->execlists;
1983 	const int prio = queue_prio(execlists);
1984 	unsigned long duration;
1985 
1986 	if (!intel_engine_has_timeslices(engine))
1987 		return;
1988 
1989 	WRITE_ONCE(execlists->switch_priority_hint, prio);
1990 	if (prio == INT_MIN)
1991 		return;
1992 
1993 	if (timer_pending(&execlists->timer))
1994 		return;
1995 
1996 	duration = timeslice(engine);
1997 	ENGINE_TRACE(engine,
1998 		     "start timeslicing, prio:%d, interval:%lu",
1999 		     prio, duration);
2000 
2001 	set_timer_ms(&execlists->timer, duration);
2002 }
2003 
2004 static void record_preemption(struct intel_engine_execlists *execlists)
2005 {
2006 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2007 }
2008 
2009 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2010 					    const struct i915_request *rq)
2011 {
2012 	if (!rq)
2013 		return 0;
2014 
2015 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2016 	if (unlikely(intel_context_is_banned(rq->context)))
2017 		return 1;
2018 
2019 	return READ_ONCE(engine->props.preempt_timeout_ms);
2020 }
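/*
 * For example, a banned (terminated) context is given a 1ms preempt timeout
 * by active_preempt_timeout() above, regardless of the sysfs-configured
 * preempt_timeout_ms, so that a forced reset follows quickly.
 */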
2021 
2022 static void set_preempt_timeout(struct intel_engine_cs *engine,
2023 				const struct i915_request *rq)
2024 {
2025 	if (!intel_engine_has_preempt_reset(engine))
2026 		return;
2027 
2028 	set_timer_ms(&engine->execlists.preempt,
2029 		     active_preempt_timeout(engine, rq));
2030 }
2031 
2032 static inline void clear_ports(struct i915_request **ports, int count)
2033 {
2034 	memset_p((void **)ports, NULL, count);
2035 }
2036 
2037 static void execlists_dequeue(struct intel_engine_cs *engine)
2038 {
2039 	struct intel_engine_execlists * const execlists = &engine->execlists;
2040 	struct i915_request **port = execlists->pending;
2041 	struct i915_request ** const last_port = port + execlists->port_mask;
2042 	struct i915_request * const *active;
2043 	struct i915_request *last;
2044 	struct rb_node *rb;
2045 	bool submit = false;
2046 
2047 	/*
2048 	 * Hardware submission is through 2 ports. Conceptually each port
2049 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2050 	 * static for a context, and unique to each, so we only execute
2051 	 * requests belonging to a single context from each ring. RING_HEAD
2052 	 * is maintained by the CS in the context image, it marks the place
2053 	 * where it got up to last time, and through RING_TAIL we tell the CS
2054 	 * where we want to execute up to this time.
2055 	 *
2056 	 * In this list the requests are in order of execution. Consecutive
2057 	 * requests from the same context are adjacent in the ringbuffer. We
2058 	 * can combine these requests into a single RING_TAIL update:
2059 	 *
2060 	 *              RING_HEAD...req1...req2
2061 	 *                                    ^- RING_TAIL
2062 	 * since to execute req2 the CS must first execute req1.
2063 	 *
2064 	 * Our goal then is to point each port to the end of a consecutive
2065 	 * sequence of requests as being the most optimal (fewest wake ups
2066 	 * and context switches) submission.
2067 	 */
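	/*
	 * Worked example (illustrative): if req1 and req2 come from context A
	 * and req3 from context B, port[0] points at req2 (a single RING_TAIL
	 * update covering both A requests) and port[1] at req3.
	 */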
2068 
2069 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2070 		struct virtual_engine *ve =
2071 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2072 		struct i915_request *rq = READ_ONCE(ve->request);
2073 
2074 		if (!rq) { /* lazily cleanup after another engine handled rq */
2075 			rb_erase_cached(rb, &execlists->virtual);
2076 			RB_CLEAR_NODE(rb);
2077 			rb = rb_first_cached(&execlists->virtual);
2078 			continue;
2079 		}
2080 
2081 		if (!virtual_matches(ve, rq, engine)) {
2082 			rb = rb_next(rb);
2083 			continue;
2084 		}
2085 
2086 		break;
2087 	}
2088 
2089 	/*
2090 	 * If the queue is higher priority than the last
2091 	 * request in the currently active context, submit afresh.
2092 	 * We will resubmit again afterwards in case we need to split
2093 	 * the active context to interject the preemption request,
2094 	 * i.e. we will retrigger preemption following the ack in case
2095 	 * of trouble.
2096 	 */
2097 	active = READ_ONCE(execlists->active);
2098 
2099 	/*
2100 	 * In theory we can skip over completed contexts that have not
2101 	 * yet been processed by events (as those events are in flight):
2102 	 *
2103 	 * while ((last = *active) && i915_request_completed(last))
2104 	 *	active++;
2105 	 *
2106 	 * However, the GPU cannot handle this as it will ultimately
2107 	 * find itself trying to jump back into a context it has just
2108 	 * completed and barf.
2109 	 */
2110 
2111 	if ((last = *active)) {
2112 		if (need_preempt(engine, last, rb)) {
2113 			if (i915_request_completed(last)) {
2114 				tasklet_hi_schedule(&execlists->tasklet);
2115 				return;
2116 			}
2117 
2118 			ENGINE_TRACE(engine,
2119 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2120 				     last->fence.context,
2121 				     last->fence.seqno,
2122 				     last->sched.attr.priority,
2123 				     execlists->queue_priority_hint);
2124 			record_preemption(execlists);
2125 
2126 			/*
2127 			 * Don't let the RING_HEAD advance past the breadcrumb
2128 			 * as we unwind (and until we resubmit) so that we do
2129 			 * not accidentally tell it to go backwards.
2130 			 */
2131 			ring_set_paused(engine, 1);
2132 
2133 			/*
2134 			 * Note that we have not stopped the GPU at this point,
2135 			 * so we are unwinding the incomplete requests as they
2136 			 * remain inflight and so by the time we do complete
2137 			 * the preemption, some of the unwound requests may
2138 			 * complete!
2139 			 */
2140 			__unwind_incomplete_requests(engine);
2141 
2142 			last = NULL;
2143 		} else if (need_timeslice(engine, last) &&
2144 			   timeslice_expired(execlists, last)) {
2145 			if (i915_request_completed(last)) {
2146 				tasklet_hi_schedule(&execlists->tasklet);
2147 				return;
2148 			}
2149 
2150 			ENGINE_TRACE(engine,
2151 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2152 				     last->fence.context,
2153 				     last->fence.seqno,
2154 				     last->sched.attr.priority,
2155 				     execlists->queue_priority_hint,
2156 				     yesno(timeslice_yield(execlists, last)));
2157 
2158 			ring_set_paused(engine, 1);
2159 			defer_active(engine);
2160 
2161 			/*
2162 			 * Unlike for preemption, if we rewind and continue
2163 			 * executing the same context as previously active,
2164 			 * the order of execution will remain the same and
2165 			 * the tail will only advance. We do not need to
2166 			 * force a full context restore, as a lite-restore
2167 			 * is sufficient to resample the monotonic TAIL.
2168 			 *
2169 			 * If we switch to any other context, similarly we
2170 			 * will not rewind TAIL of current context, and
2171 			 * normal save/restore will preserve state and allow
2172 			 * us to later continue executing the same request.
2173 			 */
2174 			last = NULL;
2175 		} else {
2176 			/*
2177 			 * Otherwise if we already have a request pending
2178 			 * for execution after the current one, we can
2179 			 * just wait until the next CS event before
2180 			 * queuing more. In either case we will force a
2181 			 * lite-restore preemption event, but if we wait
2182 			 * we hopefully coalesce several updates into a single
2183 			 * submission.
2184 			 */
2185 			if (!list_is_last(&last->sched.link,
2186 					  &engine->active.requests)) {
2187 				/*
2188 				 * Even if ELSP[1] is occupied and not worthy
2189 				 * of timeslices, our queue might be.
2190 				 */
2191 				start_timeslice(engine);
2192 				return;
2193 			}
2194 		}
2195 	}
2196 
2197 	while (rb) { /* XXX virtual is always taking precedence */
2198 		struct virtual_engine *ve =
2199 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2200 		struct i915_request *rq;
2201 
2202 		spin_lock(&ve->base.active.lock);
2203 
2204 		rq = ve->request;
2205 		if (unlikely(!rq)) { /* lost the race to a sibling */
2206 			spin_unlock(&ve->base.active.lock);
2207 			rb_erase_cached(rb, &execlists->virtual);
2208 			RB_CLEAR_NODE(rb);
2209 			rb = rb_first_cached(&execlists->virtual);
2210 			continue;
2211 		}
2212 
2213 		GEM_BUG_ON(rq != ve->request);
2214 		GEM_BUG_ON(rq->engine != &ve->base);
2215 		GEM_BUG_ON(rq->context != &ve->context);
2216 
2217 		if (rq_prio(rq) >= queue_prio(execlists)) {
2218 			if (!virtual_matches(ve, rq, engine)) {
2219 				spin_unlock(&ve->base.active.lock);
2220 				rb = rb_next(rb);
2221 				continue;
2222 			}
2223 
2224 			if (last && !can_merge_rq(last, rq)) {
2225 				spin_unlock(&ve->base.active.lock);
2226 				start_timeslice(engine);
2227 				return; /* leave this for another sibling */
2228 			}
2229 
2230 			ENGINE_TRACE(engine,
2231 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2232 				     rq->fence.context,
2233 				     rq->fence.seqno,
2234 				     i915_request_completed(rq) ? "!" :
2235 				     i915_request_started(rq) ? "*" :
2236 				     "",
2237 				     yesno(engine != ve->siblings[0]));
2238 
2239 			WRITE_ONCE(ve->request, NULL);
2240 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2241 				   INT_MIN);
2242 			rb_erase_cached(rb, &execlists->virtual);
2243 			RB_CLEAR_NODE(rb);
2244 
2245 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2246 			WRITE_ONCE(rq->engine, engine);
2247 
2248 			if (engine != ve->siblings[0]) {
2249 				u32 *regs = ve->context.lrc_reg_state;
2250 				unsigned int n;
2251 
2252 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2253 
2254 				if (!intel_engine_has_relative_mmio(engine))
2255 					virtual_update_register_offsets(regs,
2256 									engine);
2257 
2258 				if (!list_empty(&ve->context.signals))
2259 					virtual_xfer_breadcrumbs(ve);
2260 
2261 				/*
2262 				 * Move the bound engine to the top of the list
2263 				 * for future execution. We then kick this
2264 				 * tasklet first before checking others, so that
2265 				 * we preferentially reuse this set of bound
2266 				 * registers.
2267 				 */
2268 				for (n = 1; n < ve->num_siblings; n++) {
2269 					if (ve->siblings[n] == engine) {
2270 						swap(ve->siblings[n],
2271 						     ve->siblings[0]);
2272 						break;
2273 					}
2274 				}
2275 
2276 				GEM_BUG_ON(ve->siblings[0] != engine);
2277 			}
2278 
2279 			if (__i915_request_submit(rq)) {
2280 				submit = true;
2281 				last = rq;
2282 			}
2283 			i915_request_put(rq);
2284 
2285 			/*
2286 			 * Hmm, we have a bunch of virtual engine requests,
2287 			 * but the first one was already completed (thanks
2288 			 * preempt-to-busy!). Keep looking at the veng queue
2289 			 * until we have no more relevant requests (i.e.
2290 			 * the normal submit queue has higher priority).
2291 			 */
2292 			if (!submit) {
2293 				spin_unlock(&ve->base.active.lock);
2294 				rb = rb_first_cached(&execlists->virtual);
2295 				continue;
2296 			}
2297 		}
2298 
2299 		spin_unlock(&ve->base.active.lock);
2300 		break;
2301 	}
2302 
2303 	while ((rb = rb_first_cached(&execlists->queue))) {
2304 		struct i915_priolist *p = to_priolist(rb);
2305 		struct i915_request *rq, *rn;
2306 		int i;
2307 
2308 		priolist_for_each_request_consume(rq, rn, p, i) {
2309 			bool merge = true;
2310 
2311 			/*
2312 			 * Can we combine this request with the current port?
2313 			 * It has to be the same context/ringbuffer and not
2314 			 * have any exceptions (e.g. GVT saying never to
2315 			 * combine contexts).
2316 			 *
2317 			 * If we can combine the requests, we can execute both
2318 			 * by updating the RING_TAIL to point to the end of the
2319 			 * second request, and so we never need to tell the
2320 			 * hardware about the first.
2321 			 */
2322 			if (last && !can_merge_rq(last, rq)) {
2323 				/*
2324 				 * If we are on the second port and cannot
2325 				 * combine this request with the last, then we
2326 				 * are done.
2327 				 */
2328 				if (port == last_port)
2329 					goto done;
2330 
2331 				/*
2332 				 * We must not populate both ELSP[] with the
2333 				 * same LRCA, i.e. we must submit 2 different
2334 				 * contexts if we submit 2 ELSP.
2335 				 */
2336 				if (last->context == rq->context)
2337 					goto done;
2338 
2339 				if (i915_request_has_sentinel(last))
2340 					goto done;
2341 
2342 				/*
2343 				 * If GVT overrides us we only ever submit
2344 				 * port[0], leaving port[1] empty. Note that we
2345 				 * also have to be careful that we don't queue
2346 				 * the same context (even though a different
2347 				 * request) to the second port.
2348 				 */
2349 				if (ctx_single_port_submission(last->context) ||
2350 				    ctx_single_port_submission(rq->context))
2351 					goto done;
2352 
2353 				merge = false;
2354 			}
2355 
2356 			if (__i915_request_submit(rq)) {
2357 				if (!merge) {
2358 					*port = execlists_schedule_in(last, port - execlists->pending);
2359 					port++;
2360 					last = NULL;
2361 				}
2362 
2363 				GEM_BUG_ON(last &&
2364 					   !can_merge_ctx(last->context,
2365 							  rq->context));
2366 				GEM_BUG_ON(last &&
2367 					   i915_seqno_passed(last->fence.seqno,
2368 							     rq->fence.seqno));
2369 
2370 				submit = true;
2371 				last = rq;
2372 			}
2373 		}
2374 
2375 		rb_erase_cached(&p->node, &execlists->queue);
2376 		i915_priolist_free(p);
2377 	}
2378 
2379 done:
2380 	/*
2381 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2382 	 *
2383 	 * We choose the priority hint such that if we add a request of greater
2384 	 * priority than this, we kick the submission tasklet to decide on
2385 	 * the right order of submitting the requests to hardware. We must
2386 	 * also be prepared to reorder requests as they are in-flight on the
2387 	 * HW. We derive the priority hint then as the first "hole" in
2388 	 * the HW submission ports and if there are no available slots,
2389 	 * the priority of the lowest executing request, i.e. last.
2390 	 *
2391 	 * When we do receive a higher priority request ready to run from the
2392 	 * user, see queue_request(), the priority hint is bumped to that
2393 	 * request triggering preemption on the next dequeue (or subsequent
2394 	 * interrupt for secondary ports).
2395 	 */
2396 	execlists->queue_priority_hint = queue_prio(execlists);
2397 
2398 	if (submit) {
2399 		*port = execlists_schedule_in(last, port - execlists->pending);
2400 		execlists->switch_priority_hint =
2401 			switch_prio(engine, *execlists->pending);
2402 
2403 		/*
2404 		 * Skip if we ended up with exactly the same set of requests,
2405 		 * e.g. trying to timeslice a pair of ordered contexts
2406 		 */
2407 		if (!memcmp(active, execlists->pending,
2408 			    (port - execlists->pending + 1) * sizeof(*port))) {
2409 			do
2410 				execlists_schedule_out(fetch_and_zero(port));
2411 			while (port-- != execlists->pending);
2412 
2413 			goto skip_submit;
2414 		}
2415 		clear_ports(port + 1, last_port - port);
2416 
2417 		WRITE_ONCE(execlists->yield, -1);
2418 		set_preempt_timeout(engine, *active);
2419 		execlists_submit_ports(engine);
2420 	} else {
2421 skip_submit:
2422 		ring_set_paused(engine, 0);
2423 	}
2424 }
2425 
2426 static void
2427 cancel_port_requests(struct intel_engine_execlists * const execlists)
2428 {
2429 	struct i915_request * const *port;
2430 
2431 	for (port = execlists->pending; *port; port++)
2432 		execlists_schedule_out(*port);
2433 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2434 
2435 	/* Mark the end of active before we overwrite *active */
2436 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2437 		execlists_schedule_out(*port);
2438 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2439 
2440 	smp_wmb(); /* complete the seqlock for execlists_active() */
2441 	WRITE_ONCE(execlists->active, execlists->inflight);
2442 }
2443 
2444 static inline void
2445 invalidate_csb_entries(const u32 *first, const u32 *last)
2446 {
2447 	clflush((void *)first);
2448 	clflush((void *)last);
2449 }
2450 
2451 /*
2452  * Starting with Gen12, the status has a new format:
2453  *
2454  *     bit  0:     switched to new queue
2455  *     bit  1:     reserved
2456  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2457  *                 switch detail is set to "wait on semaphore"
2458  *     bits 3-5:   engine class
2459  *     bits 6-11:  engine instance
2460  *     bits 12-14: reserved
2461  *     bits 15-25: sw context id of the lrc the GT switched to
2462  *     bits 26-31: sw counter of the lrc the GT switched to
2463  *     bits 32-35: context switch detail
2464  *                  - 0: ctx complete
2465  *                  - 1: wait on sync flip
2466  *                  - 2: wait on vblank
2467  *                  - 3: wait on scanline
2468  *                  - 4: wait on semaphore
2469  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2470  *                       WAIT_FOR_EVENT)
2471  *     bit  36:    reserved
2472  *     bits 37-43: wait detail (for switch detail 1 to 4)
2473  *     bits 44-46: reserved
2474  *     bits 47-57: sw context id of the lrc the GT switched away from
2475  *     bits 58-63: sw counter of the lrc the GT switched away from
2476  */
2477 static inline bool
2478 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2479 {
2480 	u32 lower_dw = csb[0];
2481 	u32 upper_dw = csb[1];
2482 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2483 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2484 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2485 
2486 	/*
2487 	 * The context switch detail is not guaranteed to be 5 when a preemption
2488 	 * occurs, so we can't just check for that. The check below works for
2489 	 * all the cases we care about, including preemptions of WAIT
2490 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2491 	 * would require some extra handling, but we don't support that.
2492 	 */
2493 	if (!ctx_away_valid || new_queue) {
2494 		GEM_BUG_ON(!ctx_to_valid);
2495 		return true;
2496 	}
2497 
2498 	/*
2499 	 * switch detail = 5 is covered by the case above and we do not expect a
2500 	 * context switch on an unsuccessful wait instruction since we always
2501 	 * use polling mode.
2502 	 */
2503 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2504 	return false;
2505 }
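/*
 * Informally: a plain context-completion event (port0 finished, the HW
 * advanced to port1) reports a valid outgoing context without the
 * "switched to new queue" bit, so we do not promote and merely advance
 * execlists->active; an ELSP submission ack (idle-to-active, preemption or
 * lite-restore switching to a new queue) does promote the pending set.
 */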
2506 
2507 static inline bool
2508 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2509 {
2510 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2511 }
2512 
2513 static void process_csb(struct intel_engine_cs *engine)
2514 {
2515 	struct intel_engine_execlists * const execlists = &engine->execlists;
2516 	const u32 * const buf = execlists->csb_status;
2517 	const u8 num_entries = execlists->csb_size;
2518 	u8 head, tail;
2519 
2520 	/*
2521 	 * As we modify our execlists state tracking we require exclusive
2522 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2523 	 * and we assume that is only inside the reset paths and so serialised.
2524 	 */
2525 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2526 		   !reset_in_progress(execlists));
2527 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2528 
2529 	/*
2530 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2531 	 * When reading from the csb_write mmio register, we have to be
2532 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2533 	 * the low 4 bits. As it happens we know the next 4 bits are always
2534 	 * zero and so we can simply mask off the low u8 of the register
2535 	 * and treat it identically to reading from the HWSP (without having
2536 	 * to use explicit shifting and masking, and probably bifurcating
2537 	 * the code to handle the legacy mmio read).
2538 	 */
2539 	head = execlists->csb_head;
2540 	tail = READ_ONCE(*execlists->csb_write);
2541 	if (unlikely(head == tail))
2542 		return;
2543 
2544 	/*
2545 	 * Hopefully paired with a wmb() in HW!
2546 	 *
2547 	 * We must complete the read of the write pointer before any reads
2548 	 * from the CSB, so that we do not see stale values. Without an rmb
2549 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2550 	 * we perform the READ_ONCE(*csb_write).
2551 	 */
2552 	rmb();
2553 
2554 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2555 	do {
2556 		bool promote;
2557 
2558 		if (++head == num_entries)
2559 			head = 0;
2560 
2561 		/*
2562 		 * We are flying near dragons again.
2563 		 *
2564 		 * We hold a reference to the request in execlist_port[]
2565 		 * but no more than that. We are operating in softirq
2566 		 * context and so cannot hold any mutex or sleep. That
2567 		 * prevents us stopping the requests we are processing
2568 		 * in port[] from being retired simultaneously (the
2569 		 * breadcrumb will be complete before we see the
2570 		 * context-switch). As we only hold the reference to the
2571 		 * request, any pointer chasing underneath the request
2572 		 * is subject to a potential use-after-free. Thus we
2573 		 * store all of the bookkeeping within port[] as
2574 		 * required, and avoid using unguarded pointers beneath
2575 		 * request itself. The same applies to the atomic
2576 		 * status notifier.
2577 		 */
2578 
2579 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2580 			     head, buf[2 * head + 0], buf[2 * head + 1]);
2581 
2582 		if (INTEL_GEN(engine->i915) >= 12)
2583 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2584 		else
2585 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2586 		if (promote) {
2587 			struct i915_request * const *old = execlists->active;
2588 
2589 			ring_set_paused(engine, 0);
2590 
2591 			/* Point active to the new ELSP; prevent overwriting */
2592 			WRITE_ONCE(execlists->active, execlists->pending);
2593 			smp_wmb(); /* notify execlists_active() */
2594 
2595 			/* cancel old inflight, prepare for switch */
2596 			trace_ports(execlists, "preempted", old);
2597 			while (*old)
2598 				execlists_schedule_out(*old++);
2599 
2600 			/* switch pending to inflight */
2601 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2602 			memcpy(execlists->inflight,
2603 			       execlists->pending,
2604 			       execlists_num_ports(execlists) *
2605 			       sizeof(*execlists->pending));
2606 			smp_wmb(); /* complete the seqlock */
2607 			WRITE_ONCE(execlists->active, execlists->inflight);
2608 
2609 			WRITE_ONCE(execlists->pending[0], NULL);
2610 		} else {
2611 			GEM_BUG_ON(!*execlists->active);
2612 
2613 			/* port0 completed, advanced to port1 */
2614 			trace_ports(execlists, "completed", execlists->active);
2615 
2616 			/*
2617 			 * We rely on the hardware being strongly
2618 			 * ordered, that the breadcrumb write is
2619 			 * coherent (visible from the CPU) before the
2620 			 * user interrupt is processed. Since the breadcrumb
2621 			 * write is emitted before both the user interrupt
2622 			 * and the CS event for the context switch, one
2623 			 * would expect it to be visible before the CS event
2624 			 * itself...
2625 			 */
2626 			if (GEM_SHOW_DEBUG() &&
2627 			    !i915_request_completed(*execlists->active)) {
2628 				struct i915_request *rq = *execlists->active;
2629 				const u32 *regs __maybe_unused =
2630 					rq->context->lrc_reg_state;
2631 
2632 				ENGINE_TRACE(engine,
2633 					     "context completed before request!\n");
2634 				ENGINE_TRACE(engine,
2635 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2636 					     ENGINE_READ(engine, RING_START),
2637 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2638 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2639 					     ENGINE_READ(engine, RING_CTL),
2640 					     ENGINE_READ(engine, RING_MI_MODE));
2641 				ENGINE_TRACE(engine,
2642 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2643 					     i915_ggtt_offset(rq->ring->vma),
2644 					     rq->head, rq->tail,
2645 					     rq->fence.context,
2646 					     lower_32_bits(rq->fence.seqno),
2647 					     hwsp_seqno(rq));
2648 				ENGINE_TRACE(engine,
2649 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2650 					     regs[CTX_RING_START],
2651 					     regs[CTX_RING_HEAD],
2652 					     regs[CTX_RING_TAIL]);
2653 			}
2654 
2655 			execlists_schedule_out(*execlists->active++);
2656 
2657 			GEM_BUG_ON(execlists->active - execlists->inflight >
2658 				   execlists_num_ports(execlists));
2659 		}
2660 	} while (head != tail);
2661 
2662 	execlists->csb_head = head;
2663 	set_timeslice(engine);
2664 
2665 	/*
2666 	 * Gen11 has proven to fail wrt global observation point between
2667 	 * entry and tail update, failing on the ordering and thus
2668 	 * we see an old entry in the context status buffer.
2669 	 *
2670 	 * Forcibly evict out entries for the next gpu csb update,
2671 	 * to increase the odds that we get fresh entries even on non-working
2672 	 * hardware. The cost of doing so mostly comes out in the wash, as the
2673 	 * hardware, working or not, will need to do the invalidation
2674 	 * beforehand anyway.
2675 	 */
2676 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2677 }
2678 
2679 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2680 {
2681 	lockdep_assert_held(&engine->active.lock);
2682 	if (!READ_ONCE(engine->execlists.pending[0])) {
2683 		rcu_read_lock(); /* protect peeking at execlists->active */
2684 		execlists_dequeue(engine);
2685 		rcu_read_unlock();
2686 	}
2687 }
2688 
2689 static void __execlists_hold(struct i915_request *rq)
2690 {
2691 	LIST_HEAD(list);
2692 
2693 	do {
2694 		struct i915_dependency *p;
2695 
2696 		if (i915_request_is_active(rq))
2697 			__i915_request_unsubmit(rq);
2698 
2699 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2700 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2701 		i915_request_set_hold(rq);
2702 		RQ_TRACE(rq, "on hold\n");
2703 
2704 		for_each_waiter(p, rq) {
2705 			struct i915_request *w =
2706 				container_of(p->waiter, typeof(*w), sched);
2707 
2708 			/* Leave semaphores spinning on the other engines */
2709 			if (w->engine != rq->engine)
2710 				continue;
2711 
2712 			if (!i915_request_is_ready(w))
2713 				continue;
2714 
2715 			if (i915_request_completed(w))
2716 				continue;
2717 
2718 			if (i915_request_on_hold(w))
2719 				continue;
2720 
2721 			list_move_tail(&w->sched.link, &list);
2722 		}
2723 
2724 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2725 	} while (rq);
2726 }
2727 
2728 static bool execlists_hold(struct intel_engine_cs *engine,
2729 			   struct i915_request *rq)
2730 {
2731 	spin_lock_irq(&engine->active.lock);
2732 
2733 	if (i915_request_completed(rq)) { /* too late! */
2734 		rq = NULL;
2735 		goto unlock;
2736 	}
2737 
2738 	if (rq->engine != engine) { /* preempted virtual engine */
2739 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2740 
2741 		/*
2742 		 * intel_context_inflight() is only protected by virtue
2743 		 * of process_csb() being called only by the tasklet (or
2744 		 * directly from inside reset while the tasklet is suspended).
2745 		 * Assert that neither of those are allowed to run while we
2746 		 * poke at the request queues.
2747 		 */
2748 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2749 
2750 		/*
2751 		 * An unsubmitted request along a virtual engine will
2752 		 * remain on the active (this) engine until we are able
2753 		 * to process the context switch away (and so mark the
2754 		 * context as no longer in flight). That cannot have happened
2755 		 * yet, otherwise we would not be hanging!
2756 		 */
2757 		spin_lock(&ve->base.active.lock);
2758 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2759 		GEM_BUG_ON(ve->request != rq);
2760 		ve->request = NULL;
2761 		spin_unlock(&ve->base.active.lock);
2762 		i915_request_put(rq);
2763 
2764 		rq->engine = engine;
2765 	}
2766 
2767 	/*
2768 	 * Transfer this request onto the hold queue to prevent it
2769 	 * being resubmitted to HW (and potentially completed) before we have
2770 	 * released it. Since we may have already submitted following
2771 	 * requests, we need to remove those as well.
2772 	 */
2773 	GEM_BUG_ON(i915_request_on_hold(rq));
2774 	GEM_BUG_ON(rq->engine != engine);
2775 	__execlists_hold(rq);
2776 	GEM_BUG_ON(list_empty(&engine->active.hold));
2777 
2778 unlock:
2779 	spin_unlock_irq(&engine->active.lock);
2780 	return rq;
2781 }
2782 
2783 static bool hold_request(const struct i915_request *rq)
2784 {
2785 	struct i915_dependency *p;
2786 	bool result = false;
2787 
2788 	/*
2789 	 * If one of our ancestors is on hold, we must also be on hold,
2790 	 * otherwise we will bypass it and execute before it.
2791 	 */
2792 	rcu_read_lock();
2793 	for_each_signaler(p, rq) {
2794 		const struct i915_request *s =
2795 			container_of(p->signaler, typeof(*s), sched);
2796 
2797 		if (s->engine != rq->engine)
2798 			continue;
2799 
2800 		result = i915_request_on_hold(s);
2801 		if (result)
2802 			break;
2803 	}
2804 	rcu_read_unlock();
2805 
2806 	return result;
2807 }
2808 
2809 static void __execlists_unhold(struct i915_request *rq)
2810 {
2811 	LIST_HEAD(list);
2812 
2813 	do {
2814 		struct i915_dependency *p;
2815 
2816 		RQ_TRACE(rq, "hold release\n");
2817 
2818 		GEM_BUG_ON(!i915_request_on_hold(rq));
2819 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2820 
2821 		i915_request_clear_hold(rq);
2822 		list_move_tail(&rq->sched.link,
2823 			       i915_sched_lookup_priolist(rq->engine,
2824 							  rq_prio(rq)));
2825 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2826 
2827 		/* Also release any children on this engine that are ready */
2828 		for_each_waiter(p, rq) {
2829 			struct i915_request *w =
2830 				container_of(p->waiter, typeof(*w), sched);
2831 
2832 			/* Propagate any change in error status */
2833 			if (rq->fence.error)
2834 				i915_request_set_error_once(w, rq->fence.error);
2835 
2836 			if (w->engine != rq->engine)
2837 				continue;
2838 
2839 			if (!i915_request_on_hold(w))
2840 				continue;
2841 
2842 			/* Check that no other parents are also on hold */
2843 			if (hold_request(w))
2844 				continue;
2845 
2846 			list_move_tail(&w->sched.link, &list);
2847 		}
2848 
2849 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2850 	} while (rq);
2851 }
2852 
2853 static void execlists_unhold(struct intel_engine_cs *engine,
2854 			     struct i915_request *rq)
2855 {
2856 	spin_lock_irq(&engine->active.lock);
2857 
2858 	/*
2859 	 * Move this request back to the priority queue, and all of its
2860 	 * children and grandchildren that were suspended along with it.
2861 	 */
2862 	__execlists_unhold(rq);
2863 
2864 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2865 		engine->execlists.queue_priority_hint = rq_prio(rq);
2866 		tasklet_hi_schedule(&engine->execlists.tasklet);
2867 	}
2868 
2869 	spin_unlock_irq(&engine->active.lock);
2870 }
2871 
2872 struct execlists_capture {
2873 	struct work_struct work;
2874 	struct i915_request *rq;
2875 	struct i915_gpu_coredump *error;
2876 };
2877 
2878 static void execlists_capture_work(struct work_struct *work)
2879 {
2880 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2881 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2882 	struct intel_engine_cs *engine = cap->rq->engine;
2883 	struct intel_gt_coredump *gt = cap->error->gt;
2884 	struct intel_engine_capture_vma *vma;
2885 
2886 	/* Compress all the objects attached to the request, slow! */
2887 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2888 	if (vma) {
2889 		struct i915_vma_compress *compress =
2890 			i915_vma_capture_prepare(gt);
2891 
2892 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2893 		i915_vma_capture_finish(gt, compress);
2894 	}
2895 
2896 	gt->simulated = gt->engine->simulated;
2897 	cap->error->simulated = gt->simulated;
2898 
2899 	/* Publish the error state, and announce it to the world */
2900 	i915_error_state_store(cap->error);
2901 	i915_gpu_coredump_put(cap->error);
2902 
2903 	/* Return this request and all that depend upon it for signaling */
2904 	execlists_unhold(engine, cap->rq);
2905 	i915_request_put(cap->rq);
2906 
2907 	kfree(cap);
2908 }
2909 
2910 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2911 {
2912 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2913 	struct execlists_capture *cap;
2914 
2915 	cap = kmalloc(sizeof(*cap), gfp);
2916 	if (!cap)
2917 		return NULL;
2918 
2919 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2920 	if (!cap->error)
2921 		goto err_cap;
2922 
2923 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2924 	if (!cap->error->gt)
2925 		goto err_gpu;
2926 
2927 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2928 	if (!cap->error->gt->engine)
2929 		goto err_gt;
2930 
2931 	return cap;
2932 
2933 err_gt:
2934 	kfree(cap->error->gt);
2935 err_gpu:
2936 	kfree(cap->error);
2937 err_cap:
2938 	kfree(cap);
2939 	return NULL;
2940 }
2941 
2942 static struct i915_request *
2943 active_context(struct intel_engine_cs *engine, u32 ccid)
2944 {
2945 	const struct intel_engine_execlists * const el = &engine->execlists;
2946 	struct i915_request * const *port, *rq;
2947 
2948 	/*
2949 	 * Use the most recent result from process_csb(), but just in case
2950 	 * we trigger an error (via interrupt) before the first CS event has
2951 	 * been written, peek at the next submission.
2952 	 */
2953 
2954 	for (port = el->active; (rq = *port); port++) {
2955 		if (rq->context->lrc.ccid == ccid) {
2956 			ENGINE_TRACE(engine,
2957 				     "ccid found at active:%zd\n",
2958 				     port - el->active);
2959 			return rq;
2960 		}
2961 	}
2962 
2963 	for (port = el->pending; (rq = *port); port++) {
2964 		if (rq->context->lrc.ccid == ccid) {
2965 			ENGINE_TRACE(engine,
2966 				     "ccid found at pending:%zd\n",
2967 				     port - el->pending);
2968 			return rq;
2969 		}
2970 	}
2971 
2972 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
2973 	return NULL;
2974 }
2975 
2976 static u32 active_ccid(struct intel_engine_cs *engine)
2977 {
2978 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
2979 }
2980 
2981 static bool execlists_capture(struct intel_engine_cs *engine)
2982 {
2983 	struct execlists_capture *cap;
2984 
2985 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2986 		return true;
2987 
2988 	/*
2989 	 * We need to _quickly_ capture the engine state before we reset.
2990 	 * We are inside an atomic section (softirq) here and we are delaying
2991 	 * the forced preemption event.
2992 	 */
2993 	cap = capture_regs(engine);
2994 	if (!cap)
2995 		return true;
2996 
2997 	spin_lock_irq(&engine->active.lock);
2998 	cap->rq = active_context(engine, active_ccid(engine));
2999 	if (cap->rq) {
3000 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3001 		cap->rq = i915_request_get_rcu(cap->rq);
3002 	}
3003 	spin_unlock_irq(&engine->active.lock);
3004 	if (!cap->rq)
3005 		goto err_free;
3006 
3007 	/*
3008 	 * Remove the request from the execlists queue, and take ownership
3009 	 * of the request. We pass it to our worker who will _slowly_ compress
3010 	 * all the pages the _user_ requested for debugging their batch, after
3011 	 * which we return it to the queue for signaling.
3012 	 *
3013 	 * By removing them from the execlists queue, we also remove the
3014 	 * requests from being processed by __unwind_incomplete_requests()
3015 	 * during the intel_engine_reset(), and so they will *not* be replayed
3016 	 * afterwards.
3017 	 *
3018 	 * Note that because we have not yet reset the engine at this point,
3019 	 * it is possible that the request we have identified as being
3020 	 * guilty did in fact complete and we will then hit an arbitration
3021 	 * point allowing the outstanding preemption to succeed. The likelihood
3022 	 * of that is very low (as capturing of the engine registers should be
3023 	 * fast enough to run inside an irq-off atomic section!), so we will
3024 	 * simply hold that request accountable for being non-preemptible
3025 	 * long enough to force the reset.
3026 	 */
3027 	if (!execlists_hold(engine, cap->rq))
3028 		goto err_rq;
3029 
3030 	INIT_WORK(&cap->work, execlists_capture_work);
3031 	schedule_work(&cap->work);
3032 	return true;
3033 
3034 err_rq:
3035 	i915_request_put(cap->rq);
3036 err_free:
3037 	i915_gpu_coredump_put(cap->error);
3038 	kfree(cap);
3039 	return false;
3040 }
3041 
3042 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3043 {
3044 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3045 	unsigned long *lock = &engine->gt->reset.flags;
3046 
3047 	if (!intel_has_reset_engine(engine->gt))
3048 		return;
3049 
3050 	if (test_and_set_bit(bit, lock))
3051 		return;
3052 
3053 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3054 
3055 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3056 	tasklet_disable_nosync(&engine->execlists.tasklet);
3057 
3058 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3059 	if (execlists_capture(engine))
3060 		intel_engine_reset(engine, msg);
3061 	else
3062 		ring_set_paused(engine, 0);
3063 
3064 	tasklet_enable(&engine->execlists.tasklet);
3065 	clear_and_wake_up_bit(bit, lock);
3066 }
3067 
3068 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3069 {
3070 	const struct timer_list *t = &engine->execlists.preempt;
3071 
3072 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3073 		return false;
3074 
3075 	if (!timer_expired(t))
3076 		return false;
3077 
3078 	return READ_ONCE(engine->execlists.pending[0]);
3079 }
3080 
3081 /*
3082  * Check the unread Context Status Buffers and manage the submission of new
3083  * contexts to the ELSP accordingly.
3084  */
3085 static void execlists_submission_tasklet(unsigned long data)
3086 {
3087 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3088 	bool timeout = preempt_timeout(engine);
3089 
3090 	process_csb(engine);
3091 
3092 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3093 		engine->execlists.error_interrupt = 0;
3094 		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
3095 			execlists_reset(engine, "CS error");
3096 	}
3097 
3098 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3099 		unsigned long flags;
3100 
3101 		spin_lock_irqsave(&engine->active.lock, flags);
3102 		__execlists_submission_tasklet(engine);
3103 		spin_unlock_irqrestore(&engine->active.lock, flags);
3104 
3105 		/* Recheck after serialising with direct-submission */
3106 		if (unlikely(timeout && preempt_timeout(engine)))
3107 			execlists_reset(engine, "preemption time out");
3108 	}
3109 }
3110 
3111 static void __execlists_kick(struct intel_engine_execlists *execlists)
3112 {
3113 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3114 	tasklet_hi_schedule(&execlists->tasklet);
3115 }
3116 
3117 #define execlists_kick(t, member) \
3118 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3119 
3120 static void execlists_timeslice(struct timer_list *timer)
3121 {
3122 	execlists_kick(timer, timer);
3123 }
3124 
3125 static void execlists_preempt(struct timer_list *timer)
3126 {
3127 	execlists_kick(timer, preempt);
3128 }
3129 
3130 static void queue_request(struct intel_engine_cs *engine,
3131 			  struct i915_request *rq)
3132 {
3133 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3134 	list_add_tail(&rq->sched.link,
3135 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3136 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3137 }
3138 
3139 static void __submit_queue_imm(struct intel_engine_cs *engine)
3140 {
3141 	struct intel_engine_execlists * const execlists = &engine->execlists;
3142 
3143 	if (reset_in_progress(execlists))
3144 		return; /* defer until we restart the engine following reset */
3145 
3146 	/* Hopefully we clear execlists->pending[] to let us through */
3147 	if (READ_ONCE(execlists->pending[0]) &&
3148 	    tasklet_trylock(&execlists->tasklet)) {
3149 		process_csb(engine);
3150 		tasklet_unlock(&execlists->tasklet);
3151 	}
3152 
3153 	__execlists_submission_tasklet(engine);
3154 }
3155 
3156 static void submit_queue(struct intel_engine_cs *engine,
3157 			 const struct i915_request *rq)
3158 {
3159 	struct intel_engine_execlists *execlists = &engine->execlists;
3160 
3161 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3162 		return;
3163 
3164 	execlists->queue_priority_hint = rq_prio(rq);
3165 	__submit_queue_imm(engine);
3166 }
3167 
3168 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3169 			     const struct i915_request *rq)
3170 {
3171 	GEM_BUG_ON(i915_request_on_hold(rq));
3172 	return !list_empty(&engine->active.hold) && hold_request(rq);
3173 }
3174 
3175 static void execlists_submit_request(struct i915_request *request)
3176 {
3177 	struct intel_engine_cs *engine = request->engine;
3178 	unsigned long flags;
3179 
3180 	/* Will be called from irq-context when using foreign fences. */
3181 	spin_lock_irqsave(&engine->active.lock, flags);
3182 
3183 	if (unlikely(ancestor_on_hold(engine, request))) {
3184 		RQ_TRACE(request, "ancestor on hold\n");
3185 		list_add_tail(&request->sched.link, &engine->active.hold);
3186 		i915_request_set_hold(request);
3187 	} else {
3188 		queue_request(engine, request);
3189 
3190 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3191 		GEM_BUG_ON(list_empty(&request->sched.link));
3192 
3193 		submit_queue(engine, request);
3194 	}
3195 
3196 	spin_unlock_irqrestore(&engine->active.lock, flags);
3197 }
3198 
3199 static void __execlists_context_fini(struct intel_context *ce)
3200 {
3201 	intel_ring_put(ce->ring);
3202 	i915_vma_put(ce->state);
3203 }
3204 
3205 static void execlists_context_destroy(struct kref *kref)
3206 {
3207 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3208 
3209 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3210 	GEM_BUG_ON(intel_context_is_pinned(ce));
3211 
3212 	if (ce->state)
3213 		__execlists_context_fini(ce);
3214 
3215 	intel_context_fini(ce);
3216 	intel_context_free(ce);
3217 }
3218 
3219 static void
3220 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3221 {
3222 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3223 		return;
3224 
3225 	vaddr += engine->context_size;
3226 
3227 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3228 }
3229 
3230 static void
3231 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3232 {
3233 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3234 		return;
3235 
3236 	vaddr += engine->context_size;
3237 
3238 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3239 		drm_err_once(&engine->i915->drm,
3240 			     "%s context redzone overwritten!\n",
3241 			     engine->name);
3242 }
3243 
3244 static void execlists_context_unpin(struct intel_context *ce)
3245 {
3246 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3247 		      ce->engine);
3248 
3249 	i915_gem_object_unpin_map(ce->state->obj);
3250 }
3251 
3252 static u32 *
3253 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3254 {
3255 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3256 		MI_SRM_LRM_GLOBAL_GTT |
3257 		MI_LRI_LRM_CS_MMIO;
3258 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3259 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3260 		CTX_TIMESTAMP * sizeof(u32);
3261 	*cs++ = 0;
3262 
3263 	*cs++ = MI_LOAD_REGISTER_REG |
3264 		MI_LRR_SOURCE_CS_MMIO |
3265 		MI_LRI_LRM_CS_MMIO;
3266 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3267 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3268 
3269 	*cs++ = MI_LOAD_REGISTER_REG |
3270 		MI_LRR_SOURCE_CS_MMIO |
3271 		MI_LRI_LRM_CS_MMIO;
3272 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3273 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3274 
3275 	return cs;
3276 }
3277 
3278 static u32 *
3279 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3280 {
3281 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3282 
3283 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3284 		MI_SRM_LRM_GLOBAL_GTT |
3285 		MI_LRI_LRM_CS_MMIO;
3286 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3287 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3288 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3289 	*cs++ = 0;
3290 
3291 	return cs;
3292 }
3293 
3294 static u32 *
3295 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3296 {
3297 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3298 
3299 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3300 		MI_SRM_LRM_GLOBAL_GTT |
3301 		MI_LRI_LRM_CS_MMIO;
3302 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3303 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3304 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3305 	*cs++ = 0;
3306 
3307 	*cs++ = MI_LOAD_REGISTER_REG |
3308 		MI_LRR_SOURCE_CS_MMIO |
3309 		MI_LRI_LRM_CS_MMIO;
3310 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3311 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3312 
3313 	return cs;
3314 }
3315 
3316 static u32 *
3317 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3318 {
3319 	cs = gen12_emit_timestamp_wa(ce, cs);
3320 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3321 	cs = gen12_emit_restore_scratch(ce, cs);
3322 
3323 	return cs;
3324 }
3325 
3326 static u32 *
3327 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3328 {
3329 	cs = gen12_emit_timestamp_wa(ce, cs);
3330 	cs = gen12_emit_restore_scratch(ce, cs);
3331 
3332 	return cs;
3333 }
3334 
3335 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3336 {
3337 	return PAGE_SIZE * ce->wa_bb_page;
3338 }
3339 
3340 static u32 *context_indirect_bb(const struct intel_context *ce)
3341 {
3342 	void *ptr;
3343 
3344 	GEM_BUG_ON(!ce->wa_bb_page);
3345 
3346 	ptr = ce->lrc_reg_state;
3347 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3348 	ptr += context_wa_bb_offset(ce);
3349 
3350 	return ptr;
3351 }
3352 
3353 static void
3354 setup_indirect_ctx_bb(const struct intel_context *ce,
3355 		      const struct intel_engine_cs *engine,
3356 		      u32 *(*emit)(const struct intel_context *, u32 *))
3357 {
3358 	u32 * const start = context_indirect_bb(ce);
3359 	u32 *cs;
3360 
3361 	cs = emit(ce, start);
3362 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3363 	while ((unsigned long)cs % CACHELINE_BYTES)
3364 		*cs++ = MI_NOOP;
3365 
3366 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3367 				    i915_ggtt_offset(ce->state) +
3368 				    context_wa_bb_offset(ce),
3369 				    (cs - start) * sizeof(*cs));
3370 }
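/*
 * For a Gen12 render context, for example, the buffer built above holds the
 * timestamp WA, the CMD_BUF_CCTL WA and the GPR0 scratch restore (see
 * gen12_emit_indirect_ctx_rcs()), padded with MI_NOOPs to a cacheline
 * boundary; its GGTT address (ce->state + context_wa_bb_offset(ce)) is then
 * written into the INDIRECT_CTX registers of the context image.
 */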
3371 
3372 static void
3373 __execlists_update_reg_state(const struct intel_context *ce,
3374 			     const struct intel_engine_cs *engine,
3375 			     u32 head)
3376 {
3377 	struct intel_ring *ring = ce->ring;
3378 	u32 *regs = ce->lrc_reg_state;
3379 
3380 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3381 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3382 
3383 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3384 	regs[CTX_RING_HEAD] = head;
3385 	regs[CTX_RING_TAIL] = ring->tail;
3386 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3387 
3388 	/* RPCS */
3389 	if (engine->class == RENDER_CLASS) {
3390 		regs[CTX_R_PWR_CLK_STATE] =
3391 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3392 
3393 		i915_oa_init_reg_state(ce, engine);
3394 	}
3395 
3396 	if (ce->wa_bb_page) {
3397 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3398 
3399 		fn = gen12_emit_indirect_ctx_xcs;
3400 		if (ce->engine->class == RENDER_CLASS)
3401 			fn = gen12_emit_indirect_ctx_rcs;
3402 
3403 		/* Mutually exclusive wrt to global indirect bb */
3404 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3405 		setup_indirect_ctx_bb(ce, engine, fn);
3406 	}
3407 }
3408 
3409 static int
3410 __execlists_context_pin(struct intel_context *ce,
3411 			struct intel_engine_cs *engine)
3412 {
3413 	void *vaddr;
3414 
3415 	GEM_BUG_ON(!ce->state);
3416 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3417 
3418 	vaddr = i915_gem_object_pin_map(ce->state->obj,
3419 					i915_coherent_map_type(engine->i915) |
3420 					I915_MAP_OVERRIDE);
3421 	if (IS_ERR(vaddr))
3422 		return PTR_ERR(vaddr);
3423 
3424 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3425 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3426 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3427 
3428 	return 0;
3429 }
3430 
3431 static int execlists_context_pin(struct intel_context *ce)
3432 {
3433 	return __execlists_context_pin(ce, ce->engine);
3434 }
3435 
3436 static int execlists_context_alloc(struct intel_context *ce)
3437 {
3438 	return __execlists_context_alloc(ce, ce->engine);
3439 }
3440 
3441 static void execlists_context_reset(struct intel_context *ce)
3442 {
3443 	CE_TRACE(ce, "reset\n");
3444 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3445 
3446 	intel_ring_reset(ce->ring, ce->ring->emit);
3447 
3448 	/* Scrub away the garbage */
3449 	execlists_init_reg_state(ce->lrc_reg_state,
3450 				 ce, ce->engine, ce->ring, true);
3451 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3452 
3453 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3454 }
3455 
3456 static const struct intel_context_ops execlists_context_ops = {
3457 	.alloc = execlists_context_alloc,
3458 
3459 	.pin = execlists_context_pin,
3460 	.unpin = execlists_context_unpin,
3461 
3462 	.enter = intel_context_enter_engine,
3463 	.exit = intel_context_exit_engine,
3464 
3465 	.reset = execlists_context_reset,
3466 	.destroy = execlists_context_destroy,
3467 };
3468 
3469 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3470 {
3471 	u32 *cs;
3472 
3473 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3474 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3475 		return 0;
3476 
3477 	cs = intel_ring_begin(rq, 6);
3478 	if (IS_ERR(cs))
3479 		return PTR_ERR(cs);
3480 
3481 	/*
3482 	 * Check if we have been preempted before we even get started.
3483 	 *
3484 	 * After this point i915_request_started() reports true, even if
3485 	 * we get preempted and so are no longer running.
3486 	 */
3487 	*cs++ = MI_ARB_CHECK;
3488 	*cs++ = MI_NOOP;
3489 
3490 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3491 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3492 	*cs++ = 0;
3493 	*cs++ = rq->fence.seqno - 1;
3494 
3495 	intel_ring_advance(rq, cs);
3496 
3497 	/* Record the updated position of the request's payload */
3498 	rq->infix = intel_ring_offset(rq, cs);
3499 
3500 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3501 
3502 	return 0;
3503 }
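/*
 * Note: the MI_STORE_DWORD_IMM above writes seqno-1 into the timeline HWSP,
 * which is what lets i915_request_started() report true from this point on,
 * even if the request is subsequently preempted before its payload runs.
 */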
3504 
3505 static int emit_pdps(struct i915_request *rq)
3506 {
3507 	const struct intel_engine_cs * const engine = rq->engine;
3508 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3509 	int err, i;
3510 	u32 *cs;
3511 
3512 	GEM_BUG_ON(intel_vgpu_active(rq->i915));
3513 
3514 	/*
3515 	 * Beware ye of the dragons, this sequence is magic!
3516 	 *
3517 	 * Small changes to this sequence can cause anything from
3518 	 * GPU hangs to forcewake errors and machine lockups!
3519 	 */
3520 
3521 	/* Flush any residual operations from the context load */
3522 	err = engine->emit_flush(rq, EMIT_FLUSH);
3523 	if (err)
3524 		return err;
3525 
3526 	/* Magic required to prevent forcewake errors! */
3527 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3528 	if (err)
3529 		return err;
3530 
3531 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3532 	if (IS_ERR(cs))
3533 		return PTR_ERR(cs);
3534 
3535 	/* Ensure the LRIs have landed before we invalidate & continue */
3536 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3537 	for (i = GEN8_3LVL_PDPES; i--; ) {
3538 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3539 		u32 base = engine->mmio_base;
3540 
3541 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3542 		*cs++ = upper_32_bits(pd_daddr);
3543 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3544 		*cs++ = lower_32_bits(pd_daddr);
3545 	}
3546 	*cs++ = MI_NOOP;
3547 
3548 	intel_ring_advance(rq, cs);
3549 
3550 	return 0;
3551 }
3552 
3553 static int execlists_request_alloc(struct i915_request *request)
3554 {
3555 	int ret;
3556 
3557 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3558 
3559 	/*
3560 	 * Flush enough space to reduce the likelihood of waiting after
3561 	 * we start building the request - in which case we will just
3562 	 * have to repeat work.
3563 	 */
3564 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3565 
3566 	/*
3567 	 * Note that after this point, we have committed to using
3568 	 * this request as it is being used to both track the
3569 	 * state of engine initialisation and liveness of the
3570 	 * golden renderstate above. Think twice before you try
3571 	 * to cancel/unwind this request now.
3572 	 */
3573 
3574 	if (!i915_vm_is_4lvl(request->context->vm)) {
3575 		ret = emit_pdps(request);
3576 		if (ret)
3577 			return ret;
3578 	}
3579 
3580 	/* Unconditionally invalidate GPU caches and TLBs. */
3581 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3582 	if (ret)
3583 		return ret;
3584 
3585 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3586 	return 0;
3587 }
3588 
3589 /*
3590  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3591  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3592  * but there is a slight complication: this is applied in a WA batch where the
3593  * values are only initialized once, so we cannot read the register value at the
3594  * beginning and reuse it later; hence we save its value to memory, upload a
3595  * constant value with bit21 set and then restore it from the saved value.
3596  * To simplify the WA, the constant is formed from the default value
3597  * of this register. This shouldn't be a problem because we only modify
3598  * it for a short period and this batch is non-preemptible. We could of course
3599  * use additional instructions that read the actual value of the register
3600  * at that time and set our bit of interest, but that makes the WA more complicated.
3601  *
3602  * This WA is also required for Gen9 so extracting as a function avoids
3603  * code duplication.
3604  */
3605 static u32 *
3606 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3607 {
3608 	/* NB no one else is allowed to scribble over scratch + 256! */
3609 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3610 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3611 	*batch++ = intel_gt_scratch_offset(engine->gt,
3612 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3613 	*batch++ = 0;
3614 
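	/* Upload the register's default value (0x40400000) with the flush bit set */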
3615 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3616 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3617 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3618 
3619 	batch = gen8_emit_pipe_control(batch,
3620 				       PIPE_CONTROL_CS_STALL |
3621 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3622 				       0);
3623 
3624 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3625 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3626 	*batch++ = intel_gt_scratch_offset(engine->gt,
3627 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3628 	*batch++ = 0;
3629 
3630 	return batch;
3631 }
3632 
3633 /*
3634  * Typically we only have one indirect_ctx and one per_ctx batch buffer, which are
3635  * initialized at the beginning and shared across all contexts, but this field
3636  * helps us to have multiple batches at different offsets and select them based
3637  * on some criterion. At the moment the batch always starts at the beginning of the
3638  * page and we don't have multiple wa_ctx batch buffers.
3639  *
3640  * The number of WAs applied is not known at the beginning; we use this field
3641  * to return the number of DWORDs written.
3642  *
3643  * Note that this batch does not contain MI_BATCH_BUFFER_END,
3644  * so it adds NOOPs as padding to make it cacheline aligned.
3645  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and the two together
3646  * make a complete batch buffer.
3647  */
3648 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3649 {
3650 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3651 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3652 
3653 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3654 	if (IS_BROADWELL(engine->i915))
3655 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3656 
3657 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3658 	/* Actual scratch location is at 128 bytes offset */
3659 	batch = gen8_emit_pipe_control(batch,
3660 				       PIPE_CONTROL_FLUSH_L3 |
3661 				       PIPE_CONTROL_STORE_DATA_INDEX |
3662 				       PIPE_CONTROL_CS_STALL |
3663 				       PIPE_CONTROL_QW_WRITE,
3664 				       LRC_PPHWSP_SCRATCH_ADDR);
3665 
3666 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3667 
3668 	/* Pad to end of cacheline */
3669 	while ((unsigned long)batch % CACHELINE_BYTES)
3670 		*batch++ = MI_NOOP;
3671 
3672 	/*
3673 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3674 	 * execution depends on the length specified in terms of cache lines
3675 	 * in the register CTX_RCS_INDIRECT_CTX
3676 	 */
3677 
3678 	return batch;
3679 }
3680 
3681 struct lri {
3682 	i915_reg_t reg;
3683 	u32 value;
3684 };
3685 
3686 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3687 {
3688 	GEM_BUG_ON(!count || count > 63);
3689 
3690 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3691 	do {
3692 		*batch++ = i915_mmio_reg_offset(lri->reg);
3693 		*batch++ = lri->value;
3694 	} while (lri++, --count);
3695 	*batch++ = MI_NOOP;
3696 
3697 	return batch;
3698 }
3699 
3700 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3701 {
3702 	static const struct lri lri[] = {
3703 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3704 		{
3705 			COMMON_SLICE_CHICKEN2,
3706 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3707 				       0),
3708 		},
3709 
3710 		/* BSpec: 11391 */
3711 		{
3712 			FF_SLICE_CHICKEN,
3713 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3714 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3715 		},
3716 
3717 		/* BSpec: 11299 */
3718 		{
3719 			_3D_CHICKEN3,
3720 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3721 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3722 		}
3723 	};
3724 
3725 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3726 
3727 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3728 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3729 
3730 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3731 	batch = gen8_emit_pipe_control(batch,
3732 				       PIPE_CONTROL_FLUSH_L3 |
3733 				       PIPE_CONTROL_STORE_DATA_INDEX |
3734 				       PIPE_CONTROL_CS_STALL |
3735 				       PIPE_CONTROL_QW_WRITE,
3736 				       LRC_PPHWSP_SCRATCH_ADDR);
3737 
3738 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3739 
3740 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3741 	if (HAS_POOLED_EU(engine->i915)) {
3742 		/*
3743 		 * EU pool configuration is set up along with the golden context
3744 		 * during context initialization. This value depends on the
3745 		 * device type (2x6 or 3x6) and needs to be updated based
3746 		 * on which subslice is disabled, especially for 2x6
3747 		 * devices. However, it is safe to load the default 3x6
3748 		 * configuration instead of masking off the
3749 		 * corresponding bits, because the HW ignores bits of a disabled
3750 		 * subslice and drops down to the appropriate config. Please
3751 		 * see render_state_setup() in i915_gem_render_state.c for the
3752 		 * possible configurations; to avoid duplication they are
3753 		 * not shown here again.
3754 		 */
3755 		*batch++ = GEN9_MEDIA_POOL_STATE;
3756 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3757 		*batch++ = 0x00777000;
3758 		*batch++ = 0;
3759 		*batch++ = 0;
3760 		*batch++ = 0;
3761 	}
3762 
3763 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3764 
3765 	/* Pad to end of cacheline */
3766 	while ((unsigned long)batch % CACHELINE_BYTES)
3767 		*batch++ = MI_NOOP;
3768 
3769 	return batch;
3770 }
3771 
3772 static u32 *
3773 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3774 {
3775 	int i;
3776 
3777 	/*
3778 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3779 	 *
3780 	 * Ensure the engine is idle prior to programming a
3781 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3782 	 */
3783 	batch = gen8_emit_pipe_control(batch,
3784 				       PIPE_CONTROL_CS_STALL,
3785 				       0);
3786 	/*
3787 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3788 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3789 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3790 	 * confusing. Since gen8_emit_pipe_control() already advances the
3791 	 * batch by 6 dwords, we advance the other 10 here, completing a
3792 	 * cacheline. It's not clear if the workaround requires this padding
3793 	 * before other commands, or if it's just the regular padding we would
3794 	 * already have for the workaround bb, so leave it here for now.
3795 	 */
3796 	for (i = 0; i < 10; i++)
3797 		*batch++ = MI_NOOP;
3798 
3799 	/* Pad to end of cacheline */
3800 	while ((unsigned long)batch % CACHELINE_BYTES)
3801 		*batch++ = MI_NOOP;
3802 
3803 	return batch;
3804 }
3805 
3806 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3807 
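/*
 * Allocate and pin (high in the GGTT) a single page to back the per-engine
 * workaround batch buffers (indirect_ctx and per_ctx).
 */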
3808 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3809 {
3810 	struct drm_i915_gem_object *obj;
3811 	struct i915_vma *vma;
3812 	int err;
3813 
3814 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3815 	if (IS_ERR(obj))
3816 		return PTR_ERR(obj);
3817 
3818 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3819 	if (IS_ERR(vma)) {
3820 		err = PTR_ERR(vma);
3821 		goto err;
3822 	}
3823 
3824 	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3825 	if (err)
3826 		goto err;
3827 
3828 	engine->wa_ctx.vma = vma;
3829 	return 0;
3830 
3831 err:
3832 	i915_gem_object_put(obj);
3833 	return err;
3834 }
3835 
3836 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3837 {
3838 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3839 }
3840 
3841 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3842 
3843 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3844 {
3845 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3846 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3847 					    &wa_ctx->per_ctx };
3848 	wa_bb_func_t wa_bb_fn[2];
3849 	struct page *page;
3850 	void *batch, *batch_ptr;
3851 	unsigned int i;
3852 	int ret;
3853 
3854 	if (engine->class != RENDER_CLASS)
3855 		return 0;
3856 
3857 	switch (INTEL_GEN(engine->i915)) {
3858 	case 12:
3859 	case 11:
3860 		return 0;
3861 	case 10:
3862 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3863 		wa_bb_fn[1] = NULL;
3864 		break;
3865 	case 9:
3866 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3867 		wa_bb_fn[1] = NULL;
3868 		break;
3869 	case 8:
3870 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3871 		wa_bb_fn[1] = NULL;
3872 		break;
3873 	default:
3874 		MISSING_CASE(INTEL_GEN(engine->i915));
3875 		return 0;
3876 	}
3877 
3878 	ret = lrc_setup_wa_ctx(engine);
3879 	if (ret) {
3880 		drm_dbg(&engine->i915->drm,
3881 			"Failed to setup context WA page: %d\n", ret);
3882 		return ret;
3883 	}
3884 
3885 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3886 	batch = batch_ptr = kmap_atomic(page);
3887 
3888 	/*
3889 	 * Emit the two workaround batch buffers, recording the offset from the
3890 	 * start of the workaround batch buffer object for each and their
3891 	 * respective sizes.
3892 	 */
3893 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3894 		wa_bb[i]->offset = batch_ptr - batch;
3895 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3896 						  CACHELINE_BYTES))) {
3897 			ret = -EINVAL;
3898 			break;
3899 		}
3900 		if (wa_bb_fn[i])
3901 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3902 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3903 	}
3904 
3905 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3906 
3907 	kunmap_atomic(batch);
3908 	if (ret)
3909 		lrc_destroy_wa_ctx(engine);
3910 
3911 	return ret;
3912 }
3913 
3914 static void reset_csb_pointers(struct intel_engine_cs *engine)
3915 {
3916 	struct intel_engine_execlists * const execlists = &engine->execlists;
3917 	const unsigned int reset_value = execlists->csb_size - 1;
3918 
3919 	ring_set_paused(engine, 0);
3920 
3921 	/*
3922 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3923 	 * Bludgeon them with an mmio update to be sure.
3924 	 */
3925 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3926 		     0xffff << 16 | reset_value << 8 | reset_value);
3927 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3928 
3929 	/*
3930 	 * After a reset, the HW starts writing into CSB entry [0]. We
3931 	 * therefore have to set our HEAD pointer back one entry so that
3932 	 * the *first* entry we check is entry 0. To complicate this further,
3933 	 * as we don't wait for the first interrupt after reset, we have to
3934 	 * fake the HW write to point back to the last entry so that our
3935 	 * inline comparison of our cached head position against the last HW
3936 	 * write works even before the first interrupt.
3937 	 */
3938 	execlists->csb_head = reset_value;
3939 	WRITE_ONCE(*execlists->csb_write, reset_value);
3940 	wmb(); /* Make sure this is visible to HW (paranoia?) */
3941 
3942 	invalidate_csb_entries(&execlists->csb_status[0],
3943 			       &execlists->csb_status[reset_value]);
3944 
3945 	/* Once more for luck and our trusty paranoia */
3946 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3947 		     0xffff << 16 | reset_value << 8 | reset_value);
3948 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3949 
3950 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
3951 }
3952 
3953 static void execlists_sanitize(struct intel_engine_cs *engine)
3954 {
3955 	/*
3956 	 * Poison residual state on resume, in case the suspend didn't!
3957 	 *
3958 	 * We have to assume that across suspend/resume (or other loss
3959 	 * of control) that the contents of our pinned buffers has been
3960 	 * lost, replaced by garbage. Since this doesn't always happen,
3961 	 * let's poison such state so that we more quickly spot when
3962 	 * we falsely assume it has been preserved.
3963 	 */
3964 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3965 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
3966 
3967 	reset_csb_pointers(engine);
3968 
3969 	/*
3970 	 * The kernel_context HWSP is stored in the status_page. As above,
3971 	 * that may be lost on resume/initialisation, and so we need to
3972 	 * reset the value in the HWSP.
3973 	 */
3974 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
3975 
3976 	/* And scrub the dirty cachelines for the HWSP */
3977 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
3978 }
3979 
3980 static void enable_error_interrupt(struct intel_engine_cs *engine)
3981 {
3982 	u32 status;
3983 
3984 	engine->execlists.error_interrupt = 0;
3985 	ENGINE_WRITE(engine, RING_EMR, ~0u);
3986 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3987 
3988 	status = ENGINE_READ(engine, RING_ESR);
3989 	if (unlikely(status)) {
3990 		drm_err(&engine->i915->drm,
3991 			"engine '%s' resumed still in error: %08x\n",
3992 			engine->name, status);
3993 		__intel_gt_reset(engine->gt, engine->mask);
3994 	}
3995 
3996 	/*
3997 	 * On current gen8+, we have 2 signals to play with
3998 	 *
3999 	 * - I915_ERROR_INSTRUCTION (bit 0)
4000 	 *
4001 	 *    Generate an error if the command parser encounters an invalid
4002 	 *    instruction
4003 	 *
4004 	 *    This is a fatal error.
4005 	 *
4006 	 * - CP_PRIV (bit 2)
4007 	 *
4008 	 *    Generate an error on privilege violation (where the CP replaces
4009 	 *    the instruction with a no-op). This also fires for writes into
4010 	 *    read-only scratch pages.
4011 	 *
4012 	 *    This is a non-fatal error, parsing continues.
4013 	 *
4014 	 * - there are a few others defined for odd HW that we do not use
4015 	 *
4016 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4017 	 * error (as the HW is validating and suppressing the mistakes), we
4018 	 * only unmask the instruction error bit.
4019 	 */
4020 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4021 }
4022 
4023 static void enable_execlists(struct intel_engine_cs *engine)
4024 {
4025 	u32 mode;
4026 
4027 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4028 
4029 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4030 
4031 	if (INTEL_GEN(engine->i915) >= 11)
4032 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4033 	else
4034 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4035 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4036 
4037 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4038 
4039 	ENGINE_WRITE_FW(engine,
4040 			RING_HWS_PGA,
4041 			i915_ggtt_offset(engine->status_page.vma));
4042 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4043 
4044 	enable_error_interrupt(engine);
4045 
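	/* Bitmap of free context tags (CCIDs): all available, topmost bit kept clear */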
4046 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4047 }
4048 
4049 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4050 {
4051 	bool unexpected = false;
4052 
4053 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4054 		drm_dbg(&engine->i915->drm,
4055 			"STOP_RING still set in RING_MI_MODE\n");
4056 		unexpected = true;
4057 	}
4058 
4059 	return unexpected;
4060 }
4061 
4062 static int execlists_resume(struct intel_engine_cs *engine)
4063 {
4064 	intel_mocs_init_engine(engine);
4065 
4066 	intel_engine_reset_breadcrumbs(engine);
4067 
4068 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4069 		struct drm_printer p = drm_debug_printer(__func__);
4070 
4071 		intel_engine_dump(engine, &p, NULL);
4072 	}
4073 
4074 	enable_execlists(engine);
4075 
4076 	return 0;
4077 }
4078 
4079 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4080 {
4081 	struct intel_engine_execlists * const execlists = &engine->execlists;
4082 	unsigned long flags;
4083 
4084 	ENGINE_TRACE(engine, "depth<-%d\n",
4085 		     atomic_read(&execlists->tasklet.count));
4086 
4087 	/*
4088 	 * Prevent request submission to the hardware until we have
4089 	 * completed the reset in i915_gem_reset_finish(). If a request
4090 	 * is completed by one engine, it may then queue a request
4091 	 * to a second via its execlists->tasklet *just* as we are
4092 	 * calling engine->resume() and also writing the ELSP.
4093 	 * Turning off the execlists->tasklet until the reset is over
4094 	 * prevents the race.
4095 	 */
4096 	__tasklet_disable_sync_once(&execlists->tasklet);
4097 	GEM_BUG_ON(!reset_in_progress(execlists));
4098 
4099 	/* And flush any current direct submission. */
4100 	spin_lock_irqsave(&engine->active.lock, flags);
4101 	spin_unlock_irqrestore(&engine->active.lock, flags);
4102 
4103 	/*
4104 	 * We stop the engines, otherwise we might get a failed reset and a
4105 	 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
4106 	 * a system hang if a batchbuffer is progressing when
4107 	 * the reset is issued, regardless of the READY_TO_RESET ack.
4108 	 * Thus we assume it is best to stop the engines on all gens
4109 	 * where we have a gpu reset.
4110 	 *
4111 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4112 	 *
4113 	 * FIXME: Wa for more modern gens needs to be validated
4114 	 */
4115 	ring_set_paused(engine, 1);
4116 	intel_engine_stop_cs(engine);
4117 
4118 	engine->execlists.reset_ccid = active_ccid(engine);
4119 }
4120 
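/*
 * Clear STOP_RING in the saved RING_MI_MODE image. The register is masked:
 * the upper 16 bits select which bits are written, so setting the mask bit
 * while clearing the value bit unsets STOP_RING on the next context restore.
 */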
4121 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4122 {
4123 	int x;
4124 
4125 	x = lrc_ring_mi_mode(engine);
4126 	if (x != -1) {
4127 		regs[x + 1] &= ~STOP_RING;
4128 		regs[x + 1] |= STOP_RING << 16;
4129 	}
4130 }
4131 
4132 static void __execlists_reset_reg_state(const struct intel_context *ce,
4133 					const struct intel_engine_cs *engine)
4134 {
4135 	u32 *regs = ce->lrc_reg_state;
4136 
4137 	__reset_stop_ring(regs, engine);
4138 }
4139 
4140 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4141 {
4142 	struct intel_engine_execlists * const execlists = &engine->execlists;
4143 	struct intel_context *ce;
4144 	struct i915_request *rq;
4145 	u32 head;
4146 
4147 	mb(); /* paranoia: read the CSB pointers from after the reset */
4148 	clflush(execlists->csb_write);
4149 	mb();
4150 
4151 	process_csb(engine); /* drain preemption events */
4152 
4153 	/* Following the reset, we need to reload the CSB read/write pointers */
4154 	reset_csb_pointers(engine);
4155 
4156 	/*
4157 	 * Save the currently executing context, even if we completed
4158 	 * its request, it was still running at the time of the
4159 	 * reset and will have been clobbered.
4160 	 */
4161 	rq = active_context(engine, engine->execlists.reset_ccid);
4162 	if (!rq)
4163 		goto unwind;
4164 
4165 	ce = rq->context;
4166 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4167 
4168 	if (i915_request_completed(rq)) {
4169 		/* Idle context; tidy up the ring so we can restart afresh */
4170 		head = intel_ring_wrap(ce->ring, rq->tail);
4171 		goto out_replay;
4172 	}
4173 
4174 	/* We still have requests in-flight; the engine should be active */
4175 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4176 
4177 	/* Context has requests still in-flight; it should not be idle! */
4178 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4179 
4180 	rq = active_request(ce->timeline, rq);
4181 	head = intel_ring_wrap(ce->ring, rq->head);
4182 	GEM_BUG_ON(head == ce->ring->tail);
4183 
4184 	/*
4185 	 * If this request hasn't started yet, e.g. it is waiting on a
4186 	 * semaphore, we need to avoid skipping the request or else we
4187 	 * break the signaling chain. However, if the context is corrupt
4188 	 * the request will not restart and we will be stuck with a wedged
4189 	 * device. It is quite often the case that if we issue a reset
4190 	 * while the GPU is loading the context image, the context
4191 	 * image becomes corrupt.
4192 	 *
4193 	 * Otherwise, if we have not started yet, the request should replay
4194 	 * perfectly and we do not need to flag the result as being erroneous.
4195 	 */
4196 	if (!i915_request_started(rq))
4197 		goto out_replay;
4198 
4199 	/*
4200 	 * If the request was innocent, we leave the request in the ELSP
4201 	 * and will try to replay it on restarting. The context image may
4202 	 * have been corrupted by the reset, in which case we may have
4203 	 * to service a new GPU hang, but more likely we can continue on
4204 	 * without impact.
4205 	 *
4206 	 * If the request was guilty, we presume the context is corrupt
4207 	 * and have to at least restore the RING register in the context
4208 	 * image back to the expected values to skip over the guilty request.
4209 	 */
4210 	__i915_request_reset(rq, stalled);
4211 
4212 	/*
4213 	 * We want a simple context + ring to execute the breadcrumb update.
4214 	 * We cannot rely on the context being intact across the GPU hang,
4215 	 * so clear it and rebuild just what we need for the breadcrumb.
4216 	 * All pending requests for this context will be zapped, and any
4217 	 * future request will be after userspace has had the opportunity
4218 	 * to recreate its own state.
4219 	 */
4220 out_replay:
4221 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4222 		     head, ce->ring->tail);
4223 	__execlists_reset_reg_state(ce, engine);
4224 	__execlists_update_reg_state(ce, engine, head);
4225 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4226 
4227 unwind:
4228 	/* Push back any incomplete requests for replay after the reset. */
4229 	cancel_port_requests(execlists);
4230 	__unwind_incomplete_requests(engine);
4231 }
4232 
4233 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4234 {
4235 	unsigned long flags;
4236 
4237 	ENGINE_TRACE(engine, "\n");
4238 
4239 	spin_lock_irqsave(&engine->active.lock, flags);
4240 
4241 	__execlists_reset(engine, stalled);
4242 
4243 	spin_unlock_irqrestore(&engine->active.lock, flags);
4244 }
4245 
4246 static void nop_submission_tasklet(unsigned long data)
4247 {
4248 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4249 
4250 	/* The driver is wedged; don't process any more events. */
4251 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4252 }
4253 
4254 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4255 {
4256 	struct intel_engine_execlists * const execlists = &engine->execlists;
4257 	struct i915_request *rq, *rn;
4258 	struct rb_node *rb;
4259 	unsigned long flags;
4260 
4261 	ENGINE_TRACE(engine, "\n");
4262 
4263 	/*
4264 	 * Before we call engine->cancel_requests(), we should have exclusive
4265 	 * access to the submission state. This is arranged for us by the
4266 	 * caller disabling the interrupt generation, the tasklet and other
4267 	 * threads that may then access the same state, giving us a free hand
4268 	 * to reset state. However, we still need to let lockdep be aware that
4269 	 * we know this state may be accessed in hardirq context, so we
4270 	 * disable the irq around this manipulation and we want to keep
4271 	 * the spinlock focused on its duties and not accidentally conflate
4272 	 * coverage to the submission's irq state. (Similarly, although we
4273 	 * shouldn't need to disable irq around the manipulation of the
4274 	 * submission's irq state, we also wish to remind ourselves that
4275 	 * it is irq state.)
4276 	 */
4277 	spin_lock_irqsave(&engine->active.lock, flags);
4278 
4279 	__execlists_reset(engine, true);
4280 
4281 	/* Mark all executing requests as skipped. */
4282 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4283 		mark_eio(rq);
4284 
4285 	/* Flush the queued requests to the timeline list (for retiring). */
4286 	while ((rb = rb_first_cached(&execlists->queue))) {
4287 		struct i915_priolist *p = to_priolist(rb);
4288 		int i;
4289 
4290 		priolist_for_each_request_consume(rq, rn, p, i) {
4291 			mark_eio(rq);
4292 			__i915_request_submit(rq);
4293 		}
4294 
4295 		rb_erase_cached(&p->node, &execlists->queue);
4296 		i915_priolist_free(p);
4297 	}
4298 
4299 	/* On-hold requests will be flushed to timeline upon their release */
4300 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4301 		mark_eio(rq);
4302 
4303 	/* Cancel all attached virtual engines */
4304 	while ((rb = rb_first_cached(&execlists->virtual))) {
4305 		struct virtual_engine *ve =
4306 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4307 
4308 		rb_erase_cached(rb, &execlists->virtual);
4309 		RB_CLEAR_NODE(rb);
4310 
4311 		spin_lock(&ve->base.active.lock);
4312 		rq = fetch_and_zero(&ve->request);
4313 		if (rq) {
4314 			mark_eio(rq);
4315 
4316 			rq->engine = engine;
4317 			__i915_request_submit(rq);
4318 			i915_request_put(rq);
4319 
4320 			ve->base.execlists.queue_priority_hint = INT_MIN;
4321 		}
4322 		spin_unlock(&ve->base.active.lock);
4323 	}
4324 
4325 	/* Remaining _unready_ requests will be nop'ed when submitted */
4326 
4327 	execlists->queue_priority_hint = INT_MIN;
4328 	execlists->queue = RB_ROOT_CACHED;
4329 
4330 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4331 	execlists->tasklet.func = nop_submission_tasklet;
4332 
4333 	spin_unlock_irqrestore(&engine->active.lock, flags);
4334 }
4335 
4336 static void execlists_reset_finish(struct intel_engine_cs *engine)
4337 {
4338 	struct intel_engine_execlists * const execlists = &engine->execlists;
4339 
4340 	/*
4341 	 * After a GPU reset, we may have requests to replay. Do so now while
4342 	 * we still have the forcewake to be sure that the GPU is not allowed
4343 	 * to sleep before we restart and reload a context.
4344 	 */
4345 	GEM_BUG_ON(!reset_in_progress(execlists));
4346 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4347 		execlists->tasklet.func(execlists->tasklet.data);
4348 
4349 	if (__tasklet_enable(&execlists->tasklet))
4350 		/* And kick in case we missed a new request submission. */
4351 		tasklet_hi_schedule(&execlists->tasklet);
4352 	ENGINE_TRACE(engine, "depth->%d\n",
4353 		     atomic_read(&execlists->tasklet.count));
4354 }
4355 
4356 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4357 				    u64 offset, u32 len,
4358 				    const unsigned int flags)
4359 {
4360 	u32 *cs;
4361 
4362 	cs = intel_ring_begin(rq, 4);
4363 	if (IS_ERR(cs))
4364 		return PTR_ERR(cs);
4365 
4366 	/*
4367 	 * WaDisableCtxRestoreArbitration:bdw,chv
4368 	 *
4369 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4370 	 * particular on all the gens that do not need the w/a at all!); if we
4371 	 * took care to make sure that on every switch into this context
4372 	 * (both ordinary and for preemption) arbitration was enabled,
4373 	 * we would be fine.  However, for gen8 there is another w/a that
4374 	 * requires us to not preempt inside GPGPU execution, so we keep
4375 	 * arbitration disabled for gen8 batches. Arbitration will be
4376 	 * re-enabled before we close the request
4377 	 * (engine->emit_fini_breadcrumb).
4378 	 */
4379 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4380 
4381 	/* FIXME(BDW+): Address space and security selectors. */
4382 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4383 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4384 	*cs++ = lower_32_bits(offset);
4385 	*cs++ = upper_32_bits(offset);
4386 
4387 	intel_ring_advance(rq, cs);
4388 
4389 	return 0;
4390 }
4391 
4392 static int gen8_emit_bb_start(struct i915_request *rq,
4393 			      u64 offset, u32 len,
4394 			      const unsigned int flags)
4395 {
4396 	u32 *cs;
4397 
4398 	cs = intel_ring_begin(rq, 6);
4399 	if (IS_ERR(cs))
4400 		return PTR_ERR(cs);
4401 
4402 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4403 
4404 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4405 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4406 	*cs++ = lower_32_bits(offset);
4407 	*cs++ = upper_32_bits(offset);
4408 
4409 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4410 	*cs++ = MI_NOOP;
4411 
4412 	intel_ring_advance(rq, cs);
4413 
4414 	return 0;
4415 }
4416 
4417 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4418 {
4419 	ENGINE_WRITE(engine, RING_IMR,
4420 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4421 	ENGINE_POSTING_READ(engine, RING_IMR);
4422 }
4423 
4424 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4425 {
4426 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4427 }
4428 
4429 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4430 {
4431 	u32 cmd, *cs;
4432 
4433 	cs = intel_ring_begin(request, 4);
4434 	if (IS_ERR(cs))
4435 		return PTR_ERR(cs);
4436 
4437 	cmd = MI_FLUSH_DW + 1;
4438 
4439 	/* We always require a command barrier so that subsequent
4440 	 * commands, such as breadcrumb interrupts, are strictly ordered
4441 	 * wrt the contents of the write cache being flushed to memory
4442 	 * (and thus being coherent from the CPU).
4443 	 */
4444 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4445 
4446 	if (mode & EMIT_INVALIDATE) {
4447 		cmd |= MI_INVALIDATE_TLB;
4448 		if (request->engine->class == VIDEO_DECODE_CLASS)
4449 			cmd |= MI_INVALIDATE_BSD;
4450 	}
4451 
4452 	*cs++ = cmd;
4453 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4454 	*cs++ = 0; /* upper addr */
4455 	*cs++ = 0; /* value */
4456 	intel_ring_advance(request, cs);
4457 
4458 	return 0;
4459 }
4460 
4461 static int gen8_emit_flush_render(struct i915_request *request,
4462 				  u32 mode)
4463 {
4464 	bool vf_flush_wa = false, dc_flush_wa = false;
4465 	u32 *cs, flags = 0;
4466 	int len;
4467 
4468 	flags |= PIPE_CONTROL_CS_STALL;
4469 
4470 	if (mode & EMIT_FLUSH) {
4471 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4472 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4473 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4474 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4475 	}
4476 
4477 	if (mode & EMIT_INVALIDATE) {
4478 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4479 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4480 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4481 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4482 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4483 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4484 		flags |= PIPE_CONTROL_QW_WRITE;
4485 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4486 
4487 		/*
4488 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4489 		 * pipe control.
4490 		 */
4491 		if (IS_GEN(request->i915, 9))
4492 			vf_flush_wa = true;
4493 
4494 		/* WaForGAMHang:kbl */
4495 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4496 			dc_flush_wa = true;
4497 	}
4498 
4499 	len = 6;
4500 
4501 	if (vf_flush_wa)
4502 		len += 6;
4503 
4504 	if (dc_flush_wa)
4505 		len += 12;
4506 
4507 	cs = intel_ring_begin(request, len);
4508 	if (IS_ERR(cs))
4509 		return PTR_ERR(cs);
4510 
4511 	if (vf_flush_wa)
4512 		cs = gen8_emit_pipe_control(cs, 0, 0);
4513 
4514 	if (dc_flush_wa)
4515 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4516 					    0);
4517 
4518 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4519 
4520 	if (dc_flush_wa)
4521 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4522 
4523 	intel_ring_advance(request, cs);
4524 
4525 	return 0;
4526 }
4527 
4528 static int gen11_emit_flush_render(struct i915_request *request,
4529 				   u32 mode)
4530 {
4531 	if (mode & EMIT_FLUSH) {
4532 		u32 *cs;
4533 		u32 flags = 0;
4534 
4535 		flags |= PIPE_CONTROL_CS_STALL;
4536 
4537 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4538 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4539 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4540 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4541 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4542 		flags |= PIPE_CONTROL_QW_WRITE;
4543 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4544 
4545 		cs = intel_ring_begin(request, 6);
4546 		if (IS_ERR(cs))
4547 			return PTR_ERR(cs);
4548 
4549 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4550 		intel_ring_advance(request, cs);
4551 	}
4552 
4553 	if (mode & EMIT_INVALIDATE) {
4554 		u32 *cs;
4555 		u32 flags = 0;
4556 
4557 		flags |= PIPE_CONTROL_CS_STALL;
4558 
4559 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4560 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4561 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4562 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4563 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4564 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4565 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4566 		flags |= PIPE_CONTROL_QW_WRITE;
4567 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4568 
4569 		cs = intel_ring_begin(request, 6);
4570 		if (IS_ERR(cs))
4571 			return PTR_ERR(cs);
4572 
4573 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4574 		intel_ring_advance(request, cs);
4575 	}
4576 
4577 	return 0;
4578 }
4579 
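/*
 * On gen12 the pre-parser is toggled via MI_ARB_CHECK: bit 0 carries the new
 * disable state and bit 8 (presumably the write-enable for that field) must
 * be set for the update to take effect.
 */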
4580 static u32 preparser_disable(bool state)
4581 {
4582 	return MI_ARB_CHECK | 1 << 8 | state;
4583 }
4584 
4585 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4586 {
4587 	static const i915_reg_t vd[] = {
4588 		GEN12_VD0_AUX_NV,
4589 		GEN12_VD1_AUX_NV,
4590 		GEN12_VD2_AUX_NV,
4591 		GEN12_VD3_AUX_NV,
4592 	};
4593 
4594 	static const i915_reg_t ve[] = {
4595 		GEN12_VE0_AUX_NV,
4596 		GEN12_VE1_AUX_NV,
4597 	};
4598 
4599 	if (engine->class == VIDEO_DECODE_CLASS)
4600 		return vd[engine->instance];
4601 
4602 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4603 		return ve[engine->instance];
4604 
4605 	GEM_BUG_ON("unknown aux_inv_reg\n");
4606 
4607 	return INVALID_MMIO_REG;
4608 }
4609 
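/* Emit an LRI writing AUX_INV to the given AUX table invalidation register (hsdes: 1809175790) */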
4610 static u32 *
4611 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4612 {
4613 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4614 	*cs++ = i915_mmio_reg_offset(inv_reg);
4615 	*cs++ = AUX_INV;
4616 	*cs++ = MI_NOOP;
4617 
4618 	return cs;
4619 }
4620 
4621 static int gen12_emit_flush_render(struct i915_request *request,
4622 				   u32 mode)
4623 {
4624 	if (mode & EMIT_FLUSH) {
4625 		u32 flags = 0;
4626 		u32 *cs;
4627 
4628 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4629 		flags |= PIPE_CONTROL_FLUSH_L3;
4630 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4631 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4632 		/* Wa_1409600907:tgl */
4633 		flags |= PIPE_CONTROL_DEPTH_STALL;
4634 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4635 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4636 
4637 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4638 		flags |= PIPE_CONTROL_QW_WRITE;
4639 
4640 		flags |= PIPE_CONTROL_CS_STALL;
4641 
4642 		cs = intel_ring_begin(request, 6);
4643 		if (IS_ERR(cs))
4644 			return PTR_ERR(cs);
4645 
4646 		cs = gen12_emit_pipe_control(cs,
4647 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4648 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4649 		intel_ring_advance(request, cs);
4650 	}
4651 
4652 	if (mode & EMIT_INVALIDATE) {
4653 		u32 flags = 0;
4654 		u32 *cs;
4655 
4656 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4657 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4658 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4659 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4660 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4661 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4662 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4663 
4664 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4665 		flags |= PIPE_CONTROL_QW_WRITE;
4666 
4667 		flags |= PIPE_CONTROL_CS_STALL;
4668 
4669 		cs = intel_ring_begin(request, 8 + 4);
4670 		if (IS_ERR(cs))
4671 			return PTR_ERR(cs);
4672 
4673 		/*
4674 		 * Prevent the pre-parser from skipping past the TLB
4675 		 * invalidate and loading a stale page for the batch
4676 		 * buffer / request payload.
4677 		 */
4678 		*cs++ = preparser_disable(true);
4679 
4680 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4681 
4682 		/* hsdes: 1809175790 */
4683 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4684 
4685 		*cs++ = preparser_disable(false);
4686 		intel_ring_advance(request, cs);
4687 	}
4688 
4689 	return 0;
4690 }
4691 
4692 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4693 {
4694 	intel_engine_mask_t aux_inv = 0;
4695 	u32 cmd, *cs;
4696 
4697 	if (mode & EMIT_INVALIDATE)
4698 		aux_inv = request->engine->mask & ~BIT(BCS0);
4699 
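	/*
	 * 4 dwords for the MI_FLUSH_DW, plus (when invalidating AUX tables)
	 * an LRI header, one (reg, value) pair per engine and a trailing NOOP.
	 */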
4700 	cs = intel_ring_begin(request,
4701 			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4702 	if (IS_ERR(cs))
4703 		return PTR_ERR(cs);
4704 
4705 	cmd = MI_FLUSH_DW + 1;
4706 
4707 	/* We always require a command barrier so that subsequent
4708 	 * commands, such as breadcrumb interrupts, are strictly ordered
4709 	 * wrt the contents of the write cache being flushed to memory
4710 	 * (and thus being coherent from the CPU).
4711 	 */
4712 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4713 
4714 	if (mode & EMIT_INVALIDATE) {
4715 		cmd |= MI_INVALIDATE_TLB;
4716 		if (request->engine->class == VIDEO_DECODE_CLASS)
4717 			cmd |= MI_INVALIDATE_BSD;
4718 	}
4719 
4720 	*cs++ = cmd;
4721 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4722 	*cs++ = 0; /* upper addr */
4723 	*cs++ = 0; /* value */
4724 
4725 	if (aux_inv) { /* hsdes: 1809175790 */
4726 		struct intel_engine_cs *engine;
4727 		unsigned int tmp;
4728 
4729 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4730 		for_each_engine_masked(engine, request->engine->gt,
4731 				       aux_inv, tmp) {
4732 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4733 			*cs++ = AUX_INV;
4734 		}
4735 		*cs++ = MI_NOOP;
4736 	}
4737 	intel_ring_advance(request, cs);
4738 
4739 	return 0;
4740 }
4741 
4742 /*
4743  * Reserve space for 2 NOOPs at the end of each request to be
4744  * used as a workaround for not being allowed to do lite
4745  * restore with HEAD==TAIL (WaIdleLiteRestore).
4746  */
4747 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4748 {
4749 	/* Ensure there's always at least one preemption point per-request. */
4750 	*cs++ = MI_ARB_CHECK;
4751 	*cs++ = MI_NOOP;
4752 	request->wa_tail = intel_ring_offset(request, cs);
4753 
4754 	return cs;
4755 }
4756 
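/*
 * Poll the preemption semaphore in the HWSP: the CS busywaits here until the
 * value is cleared back to zero (see ring_set_paused), providing a preemption
 * point right after the breadcrumb.
 */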
4757 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4758 {
4759 	*cs++ = MI_SEMAPHORE_WAIT |
4760 		MI_SEMAPHORE_GLOBAL_GTT |
4761 		MI_SEMAPHORE_POLL |
4762 		MI_SEMAPHORE_SAD_EQ_SDD;
4763 	*cs++ = 0;
4764 	*cs++ = intel_hws_preempt_address(request->engine);
4765 	*cs++ = 0;
4766 
4767 	return cs;
4768 }
4769 
4770 static __always_inline u32*
4771 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4772 {
4773 	*cs++ = MI_USER_INTERRUPT;
4774 
4775 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4776 	if (intel_engine_has_semaphores(request->engine))
4777 		cs = emit_preempt_busywait(request, cs);
4778 
4779 	request->tail = intel_ring_offset(request, cs);
4780 	assert_ring_tail_valid(request->ring, request->tail);
4781 
4782 	return gen8_emit_wa_tail(request, cs);
4783 }
4784 
4785 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4786 {
4787 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4788 
4789 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4790 }
4791 
4792 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4793 {
4794 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4795 }
4796 
4797 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4798 {
4799 	cs = gen8_emit_pipe_control(cs,
4800 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4801 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4802 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4803 				    0);
4804 
4805 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4806 	cs = gen8_emit_ggtt_write_rcs(cs,
4807 				      request->fence.seqno,
4808 				      i915_request_active_timeline(request)->hwsp_offset,
4809 				      PIPE_CONTROL_FLUSH_ENABLE |
4810 				      PIPE_CONTROL_CS_STALL);
4811 
4812 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4813 }
4814 
4815 static u32 *
4816 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4817 {
4818 	cs = gen8_emit_ggtt_write_rcs(cs,
4819 				      request->fence.seqno,
4820 				      i915_request_active_timeline(request)->hwsp_offset,
4821 				      PIPE_CONTROL_CS_STALL |
4822 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4823 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4824 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4825 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4826 				      PIPE_CONTROL_FLUSH_ENABLE);
4827 
4828 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4829 }
4830 
4831 /*
4832  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4833  * flush and will continue pre-fetching the instructions after it before the
4834  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4835  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4836  * of the next request before the memory has been flushed, we're guaranteed that
4837  * we won't access the batch itself too early.
4838  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4839  * so, if the current request is modifying an instruction in the next request on
4840  * the same intel_context, we might pre-fetch and then execute the pre-update
4841  * instruction. To avoid this, the users of self-modifying code should either
4842  * disable the parser around the code emitting the memory writes, via a new flag
4843  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4844  * the in-kernel use-cases we've opted to use a separate context, see
4845  * reloc_gpu() as an example.
4846  * All the above applies only to the instructions themselves. Non-inline data
4847  * used by the instructions is not pre-fetched.
4848  */
4849 
4850 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4851 {
4852 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4853 		MI_SEMAPHORE_GLOBAL_GTT |
4854 		MI_SEMAPHORE_POLL |
4855 		MI_SEMAPHORE_SAD_EQ_SDD;
4856 	*cs++ = 0;
4857 	*cs++ = intel_hws_preempt_address(request->engine);
4858 	*cs++ = 0;
4859 	*cs++ = 0;
4860 	*cs++ = MI_NOOP;
4861 
4862 	return cs;
4863 }
4864 
4865 static __always_inline u32*
4866 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4867 {
4868 	*cs++ = MI_USER_INTERRUPT;
4869 
4870 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4871 	if (intel_engine_has_semaphores(request->engine))
4872 		cs = gen12_emit_preempt_busywait(request, cs);
4873 
4874 	request->tail = intel_ring_offset(request, cs);
4875 	assert_ring_tail_valid(request->ring, request->tail);
4876 
4877 	return gen8_emit_wa_tail(request, cs);
4878 }
4879 
4880 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4881 {
4882 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4883 }
4884 
4885 static u32 *
4886 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4887 {
4888 	cs = gen12_emit_ggtt_write_rcs(cs,
4889 				       request->fence.seqno,
4890 				       i915_request_active_timeline(request)->hwsp_offset,
4891 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4892 				       PIPE_CONTROL_CS_STALL |
4893 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4894 				       PIPE_CONTROL_FLUSH_L3 |
4895 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4896 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4897 				       /* Wa_1409600907:tgl */
4898 				       PIPE_CONTROL_DEPTH_STALL |
4899 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4900 				       PIPE_CONTROL_FLUSH_ENABLE);
4901 
4902 	return gen12_emit_fini_breadcrumb_tail(request, cs);
4903 }
4904 
4905 static void execlists_park(struct intel_engine_cs *engine)
4906 {
4907 	cancel_timer(&engine->execlists.timer);
4908 	cancel_timer(&engine->execlists.preempt);
4909 }
4910 
4911 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4912 {
4913 	engine->submit_request = execlists_submit_request;
4914 	engine->schedule = i915_schedule;
4915 	engine->execlists.tasklet.func = execlists_submission_tasklet;
4916 
4917 	engine->reset.prepare = execlists_reset_prepare;
4918 	engine->reset.rewind = execlists_reset_rewind;
4919 	engine->reset.cancel = execlists_reset_cancel;
4920 	engine->reset.finish = execlists_reset_finish;
4921 
4922 	engine->park = execlists_park;
4923 	engine->unpark = NULL;
4924 
4925 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4926 	if (!intel_vgpu_active(engine->i915)) {
4927 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4928 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4929 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4930 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4931 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4932 		}
4933 	}
4934 
4935 	if (INTEL_GEN(engine->i915) >= 12)
4936 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4937 
4938 	if (intel_engine_has_preemption(engine))
4939 		engine->emit_bb_start = gen8_emit_bb_start;
4940 	else
4941 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4942 }
4943 
4944 static void execlists_shutdown(struct intel_engine_cs *engine)
4945 {
4946 	/* Synchronise with residual timers and any softirq they raise */
4947 	del_timer_sync(&engine->execlists.timer);
4948 	del_timer_sync(&engine->execlists.preempt);
4949 	tasklet_kill(&engine->execlists.tasklet);
4950 }
4951 
4952 static void execlists_release(struct intel_engine_cs *engine)
4953 {
4954 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
4955 
4956 	execlists_shutdown(engine);
4957 
4958 	intel_engine_cleanup_common(engine);
4959 	lrc_destroy_wa_ctx(engine);
4960 }
4961 
4962 static void
4963 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4964 {
4965 	/* Default vfuncs which can be overridden by each engine. */
4966 
4967 	engine->resume = execlists_resume;
4968 
4969 	engine->cops = &execlists_context_ops;
4970 	engine->request_alloc = execlists_request_alloc;
4971 
4972 	engine->emit_flush = gen8_emit_flush;
4973 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4974 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4975 	if (INTEL_GEN(engine->i915) >= 12) {
4976 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4977 		engine->emit_flush = gen12_emit_flush;
4978 	}
4979 	engine->set_default_submission = intel_execlists_set_default_submission;
4980 
4981 	if (INTEL_GEN(engine->i915) < 11) {
4982 		engine->irq_enable = gen8_logical_ring_enable_irq;
4983 		engine->irq_disable = gen8_logical_ring_disable_irq;
4984 	} else {
4985 		/*
4986 		 * TODO: On Gen11 interrupt masks need to be clear
4987 		 * to allow C6 entry. Keep interrupts enabled
4988 		 * and take the hit of generating extra interrupts
4989 		 * until a more refined solution exists.
4990 		 */
4991 	}
4992 }
4993 
4994 static inline void
4995 logical_ring_default_irqs(struct intel_engine_cs *engine)
4996 {
4997 	unsigned int shift = 0;
4998 
4999 	if (INTEL_GEN(engine->i915) < 11) {
5000 		const u8 irq_shifts[] = {
5001 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5002 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5003 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5004 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5005 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5006 		};
5007 
5008 		shift = irq_shifts[engine->id];
5009 	}
5010 
5011 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5012 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5013 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5014 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5015 }
5016 
5017 static void rcs_submission_override(struct intel_engine_cs *engine)
5018 {
5019 	switch (INTEL_GEN(engine->i915)) {
5020 	case 12:
5021 		engine->emit_flush = gen12_emit_flush_render;
5022 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5023 		break;
5024 	case 11:
5025 		engine->emit_flush = gen11_emit_flush_render;
5026 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5027 		break;
5028 	default:
5029 		engine->emit_flush = gen8_emit_flush_render;
5030 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5031 		break;
5032 	}
5033 }
5034 
5035 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5036 {
5037 	struct intel_engine_execlists * const execlists = &engine->execlists;
5038 	struct drm_i915_private *i915 = engine->i915;
5039 	struct intel_uncore *uncore = engine->uncore;
5040 	u32 base = engine->mmio_base;
5041 
5042 	tasklet_init(&engine->execlists.tasklet,
5043 		     execlists_submission_tasklet, (unsigned long)engine);
5044 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5045 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5046 
5047 	logical_ring_default_vfuncs(engine);
5048 	logical_ring_default_irqs(engine);
5049 
5050 	if (engine->class == RENDER_CLASS)
5051 		rcs_submission_override(engine);
5052 
5053 	if (intel_init_workaround_bb(engine))
5054 		/*
5055 		 * We continue even if we fail to initialize the WA batch,
5056 		 * because we only expect rare glitches and nothing
5057 		 * critical enough to prevent us from using the GPU.
5058 		 */
5059 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5060 
5061 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5062 		execlists->submit_reg = uncore->regs +
5063 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5064 		execlists->ctrl_reg = uncore->regs +
5065 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5066 	} else {
5067 		execlists->submit_reg = uncore->regs +
5068 			i915_mmio_reg_offset(RING_ELSP(base));
5069 	}
5070 
5071 	execlists->csb_status =
5072 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5073 
5074 	execlists->csb_write =
5075 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5076 
5077 	if (INTEL_GEN(i915) < 11)
5078 		execlists->csb_size = GEN8_CSB_ENTRIES;
5079 	else
5080 		execlists->csb_size = GEN11_CSB_ENTRIES;
5081 
5082 	if (INTEL_GEN(engine->i915) >= 11) {
5083 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5084 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5085 	}
5086 
5087 	/* Finally, take ownership and responsibility for cleanup! */
5088 	engine->sanitize = execlists_sanitize;
5089 	engine->release = execlists_release;
5090 
5091 	return 0;
5092 }
5093 
5094 static void init_common_reg_state(u32 * const regs,
5095 				  const struct intel_engine_cs *engine,
5096 				  const struct intel_ring *ring,
5097 				  bool inhibit)
5098 {
5099 	u32 ctl;
5100 
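	/*
	 * CTX_CONTEXT_CONTROL is a masked register: _MASKED_BIT_ENABLE/DISABLE
	 * also set the write-enable bit for each field in the upper 16 bits.
	 */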
5101 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5102 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5103 	if (inhibit)
5104 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5105 	if (INTEL_GEN(engine->i915) < 11)
5106 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5107 					   CTX_CTRL_RS_CTX_ENABLE);
5108 	regs[CTX_CONTEXT_CONTROL] = ctl;
5109 
5110 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5111 	regs[CTX_TIMESTAMP] = 0;
5112 }
5113 
5114 static void init_wa_bb_reg_state(u32 * const regs,
5115 				 const struct intel_engine_cs *engine)
5116 {
5117 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5118 
5119 	if (wa_ctx->per_ctx.size) {
5120 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5121 
5122 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5123 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5124 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5125 	}
5126 
5127 	if (wa_ctx->indirect_ctx.size) {
5128 		lrc_ring_setup_indirect_ctx(regs, engine,
5129 					    i915_ggtt_offset(wa_ctx->vma) +
5130 					    wa_ctx->indirect_ctx.offset,
5131 					    wa_ctx->indirect_ctx.size);
5132 	}
5133 }
5134 
5135 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5136 {
5137 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5138 		/* 64b PPGTT (48bit canonical)
5139 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
5140 		 * other PDP Descriptors are ignored.
5141 		 */
5142 		ASSIGN_CTX_PML4(ppgtt, regs);
5143 	} else {
5144 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5145 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5146 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5147 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5148 	}
5149 }
5150 
5151 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5152 {
5153 	if (i915_is_ggtt(vm))
5154 		return i915_vm_to_ggtt(vm)->alias;
5155 	else
5156 		return i915_vm_to_ppgtt(vm);
5157 }
5158 
5159 static void execlists_init_reg_state(u32 *regs,
5160 				     const struct intel_context *ce,
5161 				     const struct intel_engine_cs *engine,
5162 				     const struct intel_ring *ring,
5163 				     bool inhibit)
5164 {
5165 	/*
5166 	 * A context is actually a big batch buffer with several
5167 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5168 	 * values we are setting here are only for the first context restore:
5169 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5170 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5171 	 * we are not initializing here).
5172 	 *
5173 	 * Must keep consistent with virtual_update_register_offsets().
5174 	 */
5175 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5176 
5177 	init_common_reg_state(regs, engine, ring, inhibit);
5178 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5179 
5180 	init_wa_bb_reg_state(regs, engine);
5181 
5182 	__reset_stop_ring(regs, engine);
5183 }
5184 
5185 static int
5186 populate_lr_context(struct intel_context *ce,
5187 		    struct drm_i915_gem_object *ctx_obj,
5188 		    struct intel_engine_cs *engine,
5189 		    struct intel_ring *ring)
5190 {
5191 	bool inhibit = true;
5192 	void *vaddr;
5193 
5194 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5195 	if (IS_ERR(vaddr)) {
5196 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5197 		return PTR_ERR(vaddr);
5198 	}
5199 
5200 	set_redzone(vaddr, engine);
5201 
5202 	if (engine->default_state) {
5203 		shmem_read(engine->default_state, 0,
5204 			   vaddr, engine->context_size);
5205 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5206 		inhibit = false;
5207 	}
5208 
5209 	/* Clear the ppHWSP (inc. per-context counters) */
5210 	memset(vaddr, 0, PAGE_SIZE);
5211 
5212 	/*
5213 	 * The second page of the context object contains some registers which
5214 	 * must be set up prior to the first execution.
5215 	 */
5216 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5217 				 ce, engine, ring, inhibit);
5218 
5219 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5220 	i915_gem_object_unpin_map(ctx_obj);
5221 	return 0;
5222 }
5223 
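/*
 * Allocate everything the logical ring context needs: the backing object
 * (engine->context_size rounded up to GTT pages, plus a redzone page on
 * debug builds and an extra page on gen12 recorded in ce->wa_bb_page), a
 * GGTT vma for it, a timeline (borrowing the engine's status page for
 * barrier contexts) and the ring buffer itself.
 */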
5224 static int __execlists_context_alloc(struct intel_context *ce,
5225 				     struct intel_engine_cs *engine)
5226 {
5227 	struct drm_i915_gem_object *ctx_obj;
5228 	struct intel_ring *ring;
5229 	struct i915_vma *vma;
5230 	u32 context_size;
5231 	int ret;
5232 
5233 	GEM_BUG_ON(ce->state);
5234 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5235 
5236 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5237 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5238 
5239 	if (INTEL_GEN(engine->i915) == 12) {
5240 		ce->wa_bb_page = context_size / PAGE_SIZE;
5241 		context_size += PAGE_SIZE;
5242 	}
5243 
5244 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5245 	if (IS_ERR(ctx_obj))
5246 		return PTR_ERR(ctx_obj);
5247 
5248 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5249 	if (IS_ERR(vma)) {
5250 		ret = PTR_ERR(vma);
5251 		goto error_deref_obj;
5252 	}
5253 
5254 	if (!ce->timeline) {
5255 		struct intel_timeline *tl;
5256 		struct i915_vma *hwsp;
5257 
5258 		/*
5259 		 * Use the static global HWSP for the kernel context, and
5260 		 * a dynamically allocated cacheline for everyone else.
5261 		 */
5262 		hwsp = NULL;
5263 		if (unlikely(intel_context_is_barrier(ce)))
5264 			hwsp = engine->status_page.vma;
5265 
5266 		tl = intel_timeline_create(engine->gt, hwsp);
5267 		if (IS_ERR(tl)) {
5268 			ret = PTR_ERR(tl);
5269 			goto error_deref_obj;
5270 		}
5271 
5272 		ce->timeline = tl;
5273 	}
5274 
5275 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5276 	if (IS_ERR(ring)) {
5277 		ret = PTR_ERR(ring);
5278 		goto error_deref_obj;
5279 	}
5280 
5281 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5282 	if (ret) {
5283 		drm_dbg(&engine->i915->drm,
5284 			"Failed to populate LRC: %d\n", ret);
5285 		goto error_ring_free;
5286 	}
5287 
5288 	ce->ring = ring;
5289 	ce->state = vma;
5290 
5291 	return 0;
5292 
5293 error_ring_free:
5294 	intel_ring_put(ring);
5295 error_deref_obj:
5296 	i915_gem_object_put(ctx_obj);
5297 	return ret;
5298 }
5299 
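/*
 * A virtual engine holds at most one ready request at a time; reuse the
 * request list of its default_priolist as that single-slot queue.
 */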
5300 static struct list_head *virtual_queue(struct virtual_engine *ve)
5301 {
5302 	return &ve->base.execlists.default_priolist.requests[0];
5303 }
5304 
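/*
 * Final release of a virtual context: unhook this virtual engine's node
 * from every sibling's rbtree (detachment is normally left to the execlists
 * tasklet), then tear down the context state, release the request pool and
 * free the bond array.
 */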
5305 static void virtual_context_destroy(struct kref *kref)
5306 {
5307 	struct virtual_engine *ve =
5308 		container_of(kref, typeof(*ve), context.ref);
5309 	unsigned int n;
5310 
5311 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5312 	GEM_BUG_ON(ve->request);
5313 	GEM_BUG_ON(ve->context.inflight);
5314 
5315 	for (n = 0; n < ve->num_siblings; n++) {
5316 		struct intel_engine_cs *sibling = ve->siblings[n];
5317 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5318 		unsigned long flags;
5319 
5320 		if (RB_EMPTY_NODE(node))
5321 			continue;
5322 
5323 		spin_lock_irqsave(&sibling->active.lock, flags);
5324 
5325 		/* Detachment is lazily performed in the execlists tasklet */
5326 		if (!RB_EMPTY_NODE(node))
5327 			rb_erase_cached(node, &sibling->execlists.virtual);
5328 
5329 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5330 	}
5331 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5332 
5333 	if (ve->context.state)
5334 		__execlists_context_fini(&ve->context);
5335 	intel_context_fini(&ve->context);
5336 
5337 	intel_engine_free_request_pool(&ve->base);
5338 
5339 	kfree(ve->bonds);
5340 	kfree(ve);
5341 }
5342 
5343 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5344 {
5345 	int swp;
5346 
5347 	/*
5348 	 * Pick a random sibling on starting to help spread the load around.
5349 	 *
5350 	 * New contexts are typically created with exactly the same order
5351 	 * of siblings, and often started in batches. Due to the way we iterate
5352 	 * the array of siblings when submitting requests, sibling[0] is
5353 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5354 	 * randomised across the system, we also help spread the load because
5355 	 * the first engine we inspect differs each time.
5356 	 *
5357 	 * NB: This does not force us to execute on this engine; it will just
5358 	 * typically be the first we inspect for submission.
5359 	 */
5360 	swp = prandom_u32_max(ve->num_siblings);
5361 	if (!swp)
5362 		return;
5363 
5364 	swap(ve->siblings[swp], ve->siblings[0]);
5365 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
5366 		virtual_update_register_offsets(ve->context.lrc_reg_state,
5367 						ve->siblings[0]);
5368 }
5369 
5370 static int virtual_context_alloc(struct intel_context *ce)
5371 {
5372 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5373 
5374 	return __execlists_context_alloc(ce, ve->siblings[0]);
5375 }
5376 
5377 static int virtual_context_pin(struct intel_context *ce)
5378 {
5379 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5380 	int err;
5381 
5382 	/* Note: we must use a real engine class for setting up reg state */
5383 	err = __execlists_context_pin(ce, ve->siblings[0]);
5384 	if (err)
5385 		return err;
5386 
5387 	virtual_engine_initial_hint(ve);
5388 	return 0;
5389 }
5390 
5391 static void virtual_context_enter(struct intel_context *ce)
5392 {
5393 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5394 	unsigned int n;
5395 
5396 	for (n = 0; n < ve->num_siblings; n++)
5397 		intel_engine_pm_get(ve->siblings[n]);
5398 
5399 	intel_timeline_enter(ce->timeline);
5400 }
5401 
5402 static void virtual_context_exit(struct intel_context *ce)
5403 {
5404 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5405 	unsigned int n;
5406 
5407 	intel_timeline_exit(ce->timeline);
5408 
5409 	for (n = 0; n < ve->num_siblings; n++)
5410 		intel_engine_pm_put(ve->siblings[n]);
5411 }
5412 
5413 static const struct intel_context_ops virtual_context_ops = {
5414 	.alloc = virtual_context_alloc,
5415 
5416 	.pin = virtual_context_pin,
5417 	.unpin = execlists_context_unpin,
5418 
5419 	.enter = virtual_context_enter,
5420 	.exit = virtual_context_exit,
5421 
5422 	.destroy = virtual_context_destroy,
5423 };
5424 
5425 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5426 {
5427 	struct i915_request *rq;
5428 	intel_engine_mask_t mask;
5429 
5430 	rq = READ_ONCE(ve->request);
5431 	if (!rq)
5432 		return 0;
5433 
5434 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5435 	mask = rq->execution_mask;
5436 	if (unlikely(!mask)) {
5437 		/* Invalid selection: flag the error and use an arbitrary sibling */
5438 		i915_request_set_error_once(rq, -ENODEV);
5439 		mask = ve->siblings[0]->mask;
5440 	}
5441 
5442 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5443 		     rq->fence.context, rq->fence.seqno,
5444 		     mask, ve->base.execlists.queue_priority_hint);
5445 
5446 	return mask;
5447 }
5448 
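/*
 * Offer the single pending virtual request to each allowed physical
 * sibling: insert (or reposition) this virtual engine's node in the
 * sibling's rbtree of virtual engines, ordered by priority, and kick the
 * sibling's execlists tasklet if the request outranks its current queue
 * priority hint.
 */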
5449 static void virtual_submission_tasklet(unsigned long data)
5450 {
5451 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5452 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5453 	intel_engine_mask_t mask;
5454 	unsigned int n;
5455 
5456 	rcu_read_lock();
5457 	mask = virtual_submission_mask(ve);
5458 	rcu_read_unlock();
5459 	if (unlikely(!mask))
5460 		return;
5461 
5462 	local_irq_disable();
5463 	for (n = 0; n < ve->num_siblings; n++) {
5464 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5465 		struct ve_node * const node = &ve->nodes[sibling->id];
5466 		struct rb_node **parent, *rb;
5467 		bool first;
5468 
5469 		if (!READ_ONCE(ve->request))
5470 			break; /* already handled by a sibling's tasklet */
5471 
5472 		if (unlikely(!(mask & sibling->mask))) {
5473 			if (!RB_EMPTY_NODE(&node->rb)) {
5474 				spin_lock(&sibling->active.lock);
5475 				rb_erase_cached(&node->rb,
5476 						&sibling->execlists.virtual);
5477 				RB_CLEAR_NODE(&node->rb);
5478 				spin_unlock(&sibling->active.lock);
5479 			}
5480 			continue;
5481 		}
5482 
5483 		spin_lock(&sibling->active.lock);
5484 
5485 		if (!RB_EMPTY_NODE(&node->rb)) {
5486 			/*
5487 			 * Cheat and avoid rebalancing the tree if we can
5488 			 * reuse this node in situ.
5489 			 */
5490 			first = rb_first_cached(&sibling->execlists.virtual) ==
5491 				&node->rb;
5492 			if (prio == node->prio || (prio > node->prio && first))
5493 				goto submit_engine;
5494 
5495 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5496 		}
5497 
5498 		rb = NULL;
5499 		first = true;
5500 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5501 		while (*parent) {
5502 			struct ve_node *other;
5503 
5504 			rb = *parent;
5505 			other = rb_entry(rb, typeof(*other), rb);
5506 			if (prio > other->prio) {
5507 				parent = &rb->rb_left;
5508 			} else {
5509 				parent = &rb->rb_right;
5510 				first = false;
5511 			}
5512 		}
5513 
5514 		rb_link_node(&node->rb, rb, parent);
5515 		rb_insert_color_cached(&node->rb,
5516 				       &sibling->execlists.virtual,
5517 				       first);
5518 
5519 submit_engine:
5520 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5521 		node->prio = prio;
5522 		if (first && prio > sibling->execlists.queue_priority_hint)
5523 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5524 
5525 		spin_unlock(&sibling->active.lock);
5526 	}
5527 	local_irq_enable();
5528 }
5529 
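/*
 * Submission hook for the virtual engine: requests that are already
 * complete (e.g. handed back by preempt-to-busy) are submitted in place;
 * otherwise the request is parked on the virtual queue and the tasklet
 * above is scheduled to offer it to the siblings.
 */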
5530 static void virtual_submit_request(struct i915_request *rq)
5531 {
5532 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5533 	struct i915_request *old;
5534 	unsigned long flags;
5535 
5536 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5537 		     rq->fence.context,
5538 		     rq->fence.seqno);
5539 
5540 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5541 
5542 	spin_lock_irqsave(&ve->base.active.lock, flags);
5543 
5544 	old = ve->request;
5545 	if (old) { /* background completion event from preempt-to-busy */
5546 		GEM_BUG_ON(!i915_request_completed(old));
5547 		__i915_request_submit(old);
5548 		i915_request_put(old);
5549 	}
5550 
5551 	if (i915_request_completed(rq)) {
5552 		__i915_request_submit(rq);
5553 
5554 		ve->base.execlists.queue_priority_hint = INT_MIN;
5555 		ve->request = NULL;
5556 	} else {
5557 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5558 		ve->request = i915_request_get(rq);
5559 
5560 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5561 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5562 
5563 		tasklet_schedule(&ve->base.execlists.tasklet);
5564 	}
5565 
5566 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5567 }
5568 
5569 static struct ve_bond *
5570 virtual_find_bond(struct virtual_engine *ve,
5571 		  const struct intel_engine_cs *master)
5572 {
5573 	int i;
5574 
5575 	for (i = 0; i < ve->num_bonds; i++) {
5576 		if (ve->bonds[i].master == master)
5577 			return &ve->bonds[i];
5578 	}
5579 
5580 	return NULL;
5581 }
5582 
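/*
 * Bonding hook (ve->base.bond_execute), given the fence of the master
 * request: narrow the bonded request's execution_mask so that it avoids
 * the master's engine and, if a bond was attached for that master, runs
 * only on the registered siblings; the master is in turn prevented from
 * being rerun on that set of engines.
 */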
5583 static void
5584 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5585 {
5586 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5587 	intel_engine_mask_t allowed, exec;
5588 	struct ve_bond *bond;
5589 
5590 	allowed = ~to_request(signal)->engine->mask;
5591 
5592 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5593 	if (bond)
5594 		allowed &= bond->sibling_mask;
5595 
5596 	/* Restrict the bonded request to run on only the available engines */
5597 	exec = READ_ONCE(rq->execution_mask);
5598 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5599 		;
5600 
5601 	/* Prevent the master from being re-run on the bonded engines */
5602 	to_request(signal)->execution_mask &= ~allowed;
5603 }
5604 
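/*
 * Create a virtual engine spanning @count physical @siblings. A single
 * sibling simply yields a normal context on that engine; otherwise a
 * struct virtual_engine is built with its own submission tasklet, the
 * emission vfuncs copied from the siblings (which must all share one
 * engine class) and an rbtree node per sibling for queueing.
 */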
5605 struct intel_context *
5606 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5607 			       unsigned int count)
5608 {
5609 	struct virtual_engine *ve;
5610 	unsigned int n;
5611 	int err;
5612 
5613 	if (count == 0)
5614 		return ERR_PTR(-EINVAL);
5615 
5616 	if (count == 1)
5617 		return intel_context_create(siblings[0]);
5618 
5619 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5620 	if (!ve)
5621 		return ERR_PTR(-ENOMEM);
5622 
5623 	ve->base.i915 = siblings[0]->i915;
5624 	ve->base.gt = siblings[0]->gt;
5625 	ve->base.uncore = siblings[0]->uncore;
5626 	ve->base.id = -1;
5627 
5628 	ve->base.class = OTHER_CLASS;
5629 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5630 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5631 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5632 
5633 	/*
5634 	 * The decision on whether to submit a request using semaphores
5635 	 * depends on the saturated state of the engine. We only compute
5636 	 * this during HW submission of the request, and we need this
5637 	 * state to be globally applied to all requests being submitted
5638 	 * to this engine. Virtual engines encompass more than one physical
5639 	 * engine and so we cannot accurately tell in advance if one of those
5640 	 * engines is already saturated and so cannot afford to use a semaphore
5641 	 * and be pessimized in priority for doing so -- if we are the only
5642 	 * context using semaphores after all other clients have stopped, we
5643 	 * will be starved on the saturated system. Such a global switch for
5644 	 * semaphores is less than ideal, but alas is the current compromise.
5645 	 */
5646 	ve->base.saturated = ALL_ENGINES;
5647 
5648 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5649 
5650 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5651 	intel_engine_init_breadcrumbs(&ve->base);
5652 	intel_engine_init_execlists(&ve->base);
5653 
5654 	ve->base.cops = &virtual_context_ops;
5655 	ve->base.request_alloc = execlists_request_alloc;
5656 
5657 	ve->base.schedule = i915_schedule;
5658 	ve->base.submit_request = virtual_submit_request;
5659 	ve->base.bond_execute = virtual_bond_execute;
5660 
5661 	INIT_LIST_HEAD(virtual_queue(ve));
5662 	ve->base.execlists.queue_priority_hint = INT_MIN;
5663 	tasklet_init(&ve->base.execlists.tasklet,
5664 		     virtual_submission_tasklet,
5665 		     (unsigned long)ve);
5666 
5667 	intel_context_init(&ve->context, &ve->base);
5668 
5669 	for (n = 0; n < count; n++) {
5670 		struct intel_engine_cs *sibling = siblings[n];
5671 
5672 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5673 		if (sibling->mask & ve->base.mask) {
5674 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5675 				  sibling->name);
5676 			err = -EINVAL;
5677 			goto err_put;
5678 		}
5679 
5680 		/*
5681 		 * The virtual engine implementation is tightly coupled to
5682 	 * the execlists backend -- we push requests directly
5683 		 * into a tree inside each physical engine. We could support
5684 		 * layering if we handle cloning of the requests and
5685 		 * submitting a copy into each backend.
5686 		 */
5687 		if (sibling->execlists.tasklet.func !=
5688 		    execlists_submission_tasklet) {
5689 			err = -ENODEV;
5690 			goto err_put;
5691 		}
5692 
5693 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5694 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5695 
5696 		ve->siblings[ve->num_siblings++] = sibling;
5697 		ve->base.mask |= sibling->mask;
5698 
5699 		/*
5700 		 * All physical engines must be compatible for their emission
5701 		 * functions (as we build the instructions during request
5702 		 * construction and do not alter them before submission
5703 		 * on the physical engine). We use the engine class as a guide
5704 		 * here, although that could be refined.
5705 		 */
5706 		if (ve->base.class != OTHER_CLASS) {
5707 			if (ve->base.class != sibling->class) {
5708 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5709 					  sibling->class, ve->base.class);
5710 				err = -EINVAL;
5711 				goto err_put;
5712 			}
5713 			continue;
5714 		}
5715 
5716 		ve->base.class = sibling->class;
5717 		ve->base.uabi_class = sibling->uabi_class;
5718 		snprintf(ve->base.name, sizeof(ve->base.name),
5719 			 "v%dx%d", ve->base.class, count);
5720 		ve->base.context_size = sibling->context_size;
5721 
5722 		ve->base.emit_bb_start = sibling->emit_bb_start;
5723 		ve->base.emit_flush = sibling->emit_flush;
5724 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5725 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5726 		ve->base.emit_fini_breadcrumb_dw =
5727 			sibling->emit_fini_breadcrumb_dw;
5728 
5729 		ve->base.flags = sibling->flags;
5730 	}
5731 
5732 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5733 
5734 	return &ve->context;
5735 
5736 err_put:
5737 	intel_context_put(&ve->context);
5738 	return ERR_PTR(err);
5739 }
5740 
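/*
 * Duplicate an existing virtual engine into a new context, copying both
 * its sibling set and any bonds attached to the source.
 */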
5741 struct intel_context *
5742 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5743 {
5744 	struct virtual_engine *se = to_virtual_engine(src);
5745 	struct intel_context *dst;
5746 
5747 	dst = intel_execlists_create_virtual(se->siblings,
5748 					     se->num_siblings);
5749 	if (IS_ERR(dst))
5750 		return dst;
5751 
5752 	if (se->num_bonds) {
5753 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5754 
5755 		de->bonds = kmemdup(se->bonds,
5756 				    sizeof(*se->bonds) * se->num_bonds,
5757 				    GFP_KERNEL);
5758 		if (!de->bonds) {
5759 			intel_context_put(dst);
5760 			return ERR_PTR(-ENOMEM);
5761 		}
5762 
5763 		de->num_bonds = se->num_bonds;
5764 	}
5765 
5766 	return dst;
5767 }
5768 
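/*
 * Record that requests bonded to @master may execute on @sibling: extend
 * an existing bond's sibling_mask, or grow the bond array with a new
 * entry. The sibling must already be part of the virtual engine.
 */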
5769 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5770 				     const struct intel_engine_cs *master,
5771 				     const struct intel_engine_cs *sibling)
5772 {
5773 	struct virtual_engine *ve = to_virtual_engine(engine);
5774 	struct ve_bond *bond;
5775 	int n;
5776 
5777 	/* Sanity check that the sibling is part of the virtual engine */
5778 	for (n = 0; n < ve->num_siblings; n++)
5779 		if (sibling == ve->siblings[n])
5780 			break;
5781 	if (n == ve->num_siblings)
5782 		return -EINVAL;
5783 
5784 	bond = virtual_find_bond(ve, master);
5785 	if (bond) {
5786 		bond->sibling_mask |= sibling->mask;
5787 		return 0;
5788 	}
5789 
5790 	bond = krealloc(ve->bonds,
5791 			sizeof(*bond) * (ve->num_bonds + 1),
5792 			GFP_KERNEL);
5793 	if (!bond)
5794 		return -ENOMEM;
5795 
5796 	bond[ve->num_bonds].master = master;
5797 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5798 
5799 	ve->bonds = bond;
5800 	ve->num_bonds++;
5801 
5802 	return 0;
5803 }
5804 
5805 struct intel_engine_cs *
5806 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5807 				 unsigned int sibling)
5808 {
5809 	struct virtual_engine *ve = to_virtual_engine(engine);
5810 
5811 	if (sibling >= ve->num_siblings)
5812 		return NULL;
5813 
5814 	return ve->siblings[sibling];
5815 }
5816 
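/*
 * Debug printer: dump up to @max requests from each of the engine's lists
 * (executing, queued and virtual), eliding the middle of a list that is
 * longer than @max while still showing its final entry.
 */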
5817 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5818 				   struct drm_printer *m,
5819 				   void (*show_request)(struct drm_printer *m,
5820 							struct i915_request *rq,
5821 							const char *prefix),
5822 				   unsigned int max)
5823 {
5824 	const struct intel_engine_execlists *execlists = &engine->execlists;
5825 	struct i915_request *rq, *last;
5826 	unsigned long flags;
5827 	unsigned int count;
5828 	struct rb_node *rb;
5829 
5830 	spin_lock_irqsave(&engine->active.lock, flags);
5831 
5832 	last = NULL;
5833 	count = 0;
5834 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5835 		if (count++ < max - 1)
5836 			show_request(m, rq, "\t\tE ");
5837 		else
5838 			last = rq;
5839 	}
5840 	if (last) {
5841 		if (count > max) {
5842 			drm_printf(m,
5843 				   "\t\t...skipping %d executing requests...\n",
5844 				   count - max);
5845 		}
5846 		show_request(m, last, "\t\tE ");
5847 	}
5848 
5849 	if (execlists->switch_priority_hint != INT_MIN)
5850 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5851 			   READ_ONCE(execlists->switch_priority_hint));
5852 	if (execlists->queue_priority_hint != INT_MIN)
5853 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5854 			   READ_ONCE(execlists->queue_priority_hint));
5855 
5856 	last = NULL;
5857 	count = 0;
5858 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5859 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5860 		int i;
5861 
5862 		priolist_for_each_request(rq, p, i) {
5863 			if (count++ < max - 1)
5864 				show_request(m, rq, "\t\tQ ");
5865 			else
5866 				last = rq;
5867 		}
5868 	}
5869 	if (last) {
5870 		if (count > max) {
5871 			drm_printf(m,
5872 				   "\t\t...skipping %d queued requests...\n",
5873 				   count - max);
5874 		}
5875 		show_request(m, last, "\t\tQ ");
5876 	}
5877 
5878 	last = NULL;
5879 	count = 0;
5880 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5881 		struct virtual_engine *ve =
5882 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5883 		struct i915_request *rq = READ_ONCE(ve->request);
5884 
5885 		if (rq) {
5886 			if (count++ < max - 1)
5887 				show_request(m, rq, "\t\tV ");
5888 			else
5889 				last = rq;
5890 		}
5891 	}
5892 	if (last) {
5893 		if (count > max) {
5894 			drm_printf(m,
5895 				   "\t\t...skipping %d virtual requests...\n",
5896 				   count - max);
5897 		}
5898 		show_request(m, last, "\t\tV ");
5899 	}
5900 
5901 	spin_unlock_irqrestore(&engine->active.lock, flags);
5902 }
5903 
5904 void intel_lr_context_reset(struct intel_engine_cs *engine,
5905 			    struct intel_context *ce,
5906 			    u32 head,
5907 			    bool scrub)
5908 {
5909 	GEM_BUG_ON(!intel_context_is_pinned(ce));
5910 
5911 	/*
5912 	 * We want a simple context + ring to execute the breadcrumb update.
5913 	 * We cannot rely on the context being intact across the GPU hang,
5914 	 * so clear it and rebuild just what we need for the breadcrumb.
5915 	 * All pending requests for this context will be zapped, and any
5916 	 * future request will be after userspace has had the opportunity
5917 	 * to recreate its own state.
5918 	 */
5919 	if (scrub)
5920 		restore_default_state(ce, engine);
5921 
5922 	/* Rerun the request; its payload has been neutered (if guilty). */
5923 	__execlists_update_reg_state(ce, engine, head);
5924 }
5925 
5926 bool
5927 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5928 {
5929 	return engine->set_default_submission ==
5930 	       intel_execlists_set_default_submission;
5931 }
5932 
5933 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5934 #include "selftest_lrc.c"
5935 #endif
5936