xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 5b7b41cb)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
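/*
 * Illustrative sketch of the pairing rule described above (not taken from
 * hardware documentation, just a worked example of the queue walk): if the
 * request queue holds requests from contexts A, A, A, B, C (head first),
 * the three A requests collapse into a single submission (only the final
 * tail matters), so the execution list written to the ELSP is {A, B}. If
 * the queue holds only A, A, the second port is submitted as NULL,
 * i.e. {A, NULL}.
 */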
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152 
153 #define RING_EXECLIST_QFULL		(1 << 0x2)
154 #define RING_EXECLIST1_VALID		(1 << 0x3)
155 #define RING_EXECLIST0_VALID		(1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
159 
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
166 
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169 
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171 
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID		0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
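/*
 * Worked example for the CSB helpers above (the dword value is made up):
 * given an upper csb dword of 0x03ff8000, FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK)
 * extracts bits 25:15 == 0x7ff == GEN12_IDLE_CTX_ID, so GEN12_CSB_CTX_VALID()
 * reports that no context is attached to the event; any other SW context id
 * in that field reads back as valid.
 */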
178 
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of the physical engines in sibling_mask.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
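/*
 * The lrc_ring_*() helpers below return the dword index within the context
 * image register state at which a register's MI_LOAD_REGISTER_IMM offset
 * entry lives (the value to be loaded sits at index + 1), or -1 if that
 * register is not part of the image for this engine/gen.
 */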
243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
244 {
245 	if (INTEL_GEN(engine->i915) >= 12)
246 		return 0x60;
247 	else if (INTEL_GEN(engine->i915) >= 9)
248 		return 0x54;
249 	else if (engine->class == RENDER_CLASS)
250 		return 0x58;
251 	else
252 		return -1;
253 }
254 
255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
256 {
257 	if (INTEL_GEN(engine->i915) >= 12)
258 		return 0x74;
259 	else if (INTEL_GEN(engine->i915) >= 9)
260 		return 0x68;
261 	else if (engine->class == RENDER_CLASS)
262 		return 0xd8;
263 	else
264 		return -1;
265 }
266 
267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
268 {
269 	if (INTEL_GEN(engine->i915) >= 12)
270 		return 0x12;
271 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
272 		return 0x18;
273 	else
274 		return -1;
275 }
276 
277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
278 {
279 	int x;
280 
281 	x = lrc_ring_wa_bb_per_ctx(engine);
282 	if (x < 0)
283 		return x;
284 
285 	return x + 2;
286 }
287 
288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
289 {
290 	int x;
291 
292 	x = lrc_ring_indirect_ptr(engine);
293 	if (x < 0)
294 		return x;
295 
296 	return x + 2;
297 }
298 
299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
300 {
301 	if (engine->class != RENDER_CLASS)
302 		return -1;
303 
304 	if (INTEL_GEN(engine->i915) >= 12)
305 		return 0xb6;
306 	else if (INTEL_GEN(engine->i915) >= 11)
307 		return 0xaa;
308 	else
309 		return -1;
310 }
311 
312 static u32
313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
314 {
315 	switch (INTEL_GEN(engine->i915)) {
316 	default:
317 		MISSING_CASE(INTEL_GEN(engine->i915));
318 		fallthrough;
319 	case 12:
320 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
321 	case 11:
322 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
323 	case 10:
324 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
325 	case 9:
326 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
327 	case 8:
328 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
329 	}
330 }
331 
332 static void
333 lrc_ring_setup_indirect_ctx(u32 *regs,
334 			    const struct intel_engine_cs *engine,
335 			    u32 ctx_bb_ggtt_addr,
336 			    u32 size)
337 {
338 	GEM_BUG_ON(!size);
339 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
340 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
341 	regs[lrc_ring_indirect_ptr(engine) + 1] =
342 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
343 
344 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
345 	regs[lrc_ring_indirect_offset(engine) + 1] =
346 		lrc_ring_indirect_offset_default(engine) << 6;
347 }
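/*
 * Worked example for the encoding above (addresses are illustrative): a
 * 128 byte per-context batch at GGTT address 0x1000 is encoded as
 * 0x1000 | (128 / CACHELINE_BYTES) == 0x1002 in the INDIRECT_CTX entry,
 * i.e. the cacheline-aligned address shares the dword with the size in
 * cachelines, while the companion INDIRECT_CTX_OFFSET entry carries the
 * per-gen default offset shifted into place (<< 6).
 */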
348 
349 static u32 intel_context_get_runtime(const struct intel_context *ce)
350 {
351 	/*
352 	 * We can use either ppHWSP[16] which is recorded before the context
353 	 * switch (and so excludes the cost of context switches) or use the
354 	 * value from the context image itself, which is saved/restored earlier
355 	 * and so includes the cost of the save.
356 	 */
357 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
358 }
359 
360 static void mark_eio(struct i915_request *rq)
361 {
362 	if (i915_request_completed(rq))
363 		return;
364 
365 	GEM_BUG_ON(i915_request_signaled(rq));
366 
367 	i915_request_set_error_once(rq, -EIO);
368 	i915_request_mark_complete(rq);
369 }
370 
371 static struct i915_request *
372 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
373 {
374 	struct i915_request *active = rq;
375 
376 	rcu_read_lock();
377 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
378 		if (i915_request_completed(rq))
379 			break;
380 
381 		active = rq;
382 	}
383 	rcu_read_unlock();
384 
385 	return active;
386 }
387 
388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
389 {
390 	return (i915_ggtt_offset(engine->status_page.vma) +
391 		I915_GEM_HWS_PREEMPT_ADDR);
392 }
393 
394 static inline void
395 ring_set_paused(const struct intel_engine_cs *engine, int state)
396 {
397 	/*
398 	 * We inspect HWS_PREEMPT with a semaphore inside
399 	 * engine->emit_fini_breadcrumb. If the dword is true,
400 	 * the ring is paused as the semaphore will busywait
401 	 * until the dword is false.
402 	 */
403 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
404 	if (state)
405 		wmb();
406 }
407 
408 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
409 {
410 	return rb_entry(rb, struct i915_priolist, node);
411 }
412 
413 static inline int rq_prio(const struct i915_request *rq)
414 {
415 	return READ_ONCE(rq->sched.attr.priority);
416 }
417 
418 static int effective_prio(const struct i915_request *rq)
419 {
420 	int prio = rq_prio(rq);
421 
422 	/*
423 	 * If this request is special and must not be interrupted at any
424 	 * cost, so be it. Note we are only checking the most recent request
425 	 * in the context and so may be masking an earlier vip request. It
426 	 * is hoped that under the conditions where nopreempt is used, this
427 	 * will not matter (i.e. all requests to that context will be
428 	 * nopreempt for as long as desired).
429 	 */
430 	if (i915_request_has_nopreempt(rq))
431 		prio = I915_PRIORITY_UNPREEMPTABLE;
432 
433 	return prio;
434 }
435 
436 static int queue_prio(const struct intel_engine_execlists *execlists)
437 {
438 	struct i915_priolist *p;
439 	struct rb_node *rb;
440 
441 	rb = rb_first_cached(&execlists->queue);
442 	if (!rb)
443 		return INT_MIN;
444 
445 	/*
446 	 * As the priolist[] is inverted, with the highest priority in [0],
447 	 * we have to flip the index value back into a priority.
448 	 */
449 	p = to_priolist(rb);
450 	if (!I915_USER_PRIORITY_SHIFT)
451 		return p->priority;
452 
453 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
454 }
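/*
 * Worked example of the flattening above (assuming, for illustration only,
 * an I915_USER_PRIORITY_SHIFT of 2, i.e. four internal sub-levels per user
 * priority): a priolist at user priority 0 with only its highest sub-level
 * occupied (p->used == 0x1, ffs() == 1) yields ((0 + 1) << 2) - 1 == 3,
 * the top of that user level's range; with only the lowest sub-level
 * occupied (p->used == 0x8, ffs() == 4) it yields 0, the bottom.
 */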
455 
456 static inline bool need_preempt(const struct intel_engine_cs *engine,
457 				const struct i915_request *rq,
458 				struct rb_node *rb)
459 {
460 	int last_prio;
461 
462 	if (!intel_engine_has_semaphores(engine))
463 		return false;
464 
465 	/*
466 	 * Check if the current priority hint merits a preemption attempt.
467 	 *
468 	 * We record the highest priority value we saw during rescheduling
469 	 * prior to this dequeue, therefore we know that if it is strictly
470 	 * less than the current tail of ELSP[0], we do not need to force
471 	 * a preempt-to-idle cycle.
472 	 *
473 	 * However, the priority hint is a mere hint that we may need to
474 	 * preempt. If that hint is stale or we may be trying to preempt
475 	 * ourselves, ignore the request.
476 	 *
477 	 * More naturally we would write
478 	 *      prio >= max(0, last);
479 	 * except that we wish to prevent triggering preemption at the same
480 	 * priority level: the task that is running should remain running
481 	 * to preserve FIFO ordering of dependencies.
482 	 */
483 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
484 	if (engine->execlists.queue_priority_hint <= last_prio)
485 		return false;
486 
487 	/*
488 	 * Check against the first request in ELSP[1], it will, thanks to the
489 	 * power of PI, be the highest priority of that context.
490 	 */
491 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
492 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
493 		return true;
494 
495 	if (rb) {
496 		struct virtual_engine *ve =
497 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
498 		bool preempt = false;
499 
500 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
501 			struct i915_request *next;
502 
503 			rcu_read_lock();
504 			next = READ_ONCE(ve->request);
505 			if (next)
506 				preempt = rq_prio(next) > last_prio;
507 			rcu_read_unlock();
508 		}
509 
510 		if (preempt)
511 			return preempt;
512 	}
513 
514 	/*
515 	 * If the inflight context did not trigger the preemption, then maybe
516 	 * it was the set of queued requests? Pick the highest priority in
517 	 * the queue (the first active priolist) and see if it deserves to be
518 	 * running instead of ELSP[0].
519 	 *
520 	 * The highest priority request in the queue cannot be either
521 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
522 	 * context, its priority would not exceed ELSP[0] aka last_prio.
523 	 */
524 	return queue_prio(&engine->execlists) > last_prio;
525 }
526 
527 __maybe_unused static inline bool
528 assert_priority_queue(const struct i915_request *prev,
529 		      const struct i915_request *next)
530 {
531 	/*
532 	 * Without preemption, the prev may refer to the still active element
533 	 * which we refuse to let go.
534 	 *
535 	 * Even with preemption, there are times when we think it is better not
536 	 * to preempt and leave an ostensibly lower priority request in flight.
537 	 */
538 	if (i915_request_is_active(prev))
539 		return true;
540 
541 	return rq_prio(prev) >= rq_prio(next);
542 }
543 
544 /*
545  * The context descriptor encodes various attributes of a context,
546  * including its GTT address and some flags. Because it's fairly
547  * expensive to calculate, we'll just do it once and cache the result,
548  * which remains valid until the context is unpinned.
549  *
550  * This is what a descriptor looks like, from LSB to MSB::
551  *
552  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
553  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
554  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
555  *      bits 53-54:    mbz, reserved for use by hardware
556  *      bits 55-63:    group ID, currently unused and set to 0
557  *
558  * Starting from Gen11, the upper dword of the descriptor has a new format:
559  *
560  *      bits 32-36:    reserved
561  *      bits 37-47:    SW context ID
562  *      bits 48:53:    engine instance
563  *      bit 54:        mbz, reserved for use by hardware
564  *      bits 55-60:    SW counter
565  *      bits 61-63:    engine class
566  *
567  * engine info, SW context ID and SW counter need to form a unique number
568  * (Context ID) per lrc.
569  */
570 static u32
571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
572 {
573 	u32 desc;
574 
575 	desc = INTEL_LEGACY_32B_CONTEXT;
576 	if (i915_vm_is_4lvl(ce->vm))
577 		desc = INTEL_LEGACY_64B_CONTEXT;
578 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
579 
580 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
581 	if (IS_GEN(engine->i915, 8))
582 		desc |= GEN8_CTX_L3LLC_COHERENT;
583 
584 	return i915_ggtt_offset(ce->state) | desc;
585 }
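/*
 * Note on the composition above: ce->state is page aligned, so
 * i915_ggtt_offset(ce->state) has its low 12 bits clear and the GEN8_CTX_*
 * flag bits (which all live below bit 12, per the layout documented above)
 * can simply be OR'ed into the LRCA without colliding with it.
 */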
586 
587 static inline unsigned int dword_in_page(void *addr)
588 {
589 	return offset_in_page(addr) / sizeof(u32);
590 }
591 
592 static void set_offsets(u32 *regs,
593 			const u8 *data,
594 			const struct intel_engine_cs *engine,
595 			bool clear)
596 #define NOP(x) (BIT(7) | (x))
597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
598 #define POSTED BIT(0)
599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
600 #define REG16(x) \
601 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
602 	(((x) >> 2) & 0x7f)
603 #define END(total_state_size) 0, (total_state_size)
604 {
605 	const u32 base = engine->mmio_base;
606 
607 	while (*data) {
608 		u8 count, flags;
609 
610 		if (*data & BIT(7)) { /* skip */
611 			count = *data++ & ~BIT(7);
612 			if (clear)
613 				memset32(regs, MI_NOOP, count);
614 			regs += count;
615 			continue;
616 		}
617 
618 		count = *data & 0x3f;
619 		flags = *data >> 6;
620 		data++;
621 
622 		*regs = MI_LOAD_REGISTER_IMM(count);
623 		if (flags & POSTED)
624 			*regs |= MI_LRI_FORCE_POSTED;
625 		if (INTEL_GEN(engine->i915) >= 11)
626 			*regs |= MI_LRI_LRM_CS_MMIO;
627 		regs++;
628 
629 		GEM_BUG_ON(!count);
630 		do {
631 			u32 offset = 0;
632 			u8 v;
633 
634 			do {
635 				v = *data++;
636 				offset <<= 7;
637 				offset |= v & ~BIT(7);
638 			} while (v & BIT(7));
639 
640 			regs[0] = base + (offset << 2);
641 			if (clear)
642 				regs[1] = 0;
643 			regs += 2;
644 		} while (--count);
645 	}
646 
647 	if (clear) {
648 		u8 count = *++data;
649 
650 		/* Clear past the tail for HW access */
651 		GEM_BUG_ON(dword_in_page(regs) > count);
652 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
653 
654 		/* Close the batch; used mainly by live_lrc_layout() */
655 		*regs = MI_BATCH_BUFFER_END;
656 		if (INTEL_GEN(engine->i915) >= 10)
657 			*regs |= BIT(0);
658 	}
659 }
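/*
 * Worked decode of the packed table format consumed by set_offsets() above
 * (a sketch, not one of the real tables below): the byte sequence
 *
 *	NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034)
 *
 * tells set_offsets() to skip one dword of the image, emit
 * MI_LOAD_REGISTER_IMM(2) with MI_LRI_FORCE_POSTED, and then fill in the
 * register offsets mmio_base + 0x244 and mmio_base + 0x034 (each followed
 * by its value slot). REG16() splits offsets that do not fit in 7 bits over
 * two bytes using BIT(7) as a continuation flag, exactly as the inner
 * do/while loop reassembles them.
 */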
660 
661 static const u8 gen8_xcs_offsets[] = {
662 	NOP(1),
663 	LRI(11, 0),
664 	REG16(0x244),
665 	REG(0x034),
666 	REG(0x030),
667 	REG(0x038),
668 	REG(0x03c),
669 	REG(0x168),
670 	REG(0x140),
671 	REG(0x110),
672 	REG(0x11c),
673 	REG(0x114),
674 	REG(0x118),
675 
676 	NOP(9),
677 	LRI(9, 0),
678 	REG16(0x3a8),
679 	REG16(0x28c),
680 	REG16(0x288),
681 	REG16(0x284),
682 	REG16(0x280),
683 	REG16(0x27c),
684 	REG16(0x278),
685 	REG16(0x274),
686 	REG16(0x270),
687 
688 	NOP(13),
689 	LRI(2, 0),
690 	REG16(0x200),
691 	REG(0x028),
692 
693 	END(80)
694 };
695 
696 static const u8 gen9_xcs_offsets[] = {
697 	NOP(1),
698 	LRI(14, POSTED),
699 	REG16(0x244),
700 	REG(0x034),
701 	REG(0x030),
702 	REG(0x038),
703 	REG(0x03c),
704 	REG(0x168),
705 	REG(0x140),
706 	REG(0x110),
707 	REG(0x11c),
708 	REG(0x114),
709 	REG(0x118),
710 	REG(0x1c0),
711 	REG(0x1c4),
712 	REG(0x1c8),
713 
714 	NOP(3),
715 	LRI(9, POSTED),
716 	REG16(0x3a8),
717 	REG16(0x28c),
718 	REG16(0x288),
719 	REG16(0x284),
720 	REG16(0x280),
721 	REG16(0x27c),
722 	REG16(0x278),
723 	REG16(0x274),
724 	REG16(0x270),
725 
726 	NOP(13),
727 	LRI(1, POSTED),
728 	REG16(0x200),
729 
730 	NOP(13),
731 	LRI(44, POSTED),
732 	REG(0x028),
733 	REG(0x09c),
734 	REG(0x0c0),
735 	REG(0x178),
736 	REG(0x17c),
737 	REG16(0x358),
738 	REG(0x170),
739 	REG(0x150),
740 	REG(0x154),
741 	REG(0x158),
742 	REG16(0x41c),
743 	REG16(0x600),
744 	REG16(0x604),
745 	REG16(0x608),
746 	REG16(0x60c),
747 	REG16(0x610),
748 	REG16(0x614),
749 	REG16(0x618),
750 	REG16(0x61c),
751 	REG16(0x620),
752 	REG16(0x624),
753 	REG16(0x628),
754 	REG16(0x62c),
755 	REG16(0x630),
756 	REG16(0x634),
757 	REG16(0x638),
758 	REG16(0x63c),
759 	REG16(0x640),
760 	REG16(0x644),
761 	REG16(0x648),
762 	REG16(0x64c),
763 	REG16(0x650),
764 	REG16(0x654),
765 	REG16(0x658),
766 	REG16(0x65c),
767 	REG16(0x660),
768 	REG16(0x664),
769 	REG16(0x668),
770 	REG16(0x66c),
771 	REG16(0x670),
772 	REG16(0x674),
773 	REG16(0x678),
774 	REG16(0x67c),
775 	REG(0x068),
776 
777 	END(176)
778 };
779 
780 static const u8 gen12_xcs_offsets[] = {
781 	NOP(1),
782 	LRI(13, POSTED),
783 	REG16(0x244),
784 	REG(0x034),
785 	REG(0x030),
786 	REG(0x038),
787 	REG(0x03c),
788 	REG(0x168),
789 	REG(0x140),
790 	REG(0x110),
791 	REG(0x1c0),
792 	REG(0x1c4),
793 	REG(0x1c8),
794 	REG(0x180),
795 	REG16(0x2b4),
796 
797 	NOP(5),
798 	LRI(9, POSTED),
799 	REG16(0x3a8),
800 	REG16(0x28c),
801 	REG16(0x288),
802 	REG16(0x284),
803 	REG16(0x280),
804 	REG16(0x27c),
805 	REG16(0x278),
806 	REG16(0x274),
807 	REG16(0x270),
808 
809 	END(80)
810 };
811 
812 static const u8 gen8_rcs_offsets[] = {
813 	NOP(1),
814 	LRI(14, POSTED),
815 	REG16(0x244),
816 	REG(0x034),
817 	REG(0x030),
818 	REG(0x038),
819 	REG(0x03c),
820 	REG(0x168),
821 	REG(0x140),
822 	REG(0x110),
823 	REG(0x11c),
824 	REG(0x114),
825 	REG(0x118),
826 	REG(0x1c0),
827 	REG(0x1c4),
828 	REG(0x1c8),
829 
830 	NOP(3),
831 	LRI(9, POSTED),
832 	REG16(0x3a8),
833 	REG16(0x28c),
834 	REG16(0x288),
835 	REG16(0x284),
836 	REG16(0x280),
837 	REG16(0x27c),
838 	REG16(0x278),
839 	REG16(0x274),
840 	REG16(0x270),
841 
842 	NOP(13),
843 	LRI(1, 0),
844 	REG(0x0c8),
845 
846 	END(80)
847 };
848 
849 static const u8 gen9_rcs_offsets[] = {
850 	NOP(1),
851 	LRI(14, POSTED),
852 	REG16(0x244),
853 	REG(0x34),
854 	REG(0x30),
855 	REG(0x38),
856 	REG(0x3c),
857 	REG(0x168),
858 	REG(0x140),
859 	REG(0x110),
860 	REG(0x11c),
861 	REG(0x114),
862 	REG(0x118),
863 	REG(0x1c0),
864 	REG(0x1c4),
865 	REG(0x1c8),
866 
867 	NOP(3),
868 	LRI(9, POSTED),
869 	REG16(0x3a8),
870 	REG16(0x28c),
871 	REG16(0x288),
872 	REG16(0x284),
873 	REG16(0x280),
874 	REG16(0x27c),
875 	REG16(0x278),
876 	REG16(0x274),
877 	REG16(0x270),
878 
879 	NOP(13),
880 	LRI(1, 0),
881 	REG(0xc8),
882 
883 	NOP(13),
884 	LRI(44, POSTED),
885 	REG(0x28),
886 	REG(0x9c),
887 	REG(0xc0),
888 	REG(0x178),
889 	REG(0x17c),
890 	REG16(0x358),
891 	REG(0x170),
892 	REG(0x150),
893 	REG(0x154),
894 	REG(0x158),
895 	REG16(0x41c),
896 	REG16(0x600),
897 	REG16(0x604),
898 	REG16(0x608),
899 	REG16(0x60c),
900 	REG16(0x610),
901 	REG16(0x614),
902 	REG16(0x618),
903 	REG16(0x61c),
904 	REG16(0x620),
905 	REG16(0x624),
906 	REG16(0x628),
907 	REG16(0x62c),
908 	REG16(0x630),
909 	REG16(0x634),
910 	REG16(0x638),
911 	REG16(0x63c),
912 	REG16(0x640),
913 	REG16(0x644),
914 	REG16(0x648),
915 	REG16(0x64c),
916 	REG16(0x650),
917 	REG16(0x654),
918 	REG16(0x658),
919 	REG16(0x65c),
920 	REG16(0x660),
921 	REG16(0x664),
922 	REG16(0x668),
923 	REG16(0x66c),
924 	REG16(0x670),
925 	REG16(0x674),
926 	REG16(0x678),
927 	REG16(0x67c),
928 	REG(0x68),
929 
930 	END(176)
931 };
932 
933 static const u8 gen11_rcs_offsets[] = {
934 	NOP(1),
935 	LRI(15, POSTED),
936 	REG16(0x244),
937 	REG(0x034),
938 	REG(0x030),
939 	REG(0x038),
940 	REG(0x03c),
941 	REG(0x168),
942 	REG(0x140),
943 	REG(0x110),
944 	REG(0x11c),
945 	REG(0x114),
946 	REG(0x118),
947 	REG(0x1c0),
948 	REG(0x1c4),
949 	REG(0x1c8),
950 	REG(0x180),
951 
952 	NOP(1),
953 	LRI(9, POSTED),
954 	REG16(0x3a8),
955 	REG16(0x28c),
956 	REG16(0x288),
957 	REG16(0x284),
958 	REG16(0x280),
959 	REG16(0x27c),
960 	REG16(0x278),
961 	REG16(0x274),
962 	REG16(0x270),
963 
964 	LRI(1, POSTED),
965 	REG(0x1b0),
966 
967 	NOP(10),
968 	LRI(1, 0),
969 	REG(0x0c8),
970 
971 	END(80)
972 };
973 
974 static const u8 gen12_rcs_offsets[] = {
975 	NOP(1),
976 	LRI(13, POSTED),
977 	REG16(0x244),
978 	REG(0x034),
979 	REG(0x030),
980 	REG(0x038),
981 	REG(0x03c),
982 	REG(0x168),
983 	REG(0x140),
984 	REG(0x110),
985 	REG(0x1c0),
986 	REG(0x1c4),
987 	REG(0x1c8),
988 	REG(0x180),
989 	REG16(0x2b4),
990 
991 	NOP(5),
992 	LRI(9, POSTED),
993 	REG16(0x3a8),
994 	REG16(0x28c),
995 	REG16(0x288),
996 	REG16(0x284),
997 	REG16(0x280),
998 	REG16(0x27c),
999 	REG16(0x278),
1000 	REG16(0x274),
1001 	REG16(0x270),
1002 
1003 	LRI(3, POSTED),
1004 	REG(0x1b0),
1005 	REG16(0x5a8),
1006 	REG16(0x5ac),
1007 
1008 	NOP(6),
1009 	LRI(1, 0),
1010 	REG(0x0c8),
1011 	NOP(3 + 9 + 1),
1012 
1013 	LRI(51, POSTED),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG16(0x588),
1020 	REG(0x028),
1021 	REG(0x09c),
1022 	REG(0x0c0),
1023 	REG(0x178),
1024 	REG(0x17c),
1025 	REG16(0x358),
1026 	REG(0x170),
1027 	REG(0x150),
1028 	REG(0x154),
1029 	REG(0x158),
1030 	REG16(0x41c),
1031 	REG16(0x600),
1032 	REG16(0x604),
1033 	REG16(0x608),
1034 	REG16(0x60c),
1035 	REG16(0x610),
1036 	REG16(0x614),
1037 	REG16(0x618),
1038 	REG16(0x61c),
1039 	REG16(0x620),
1040 	REG16(0x624),
1041 	REG16(0x628),
1042 	REG16(0x62c),
1043 	REG16(0x630),
1044 	REG16(0x634),
1045 	REG16(0x638),
1046 	REG16(0x63c),
1047 	REG16(0x640),
1048 	REG16(0x644),
1049 	REG16(0x648),
1050 	REG16(0x64c),
1051 	REG16(0x650),
1052 	REG16(0x654),
1053 	REG16(0x658),
1054 	REG16(0x65c),
1055 	REG16(0x660),
1056 	REG16(0x664),
1057 	REG16(0x668),
1058 	REG16(0x66c),
1059 	REG16(0x670),
1060 	REG16(0x674),
1061 	REG16(0x678),
1062 	REG16(0x67c),
1063 	REG(0x068),
1064 	REG(0x084),
1065 	NOP(1),
1066 
1067 	END(192)
1068 };
1069 
1070 #undef END
1071 #undef REG16
1072 #undef REG
1073 #undef LRI
1074 #undef NOP
1075 
1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1077 {
1078 	/*
1079 	 * The gen12+ lists only have the registers we program in the basic
1080 	 * default state. We rely on the context image using relative
1081 	 * addressing to automatically fix up the register state between the
1082 	 * physical engines for the virtual engine.
1083 	 */
1084 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1085 		   !intel_engine_has_relative_mmio(engine));
1086 
1087 	if (engine->class == RENDER_CLASS) {
1088 		if (INTEL_GEN(engine->i915) >= 12)
1089 			return gen12_rcs_offsets;
1090 		else if (INTEL_GEN(engine->i915) >= 11)
1091 			return gen11_rcs_offsets;
1092 		else if (INTEL_GEN(engine->i915) >= 9)
1093 			return gen9_rcs_offsets;
1094 		else
1095 			return gen8_rcs_offsets;
1096 	} else {
1097 		if (INTEL_GEN(engine->i915) >= 12)
1098 			return gen12_xcs_offsets;
1099 		else if (INTEL_GEN(engine->i915) >= 9)
1100 			return gen9_xcs_offsets;
1101 		else
1102 			return gen8_xcs_offsets;
1103 	}
1104 }
1105 
1106 static struct i915_request *
1107 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 {
1109 	struct i915_request *rq, *rn, *active = NULL;
1110 	struct list_head *pl;
1111 	int prio = I915_PRIORITY_INVALID;
1112 
1113 	lockdep_assert_held(&engine->active.lock);
1114 
1115 	list_for_each_entry_safe_reverse(rq, rn,
1116 					 &engine->active.requests,
1117 					 sched.link) {
1118 		if (i915_request_completed(rq))
1119 			continue; /* XXX */
1120 
1121 		__i915_request_unsubmit(rq);
1122 
1123 		/*
1124 		 * Push the request back into the queue for later resubmission.
1125 		 * If this request is not native to this physical engine (i.e.
1126 		 * it came from a virtual source), push it back onto the virtual
1127 		 * engine so that it can be moved across onto another physical
1128 		 * engine as load dictates.
1129 		 */
1130 		if (likely(rq->execution_mask == engine->mask)) {
1131 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1132 			if (rq_prio(rq) != prio) {
1133 				prio = rq_prio(rq);
1134 				pl = i915_sched_lookup_priolist(engine, prio);
1135 			}
1136 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137 
1138 			list_move(&rq->sched.link, pl);
1139 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140 
1141 			/* Check in case we roll back so far that we wrap [size/2] */
1142 			if (intel_ring_direction(rq->ring,
1143 						 rq->tail,
1144 						 rq->ring->tail + 8) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			WRITE_ONCE(rq->engine, owner);
1152 			owner->submit_request(rq);
1153 			active = NULL;
1154 		}
1155 	}
1156 
1157 	return active;
1158 }
1159 
1160 struct i915_request *
1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1162 {
1163 	struct intel_engine_cs *engine =
1164 		container_of(execlists, typeof(*engine), execlists);
1165 
1166 	return __unwind_incomplete_requests(engine);
1167 }
1168 
1169 static inline void
1170 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1171 {
1172 	/*
1173 	 * This is currently only used when GVT-g is enabled. When GVT-g is
1174 	 * disabled, the compiler should eliminate this function as dead code.
1175 	 */
1176 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1177 		return;
1178 
1179 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1180 				   status, rq);
1181 }
1182 
1183 static void intel_engine_context_in(struct intel_engine_cs *engine)
1184 {
1185 	unsigned long flags;
1186 
1187 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1188 		return;
1189 
1190 	write_seqlock_irqsave(&engine->stats.lock, flags);
1191 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1192 		engine->stats.start = ktime_get();
1193 		atomic_inc(&engine->stats.active);
1194 	}
1195 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1196 }
1197 
1198 static void intel_engine_context_out(struct intel_engine_cs *engine)
1199 {
1200 	unsigned long flags;
1201 
1202 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1203 
1204 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1205 		return;
1206 
1207 	write_seqlock_irqsave(&engine->stats.lock, flags);
1208 	if (atomic_dec_and_test(&engine->stats.active)) {
1209 		engine->stats.total =
1210 			ktime_add(engine->stats.total,
1211 				  ktime_sub(ktime_get(), engine->stats.start));
1212 	}
1213 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1214 }
1215 
1216 static void
1217 execlists_check_context(const struct intel_context *ce,
1218 			const struct intel_engine_cs *engine)
1219 {
1220 	const struct intel_ring *ring = ce->ring;
1221 	u32 *regs = ce->lrc_reg_state;
1222 	bool valid = true;
1223 	int x;
1224 
1225 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1226 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1227 		       engine->name,
1228 		       regs[CTX_RING_START],
1229 		       i915_ggtt_offset(ring->vma));
1230 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1231 		valid = false;
1232 	}
1233 
1234 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1235 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1236 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1237 		       engine->name,
1238 		       regs[CTX_RING_CTL],
1239 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1240 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1241 		valid = false;
1242 	}
1243 
1244 	x = lrc_ring_mi_mode(engine);
1245 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1246 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1247 		       engine->name, regs[x + 1]);
1248 		regs[x + 1] &= ~STOP_RING;
1249 		regs[x + 1] |= STOP_RING << 16;
1250 		valid = false;
1251 	}
1252 
1253 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1254 }
1255 
1256 static void restore_default_state(struct intel_context *ce,
1257 				  struct intel_engine_cs *engine)
1258 {
1259 	u32 *regs;
1260 
1261 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1262 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1263 
1264 	ce->runtime.last = intel_context_get_runtime(ce);
1265 }
1266 
1267 static void reset_active(struct i915_request *rq,
1268 			 struct intel_engine_cs *engine)
1269 {
1270 	struct intel_context * const ce = rq->context;
1271 	u32 head;
1272 
1273 	/*
1274 	 * The executing context has been cancelled. We want to prevent
1275 	 * further execution along this context and propagate the error on
1276 	 * to anything depending on its results.
1277 	 *
1278 	 * In __i915_request_submit(), we apply the -EIO and remove the
1279 	 * requests' payloads for any banned requests. But first, we must
1280 	 * rewind the context back to the start of the incomplete request so
1281 	 * that we do not jump back into the middle of the batch.
1282 	 *
1283 	 * We preserve the breadcrumbs and semaphores of the incomplete
1284 	 * requests so that inter-timeline dependencies (i.e other timelines)
1285 	 * remain correctly ordered. And we defer to __i915_request_submit()
1286 	 * so that all asynchronous waits are correctly handled.
1287 	 */
1288 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1289 		     rq->fence.context, rq->fence.seqno);
1290 
1291 	/* On resubmission of the active request, payload will be scrubbed */
1292 	if (i915_request_completed(rq))
1293 		head = rq->tail;
1294 	else
1295 		head = active_request(ce->timeline, rq)->head;
1296 	head = intel_ring_wrap(ce->ring, head);
1297 
1298 	/* Scrub the context image to prevent replaying the previous batch */
1299 	restore_default_state(ce, engine);
1300 	__execlists_update_reg_state(ce, engine, head);
1301 
1302 	/* We've switched away, so this should be a no-op, but intent matters */
1303 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1304 }
1305 
1306 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1307 {
1308 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1309 	ce->runtime.num_underflow += dt < 0;
1310 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1311 #endif
1312 }
1313 
1314 static void intel_context_update_runtime(struct intel_context *ce)
1315 {
1316 	u32 old;
1317 	s32 dt;
1318 
1319 	if (intel_context_is_barrier(ce))
1320 		return;
1321 
1322 	old = ce->runtime.last;
1323 	ce->runtime.last = intel_context_get_runtime(ce);
1324 	dt = ce->runtime.last - old;
1325 
1326 	if (unlikely(dt <= 0)) {
1327 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1328 			 old, ce->runtime.last, dt);
1329 		st_update_runtime_underflow(ce, dt);
1330 		return;
1331 	}
1332 
1333 	ewma_runtime_add(&ce->runtime.avg, dt);
1334 	ce->runtime.total += dt;
1335 }
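/*
 * Example of the bookkeeping above (tick values are illustrative):
 * CTX_TIMESTAMP is an accumulating hardware timestamp, so if the previous
 * sample was 1000 and the image now reads 1500, dt == 500 ticks is folded
 * into the ewma and the running total; a smaller reading (e.g. after the
 * image has been scrubbed back to defaults) takes the underflow path
 * instead of corrupting the average.
 */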
1336 
1337 static inline struct intel_engine_cs *
1338 __execlists_schedule_in(struct i915_request *rq)
1339 {
1340 	struct intel_engine_cs * const engine = rq->engine;
1341 	struct intel_context * const ce = rq->context;
1342 
1343 	intel_context_get(ce);
1344 
1345 	if (unlikely(intel_context_is_banned(ce)))
1346 		reset_active(rq, engine);
1347 
1348 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1349 		execlists_check_context(ce, engine);
1350 
1351 	if (ce->tag) {
1352 		/* Use a fixed tag for OA and friends */
1353 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1354 		ce->lrc.ccid = ce->tag;
1355 	} else {
1356 		/* We don't need a strict matching tag, just different values */
1357 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1358 
1359 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1360 		clear_bit(tag - 1, &engine->context_tag);
1361 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1362 
1363 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1364 	}
1365 
1366 	ce->lrc.ccid |= engine->execlists.ccid;
1367 
1368 	__intel_gt_pm_get(engine->gt);
1369 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1370 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1371 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1372 	intel_engine_context_in(engine);
1373 
1374 	return engine;
1375 }
1376 
1377 static inline struct i915_request *
1378 execlists_schedule_in(struct i915_request *rq, int idx)
1379 {
1380 	struct intel_context * const ce = rq->context;
1381 	struct intel_engine_cs *old;
1382 
1383 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1384 	trace_i915_request_in(rq, idx);
1385 
1386 	old = READ_ONCE(ce->inflight);
1387 	do {
1388 		if (!old) {
1389 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1390 			break;
1391 		}
1392 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1393 
1394 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1395 	return i915_request_get(rq);
1396 }
1397 
1398 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1399 {
1400 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1401 	struct i915_request *next = READ_ONCE(ve->request);
1402 
1403 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1404 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1405 }
1406 
1407 static inline void
1408 __execlists_schedule_out(struct i915_request *rq,
1409 			 struct intel_engine_cs * const engine,
1410 			 unsigned int ccid)
1411 {
1412 	struct intel_context * const ce = rq->context;
1413 
1414 	/*
1415 	 * NB process_csb() is not under the engine->active.lock and hence
1416 	 * schedule_out can race with schedule_in meaning that we should
1417 	 * refrain from doing non-trivial work here.
1418 	 */
1419 
1420 	/*
1421 	 * If we have just completed this context, the engine may now be
1422 	 * idle and we want to re-enter powersaving.
1423 	 */
1424 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1425 	    i915_request_completed(rq))
1426 		intel_engine_add_retire(engine, ce->timeline);
1427 
1428 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1429 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1430 	if (ccid < BITS_PER_LONG) {
1431 		GEM_BUG_ON(ccid == 0);
1432 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1433 		set_bit(ccid - 1, &engine->context_tag);
1434 	}
1435 
1436 	intel_context_update_runtime(ce);
1437 	intel_engine_context_out(engine);
1438 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1439 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1440 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1441 	intel_gt_pm_put_async(engine->gt);
1442 
1443 	/*
1444 	 * If this is part of a virtual engine, its next request may
1445 	 * have been blocked waiting for access to the active context.
1446 	 * We have to kick all the siblings again in case we need to
1447 	 * switch (e.g. the next request is not runnable on this
1448 	 * engine). Hopefully, we will already have submitted the next
1449 	 * request before the tasklet runs and do not need to rebuild
1450 	 * each virtual tree and kick everyone again.
1451 	 */
1452 	if (ce->engine != engine)
1453 		kick_siblings(rq, ce);
1454 
1455 	intel_context_put(ce);
1456 }
1457 
1458 static inline void
1459 execlists_schedule_out(struct i915_request *rq)
1460 {
1461 	struct intel_context * const ce = rq->context;
1462 	struct intel_engine_cs *cur, *old;
1463 	u32 ccid;
1464 
1465 	trace_i915_request_out(rq);
1466 
1467 	ccid = rq->context->lrc.ccid;
1468 	old = READ_ONCE(ce->inflight);
1469 	do
1470 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1471 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1472 	if (!cur)
1473 		__execlists_schedule_out(rq, old, ccid);
1474 
1475 	i915_request_put(rq);
1476 }
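/*
 * A note on the ce->inflight dance above: the pointer to the engine is
 * stored with a small submission count packed into its low (alignment)
 * bits. The first schedule_in publishes the bare engine pointer, a second
 * submission of the same context (e.g. a lite restore occupying both ELSP
 * ports) bumps the count via ptr_inc(), and schedule_out decrements it;
 * only the final decrement (cur == NULL) runs __execlists_schedule_out()
 * and drops the context reference.
 */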
1477 
1478 static u64 execlists_update_context(struct i915_request *rq)
1479 {
1480 	struct intel_context *ce = rq->context;
1481 	u64 desc = ce->lrc.desc;
1482 	u32 tail, prev;
1483 
1484 	/*
1485 	 * WaIdleLiteRestore:bdw,skl
1486 	 *
1487 	 * We should never submit the context with the same RING_TAIL twice
1488 	 * just in case we submit an empty ring, which confuses the HW.
1489 	 *
1490 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1491 	 * the normal request to be able to always advance the RING_TAIL on
1492 	 * subsequent resubmissions (for lite restore). Should that fail us,
1493 	 * and we try and submit the same tail again, force the context
1494 	 * reload.
1495 	 *
1496 	 * If we need to return to a preempted context, we need to skip the
1497 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1498 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1499 	 * an earlier request.
1500 	 */
1501 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1502 	prev = rq->ring->tail;
1503 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1504 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1505 		desc |= CTX_DESC_FORCE_RESTORE;
1506 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1507 	rq->tail = rq->wa_tail;
1508 
1509 	/*
1510 	 * Make sure the context image is complete before we submit it to HW.
1511 	 *
1512 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1513 	 * an uncached write such as our mmio register access, the empirical
1514 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1515 	 * may not be visible to the HW prior to the completion of the UC
1516 	 * register write and that we may begin execution from the context
1517 	 * before its image is complete leading to invalid PD chasing.
1518 	 */
1519 	wmb();
1520 
1521 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1522 	return desc;
1523 }
1524 
1525 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1526 {
1527 	if (execlists->ctrl_reg) {
1528 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1529 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1530 	} else {
1531 		writel(upper_32_bits(desc), execlists->submit_reg);
1532 		writel(lower_32_bits(desc), execlists->submit_reg);
1533 	}
1534 }
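/*
 * write_desc() usage note, as exercised by execlists_submit_ports() below:
 * with a submit queue and control register (gen11+, where execlists->ctrl_reg
 * is set), each 64b descriptor is written to its port's ELSQ and only the
 * later EL_CTRL_LOAD write commits the whole list; on older parts the
 * descriptor is written straight to the ELSP, upper dword first, with the
 * lower dword write triggering submission.
 */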
1535 
1536 static __maybe_unused char *
1537 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1538 {
1539 	if (!rq)
1540 		return "";
1541 
1542 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1543 		 prefix,
1544 		 rq->context->lrc.ccid,
1545 		 rq->fence.context, rq->fence.seqno,
1546 		 i915_request_completed(rq) ? "!" :
1547 		 i915_request_started(rq) ? "*" :
1548 		 "",
1549 		 rq_prio(rq));
1550 
1551 	return buf;
1552 }
1553 
1554 static __maybe_unused void
1555 trace_ports(const struct intel_engine_execlists *execlists,
1556 	    const char *msg,
1557 	    struct i915_request * const *ports)
1558 {
1559 	const struct intel_engine_cs *engine =
1560 		container_of(execlists, typeof(*engine), execlists);
1561 	char __maybe_unused p0[40], p1[40];
1562 
1563 	if (!ports[0])
1564 		return;
1565 
1566 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1567 		     dump_port(p0, sizeof(p0), "", ports[0]),
1568 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1569 }
1570 
1571 static inline bool
1572 reset_in_progress(const struct intel_engine_execlists *execlists)
1573 {
1574 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1575 }
1576 
1577 static __maybe_unused bool
1578 assert_pending_valid(const struct intel_engine_execlists *execlists,
1579 		     const char *msg)
1580 {
1581 	struct intel_engine_cs *engine =
1582 		container_of(execlists, typeof(*engine), execlists);
1583 	struct i915_request * const *port, *rq;
1584 	struct intel_context *ce = NULL;
1585 	bool sentinel = false;
1586 	u32 ccid = -1;
1587 
1588 	trace_ports(execlists, msg, execlists->pending);
1589 
1590 	/* We may be messing around with the lists during reset, lalala */
1591 	if (reset_in_progress(execlists))
1592 		return true;
1593 
1594 	if (!execlists->pending[0]) {
1595 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1596 			      engine->name);
1597 		return false;
1598 	}
1599 
1600 	if (execlists->pending[execlists_num_ports(execlists)]) {
1601 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1602 			      engine->name, execlists_num_ports(execlists));
1603 		return false;
1604 	}
1605 
1606 	for (port = execlists->pending; (rq = *port); port++) {
1607 		unsigned long flags;
1608 		bool ok = true;
1609 
1610 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1611 		GEM_BUG_ON(!i915_request_is_active(rq));
1612 
1613 		if (ce == rq->context) {
1614 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1615 				      engine->name,
1616 				      ce->timeline->fence_context,
1617 				      port - execlists->pending);
1618 			return false;
1619 		}
1620 		ce = rq->context;
1621 
1622 		if (ccid == ce->lrc.ccid) {
1623 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1624 				      engine->name,
1625 				      ccid, ce->timeline->fence_context,
1626 				      port - execlists->pending);
1627 			return false;
1628 		}
1629 		ccid = ce->lrc.ccid;
1630 
1631 		/*
1632 		 * Sentinels are supposed to be the last request so they flush
1633 		 * the current execution off the HW. Check that they are the only
1634 		 * request in the pending submission.
1635 		 */
1636 		if (sentinel) {
1637 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1638 				      engine->name,
1639 				      ce->timeline->fence_context,
1640 				      port - execlists->pending);
1641 			return false;
1642 		}
1643 		sentinel = i915_request_has_sentinel(rq);
1644 
1645 		/* Hold tightly onto the lock to prevent concurrent retires! */
1646 		if (!spin_trylock_irqsave(&rq->lock, flags))
1647 			continue;
1648 
1649 		if (i915_request_completed(rq))
1650 			goto unlock;
1651 
1652 		if (i915_active_is_idle(&ce->active) &&
1653 		    !intel_context_is_barrier(ce)) {
1654 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1655 				      engine->name,
1656 				      ce->timeline->fence_context,
1657 				      port - execlists->pending);
1658 			ok = false;
1659 			goto unlock;
1660 		}
1661 
1662 		if (!i915_vma_is_pinned(ce->state)) {
1663 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1664 				      engine->name,
1665 				      ce->timeline->fence_context,
1666 				      port - execlists->pending);
1667 			ok = false;
1668 			goto unlock;
1669 		}
1670 
1671 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1672 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1673 				      engine->name,
1674 				      ce->timeline->fence_context,
1675 				      port - execlists->pending);
1676 			ok = false;
1677 			goto unlock;
1678 		}
1679 
1680 unlock:
1681 		spin_unlock_irqrestore(&rq->lock, flags);
1682 		if (!ok)
1683 			return false;
1684 	}
1685 
1686 	return ce;
1687 }
1688 
1689 static void execlists_submit_ports(struct intel_engine_cs *engine)
1690 {
1691 	struct intel_engine_execlists *execlists = &engine->execlists;
1692 	unsigned int n;
1693 
1694 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1695 
1696 	/*
1697 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1698 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1699 	 * not be relinquished until the device is idle (see
1700 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1701 	 * that all ELSP are drained i.e. we have processed the CSB,
1702 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1703 	 */
1704 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1705 
1706 	/*
1707 	 * ELSQ note: the submit queue is not cleared after being submitted
1708 	 * to the HW so we need to make sure we always clean it up. This is
1709 	 * currently ensured by the fact that we always write the same number
1710 	 * of elsq entries, keep this in mind before changing the loop below.
1711 	 */
1712 	for (n = execlists_num_ports(execlists); n--; ) {
1713 		struct i915_request *rq = execlists->pending[n];
1714 
1715 		write_desc(execlists,
1716 			   rq ? execlists_update_context(rq) : 0,
1717 			   n);
1718 	}
1719 
1720 	/* we need to manually load the submit queue */
1721 	if (execlists->ctrl_reg)
1722 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1723 }
1724 
1725 static bool ctx_single_port_submission(const struct intel_context *ce)
1726 {
1727 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1728 		intel_context_force_single_submission(ce));
1729 }
1730 
1731 static bool can_merge_ctx(const struct intel_context *prev,
1732 			  const struct intel_context *next)
1733 {
1734 	if (prev != next)
1735 		return false;
1736 
1737 	if (ctx_single_port_submission(prev))
1738 		return false;
1739 
1740 	return true;
1741 }
1742 
1743 static unsigned long i915_request_flags(const struct i915_request *rq)
1744 {
1745 	return READ_ONCE(rq->fence.flags);
1746 }
1747 
1748 static bool can_merge_rq(const struct i915_request *prev,
1749 			 const struct i915_request *next)
1750 {
1751 	GEM_BUG_ON(prev == next);
1752 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1753 
1754 	/*
1755 	 * We do not submit known completed requests. Therefore if the next
1756 	 * request is already completed, we can pretend to merge it in
1757 	 * with the previous context (and we will skip updating the ELSP
1758 	 * and tracking). Thus hopefully keeping the ELSP full with active
1759 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1760 	 * us.
1761 	 */
1762 	if (i915_request_completed(next))
1763 		return true;
1764 
1765 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1766 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1767 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1768 		return false;
1769 
1770 	if (!can_merge_ctx(prev->context, next->context))
1771 		return false;
1772 
1773 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1774 	return true;
1775 }
1776 
1777 static void virtual_update_register_offsets(u32 *regs,
1778 					    struct intel_engine_cs *engine)
1779 {
1780 	set_offsets(regs, reg_offsets(engine), engine, false);
1781 }
1782 
1783 static bool virtual_matches(const struct virtual_engine *ve,
1784 			    const struct i915_request *rq,
1785 			    const struct intel_engine_cs *engine)
1786 {
1787 	const struct intel_engine_cs *inflight;
1788 
1789 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1790 		return false;
1791 
1792 	/*
1793 	 * We track when the HW has completed saving the context image
1794 	 * (i.e. when we have seen the final CS event switching out of
1795 	 * the context) and must not overwrite the context image before
1796 	 * then. This restricts us to only using the active engine
1797 	 * while the previous virtualized request is inflight (so
1798 	 * we reuse the register offsets). This is a very small
1799 	 * hysteresis on the greedy selection algorithm.
1800 	 */
1801 	inflight = intel_context_inflight(&ve->context);
1802 	if (inflight && inflight != engine)
1803 		return false;
1804 
1805 	return true;
1806 }
1807 
1808 static void virtual_xfer_context(struct virtual_engine *ve,
1809 				 struct intel_engine_cs *engine)
1810 {
1811 	unsigned int n;
1812 
1813 	if (likely(engine == ve->siblings[0]))
1814 		return;
1815 
1816 	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1817 	if (!intel_engine_has_relative_mmio(engine))
1818 		virtual_update_register_offsets(ve->context.lrc_reg_state,
1819 						engine);
1820 
1821 	/*
1822 	 * Move the bound engine to the top of the list for
1823 	 * future execution. We then kick this tasklet first
1824 	 * before checking others, so that we preferentially
1825 	 * reuse this set of bound registers.
1826 	 */
1827 	for (n = 1; n < ve->num_siblings; n++) {
1828 		if (ve->siblings[n] == engine) {
1829 			swap(ve->siblings[n], ve->siblings[0]);
1830 			break;
1831 		}
1832 	}
1833 }
1834 
1835 #define for_each_waiter(p__, rq__) \
1836 	list_for_each_entry_lockless(p__, \
1837 				     &(rq__)->sched.waiters_list, \
1838 				     wait_link)
1839 
1840 #define for_each_signaler(p__, rq__) \
1841 	list_for_each_entry_rcu(p__, \
1842 				&(rq__)->sched.signalers_list, \
1843 				signal_link)
1844 
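/*
 * Push the interrupted request to the back of its priority level, and
 * drag along any ready same-engine waiters so that dependency ordering
 * is preserved on the next dequeue.
 */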
1845 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1846 {
1847 	LIST_HEAD(list);
1848 
1849 	/*
1850 	 * We want to move the interrupted request to the back of
1851 	 * the round-robin list (i.e. its priority level), but
1852 	 * in doing so, we must then move all requests that were in
1853 	 * flight and were waiting for the interrupted request to
1854 	 * be run after it again.
1855 	 */
1856 	do {
1857 		struct i915_dependency *p;
1858 
1859 		GEM_BUG_ON(i915_request_is_active(rq));
1860 		list_move_tail(&rq->sched.link, pl);
1861 
1862 		for_each_waiter(p, rq) {
1863 			struct i915_request *w =
1864 				container_of(p->waiter, typeof(*w), sched);
1865 
1866 			if (p->flags & I915_DEPENDENCY_WEAK)
1867 				continue;
1868 
1869 			/* Leave semaphores spinning on the other engines */
1870 			if (w->engine != rq->engine)
1871 				continue;
1872 
1873 			/* No waiter should start before its signaler */
1874 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1875 				   i915_request_started(w) &&
1876 				   !i915_request_completed(rq));
1877 
1878 			GEM_BUG_ON(i915_request_is_active(w));
1879 			if (!i915_request_is_ready(w))
1880 				continue;
1881 
1882 			if (rq_prio(w) < rq_prio(rq))
1883 				continue;
1884 
1885 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1886 			list_move_tail(&w->sched.link, &list);
1887 		}
1888 
1889 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1890 	} while (rq);
1891 }
1892 
1893 static void defer_active(struct intel_engine_cs *engine)
1894 {
1895 	struct i915_request *rq;
1896 
1897 	rq = __unwind_incomplete_requests(engine);
1898 	if (!rq)
1899 		return;
1900 
1901 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1902 }
1903 
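/*
 * Only timeslice the active request if something else of equal or
 * higher effective priority is waiting to run: either the next request
 * queued on this engine or a matching virtual engine request.
 */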
1904 static bool
1905 need_timeslice(const struct intel_engine_cs *engine,
1906 	       const struct i915_request *rq,
1907 	       const struct rb_node *rb)
1908 {
1909 	int hint;
1910 
1911 	if (!intel_engine_has_timeslices(engine))
1912 		return false;
1913 
1914 	hint = engine->execlists.queue_priority_hint;
1915 
1916 	if (rb) {
1917 		const struct virtual_engine *ve =
1918 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1919 		const struct intel_engine_cs *inflight =
1920 			intel_context_inflight(&ve->context);
1921 
1922 		if (!inflight || inflight == engine) {
1923 			struct i915_request *next;
1924 
1925 			rcu_read_lock();
1926 			next = READ_ONCE(ve->request);
1927 			if (next)
1928 				hint = max(hint, rq_prio(next));
1929 			rcu_read_unlock();
1930 		}
1931 	}
1932 
1933 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1934 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1935 
1936 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1937 	return hint >= effective_prio(rq);
1938 }
1939 
1940 static bool
1941 timeslice_yield(const struct intel_engine_execlists *el,
1942 		const struct i915_request *rq)
1943 {
1944 	/*
1945 	 * Once bitten, forever smitten!
1946 	 *
1947 	 * If the active context ever busy-waited on a semaphore,
1948 	 * it will be treated as a hog until the end of its timeslice (i.e.
1949 	 * until it is scheduled out and replaced by a new submission,
1950 	 * possibly even its own lite-restore). The HW only sends an interrupt
1951 	 * on the first miss, and we do know if that semaphore has been
1952 	 * on the first miss, and we do not know if that semaphore has been
1953 	 * safe, yield if it might be stuck -- it will be given a fresh
1954 	 * timeslice in the near future.
1955 	 */
1956 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1957 }
1958 
1959 static bool
1960 timeslice_expired(const struct intel_engine_execlists *el,
1961 		  const struct i915_request *rq)
1962 {
1963 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1964 }
1965 
1966 static int
1967 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1968 {
1969 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1970 		return engine->execlists.queue_priority_hint;
1971 
1972 	return rq_prio(list_next_entry(rq, sched.link));
1973 }
1974 
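/* Timeslice interval in ms, taken from the engine's tunable properties. */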
1975 static inline unsigned long
1976 timeslice(const struct intel_engine_cs *engine)
1977 {
1978 	return READ_ONCE(engine->props.timeslice_duration_ms);
1979 }
1980 
1981 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1982 {
1983 	const struct intel_engine_execlists *execlists = &engine->execlists;
1984 	const struct i915_request *rq = *execlists->active;
1985 
1986 	if (!rq || i915_request_completed(rq))
1987 		return 0;
1988 
1989 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1990 		return 0;
1991 
1992 	return timeslice(engine);
1993 }
1994 
1995 static void set_timeslice(struct intel_engine_cs *engine)
1996 {
1997 	unsigned long duration;
1998 
1999 	if (!intel_engine_has_timeslices(engine))
2000 		return;
2001 
2002 	duration = active_timeslice(engine);
2003 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2004 
2005 	set_timer_ms(&engine->execlists.timer, duration);
2006 }
2007 
2008 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2009 {
2010 	struct intel_engine_execlists *execlists = &engine->execlists;
2011 	unsigned long duration;
2012 
2013 	if (!intel_engine_has_timeslices(engine))
2014 		return;
2015 
2016 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2017 	if (prio == INT_MIN)
2018 		return;
2019 
2020 	if (timer_pending(&execlists->timer))
2021 		return;
2022 
2023 	duration = timeslice(engine);
2024 	ENGINE_TRACE(engine,
2025 		     "start timeslicing, prio:%d, interval:%lu",
2026 		     prio, duration);
2027 
2028 	set_timer_ms(&execlists->timer, duration);
2029 }
2030 
2031 static void record_preemption(struct intel_engine_execlists *execlists)
2032 {
2033 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2034 }
2035 
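/*
 * Preempt timeout for the active request: banned contexts are reset
 * almost immediately, everyone else honours the engine's
 * preempt_timeout_ms property.
 */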
2036 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2037 					    const struct i915_request *rq)
2038 {
2039 	if (!rq)
2040 		return 0;
2041 
2042 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2043 	if (unlikely(intel_context_is_banned(rq->context)))
2044 		return 1;
2045 
2046 	return READ_ONCE(engine->props.preempt_timeout_ms);
2047 }
2048 
2049 static void set_preempt_timeout(struct intel_engine_cs *engine,
2050 				const struct i915_request *rq)
2051 {
2052 	if (!intel_engine_has_preempt_reset(engine))
2053 		return;
2054 
2055 	set_timer_ms(&engine->execlists.preempt,
2056 		     active_preempt_timeout(engine, rq));
2057 }
2058 
2059 static inline void clear_ports(struct i915_request **ports, int count)
2060 {
2061 	memset_p((void **)ports, NULL, count);
2062 }
2063 
2064 static inline void
2065 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2066 {
2067 	/* A memcpy_p() would be very useful here! */
2068 	while (count--)
2069 		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2070 }
2071 
2072 static void execlists_dequeue(struct intel_engine_cs *engine)
2073 {
2074 	struct intel_engine_execlists * const execlists = &engine->execlists;
2075 	struct i915_request **port = execlists->pending;
2076 	struct i915_request ** const last_port = port + execlists->port_mask;
2077 	struct i915_request * const *active;
2078 	struct i915_request *last;
2079 	struct rb_node *rb;
2080 	bool submit = false;
2081 
2082 	/*
2083 	 * Hardware submission is through 2 ports. Conceptually each port
2084 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2085 	 * static for a context, and unique to each, so we only execute
2086 	 * requests belonging to a single context from each ring. RING_HEAD
2087 	 * is maintained by the CS in the context image, it marks the place
2088 	 * where it got up to last time, and through RING_TAIL we tell the CS
2089 	 * where we want to execute up to this time.
2090 	 *
2091 	 * In this list the requests are in order of execution. Consecutive
2092 	 * requests from the same context are adjacent in the ringbuffer. We
2093 	 * can combine these requests into a single RING_TAIL update:
2094 	 *
2095 	 *              RING_HEAD...req1...req2
2096 	 *                                    ^- RING_TAIL
2097 	 * since to execute req2 the CS must first execute req1.
2098 	 *
2099 	 * Our goal then is to point each port at the end of a consecutive
2100 	 * sequence of requests, as that is the optimal (fewest wakeups and
2101 	 * context switches) submission.
2102 	 */
2103 
2104 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2105 		struct virtual_engine *ve =
2106 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2107 		struct i915_request *rq = READ_ONCE(ve->request);
2108 
2109 		if (!rq) { /* lazily cleanup after another engine handled rq */
2110 			rb_erase_cached(rb, &execlists->virtual);
2111 			RB_CLEAR_NODE(rb);
2112 			rb = rb_first_cached(&execlists->virtual);
2113 			continue;
2114 		}
2115 
2116 		if (!virtual_matches(ve, rq, engine)) {
2117 			rb = rb_next(rb);
2118 			continue;
2119 		}
2120 
2121 		break;
2122 	}
2123 
2124 	/*
2125 	 * If the queue is higher priority than the last
2126 	 * request in the currently active context, submit afresh.
2127 	 * We will resubmit again afterwards in case we need to split
2128 	 * the active context to interject the preemption request,
2129 	 * i.e. we will retrigger preemption following the ack in case
2130 	 * of trouble.
2131 	 */
2132 	active = READ_ONCE(execlists->active);
2133 
2134 	/*
2135 	 * In theory we can skip over completed contexts that have not
2136 	 * yet been processed by events (as those events are in flight):
2137 	 *
2138 	 * while ((last = *active) && i915_request_completed(last))
2139 	 *	active++;
2140 	 *
2141 	 * However, the GPU cannot handle this as it will ultimately
2142 	 * find itself trying to jump back into a context it has just
2143 	 * completed and barf.
2144 	 */
2145 
2146 	if ((last = *active)) {
2147 		if (need_preempt(engine, last, rb)) {
2148 			if (i915_request_completed(last)) {
2149 				tasklet_hi_schedule(&execlists->tasklet);
2150 				return;
2151 			}
2152 
2153 			ENGINE_TRACE(engine,
2154 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2155 				     last->fence.context,
2156 				     last->fence.seqno,
2157 				     last->sched.attr.priority,
2158 				     execlists->queue_priority_hint);
2159 			record_preemption(execlists);
2160 
2161 			/*
2162 			 * Don't let the RING_HEAD advance past the breadcrumb
2163 			 * as we unwind (and until we resubmit) so that we do
2164 			 * not accidentally tell it to go backwards.
2165 			 */
2166 			ring_set_paused(engine, 1);
2167 
2168 			/*
2169 			 * Note that we have not stopped the GPU at this point,
2170 			 * so we are unwinding the incomplete requests as they
2171 			 * remain inflight and so by the time we do complete
2172 			 * the preemption, some of the unwound requests may
2173 			 * complete!
2174 			 */
2175 			__unwind_incomplete_requests(engine);
2176 
2177 			last = NULL;
2178 		} else if (need_timeslice(engine, last, rb) &&
2179 			   timeslice_expired(execlists, last)) {
2180 			if (i915_request_completed(last)) {
2181 				tasklet_hi_schedule(&execlists->tasklet);
2182 				return;
2183 			}
2184 
2185 			ENGINE_TRACE(engine,
2186 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2187 				     last->fence.context,
2188 				     last->fence.seqno,
2189 				     last->sched.attr.priority,
2190 				     execlists->queue_priority_hint,
2191 				     yesno(timeslice_yield(execlists, last)));
2192 
2193 			ring_set_paused(engine, 1);
2194 			defer_active(engine);
2195 
2196 			/*
2197 			 * Unlike for preemption, if we rewind and continue
2198 			 * executing the same context as previously active,
2199 			 * the order of execution will remain the same and
2200 			 * the tail will only advance. We do not need to
2201 			 * force a full context restore, as a lite-restore
2202 			 * is sufficient to resample the monotonic TAIL.
2203 			 *
2204 			 * If we switch to any other context, similarly we
2205 			 * will not rewind TAIL of current context, and
2206 			 * normal save/restore will preserve state and allow
2207 			 * us to later continue executing the same request.
2208 			 */
2209 			last = NULL;
2210 		} else {
2211 			/*
2212 			 * Otherwise if we already have a request pending
2213 			 * for execution after the current one, we can
2214 			 * just wait until the next CS event before
2215 			 * queuing more. In either case we will force a
2216 			 * lite-restore preemption event, but if we wait
2217 			 * we hopefully coalesce several updates into a single
2218 			 * submission.
2219 			 */
2220 			if (!list_is_last(&last->sched.link,
2221 					  &engine->active.requests)) {
2222 				/*
2223 				 * Even if ELSP[1] is occupied and not worthy
2224 				 * of timeslices, our queue might be.
2225 				 */
2226 				start_timeslice(engine, queue_prio(execlists));
2227 				return;
2228 			}
2229 		}
2230 	}
2231 
2232 	while (rb) { /* XXX virtual is always taking precedence */
2233 		struct virtual_engine *ve =
2234 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2235 		struct i915_request *rq;
2236 
2237 		spin_lock(&ve->base.active.lock);
2238 
2239 		rq = ve->request;
2240 		if (unlikely(!rq)) { /* lost the race to a sibling */
2241 			spin_unlock(&ve->base.active.lock);
2242 			rb_erase_cached(rb, &execlists->virtual);
2243 			RB_CLEAR_NODE(rb);
2244 			rb = rb_first_cached(&execlists->virtual);
2245 			continue;
2246 		}
2247 
2248 		GEM_BUG_ON(rq != ve->request);
2249 		GEM_BUG_ON(rq->engine != &ve->base);
2250 		GEM_BUG_ON(rq->context != &ve->context);
2251 
2252 		if (rq_prio(rq) >= queue_prio(execlists)) {
2253 			if (!virtual_matches(ve, rq, engine)) {
2254 				spin_unlock(&ve->base.active.lock);
2255 				rb = rb_next(rb);
2256 				continue;
2257 			}
2258 
2259 			if (last && !can_merge_rq(last, rq)) {
2260 				spin_unlock(&ve->base.active.lock);
2261 				start_timeslice(engine, rq_prio(rq));
2262 				return; /* leave this for another sibling */
2263 			}
2264 
2265 			ENGINE_TRACE(engine,
2266 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2267 				     rq->fence.context,
2268 				     rq->fence.seqno,
2269 				     i915_request_completed(rq) ? "!" :
2270 				     i915_request_started(rq) ? "*" :
2271 				     "",
2272 				     yesno(engine != ve->siblings[0]));
2273 
2274 			WRITE_ONCE(ve->request, NULL);
2275 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2276 				   INT_MIN);
2277 			rb_erase_cached(rb, &execlists->virtual);
2278 			RB_CLEAR_NODE(rb);
2279 
2280 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2281 			WRITE_ONCE(rq->engine, engine);
2282 
2283 			if (__i915_request_submit(rq)) {
2284 				/*
2285 				 * Only after we confirm that we will submit
2286 				 * this request (i.e. it has not already
2287 				 * completed), do we want to update the context.
2288 				 *
2289 				 * This serves two purposes. It avoids
2290 				 * unnecessary work if we are resubmitting an
2291 				 * already completed request after timeslicing.
2292 				 * But more importantly, it prevents us altering
2293 				 * ve->siblings[] on an idle context, where
2294 				 * we may be using ve->siblings[] in
2295 				 * virtual_context_enter / virtual_context_exit.
2296 				 */
2297 				virtual_xfer_context(ve, engine);
2298 				GEM_BUG_ON(ve->siblings[0] != engine);
2299 
2300 				submit = true;
2301 				last = rq;
2302 			}
2303 			i915_request_put(rq);
2304 
2305 			/*
2306 			 * Hmm, we have a bunch of virtual engine requests,
2307 			 * but the first one was already completed (thanks
2308 			 * preempt-to-busy!). Keep looking at the veng queue
2309 			 * until we have no more relevant requests (i.e.
2310 			 * the normal submit queue has higher priority).
2311 			 */
2312 			if (!submit) {
2313 				spin_unlock(&ve->base.active.lock);
2314 				rb = rb_first_cached(&execlists->virtual);
2315 				continue;
2316 			}
2317 		}
2318 
2319 		spin_unlock(&ve->base.active.lock);
2320 		break;
2321 	}
2322 
2323 	while ((rb = rb_first_cached(&execlists->queue))) {
2324 		struct i915_priolist *p = to_priolist(rb);
2325 		struct i915_request *rq, *rn;
2326 		int i;
2327 
2328 		priolist_for_each_request_consume(rq, rn, p, i) {
2329 			bool merge = true;
2330 
2331 			/*
2332 			 * Can we combine this request with the current port?
2333 			 * It has to be the same context/ringbuffer and not
2334 			 * have any exceptions (e.g. GVT saying never to
2335 			 * combine contexts).
2336 			 *
2337 			 * If we can combine the requests, we can execute both
2338 			 * by updating the RING_TAIL to point to the end of the
2339 			 * second request, and so we never need to tell the
2340 			 * hardware about the first.
2341 			 */
2342 			if (last && !can_merge_rq(last, rq)) {
2343 				/*
2344 				 * If we are on the second port and cannot
2345 				 * combine this request with the last, then we
2346 				 * are done.
2347 				 */
2348 				if (port == last_port)
2349 					goto done;
2350 
2351 				/*
2352 				 * We must not populate both ELSP[] with the
2353 				 * same LRCA, i.e. we must submit 2 different
2354 				 * contexts if we submit 2 ELSP.
2355 				 */
2356 				if (last->context == rq->context)
2357 					goto done;
2358 
2359 				if (i915_request_has_sentinel(last))
2360 					goto done;
2361 
2362 				/*
2363 				 * If GVT overrides us we only ever submit
2364 				 * port[0], leaving port[1] empty. Note that we
2365 				 * also have to be careful that we don't queue
2366 				 * the same context (even though a different
2367 				 * request) to the second port.
2368 				 */
2369 				if (ctx_single_port_submission(last->context) ||
2370 				    ctx_single_port_submission(rq->context))
2371 					goto done;
2372 
2373 				merge = false;
2374 			}
2375 
2376 			if (__i915_request_submit(rq)) {
2377 				if (!merge) {
2378 					*port = execlists_schedule_in(last, port - execlists->pending);
2379 					port++;
2380 					last = NULL;
2381 				}
2382 
2383 				GEM_BUG_ON(last &&
2384 					   !can_merge_ctx(last->context,
2385 							  rq->context));
2386 				GEM_BUG_ON(last &&
2387 					   i915_seqno_passed(last->fence.seqno,
2388 							     rq->fence.seqno));
2389 
2390 				submit = true;
2391 				last = rq;
2392 			}
2393 		}
2394 
2395 		rb_erase_cached(&p->node, &execlists->queue);
2396 		i915_priolist_free(p);
2397 	}
2398 
2399 done:
2400 	/*
2401 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2402 	 *
2403 	 * We choose the priority hint such that if we add a request of greater
2404 	 * priority than this, we kick the submission tasklet to decide on
2405 	 * the right order of submitting the requests to hardware. We must
2406 	 * also be prepared to reorder requests as they are in-flight on the
2407 	 * HW. We derive the priority hint then as the first "hole" in
2408 	 * the HW submission ports and if there are no available slots,
2409 	 * the priority of the lowest executing request, i.e. last.
2410 	 *
2411 	 * When we do receive a higher priority request ready to run from the
2412 	 * user, see queue_request(), the priority hint is bumped to that
2413 	 * request triggering preemption on the next dequeue (or subsequent
2414 	 * interrupt for secondary ports).
2415 	 */
2416 	execlists->queue_priority_hint = queue_prio(execlists);
2417 
2418 	if (submit) {
2419 		*port = execlists_schedule_in(last, port - execlists->pending);
2420 		execlists->switch_priority_hint =
2421 			switch_prio(engine, *execlists->pending);
2422 
2423 		/*
2424 		 * Skip if we ended up with exactly the same set of requests,
2425 		 * e.g. trying to timeslice a pair of ordered contexts
2426 		 */
2427 		if (!memcmp(active, execlists->pending,
2428 			    (port - execlists->pending + 1) * sizeof(*port))) {
2429 			do
2430 				execlists_schedule_out(fetch_and_zero(port));
2431 			while (port-- != execlists->pending);
2432 
2433 			goto skip_submit;
2434 		}
2435 		clear_ports(port + 1, last_port - port);
2436 
2437 		WRITE_ONCE(execlists->yield, -1);
2438 		set_preempt_timeout(engine, *active);
2439 		execlists_submit_ports(engine);
2440 	} else {
2441 		start_timeslice(engine, execlists->queue_priority_hint);
2442 skip_submit:
2443 		ring_set_paused(engine, 0);
2444 	}
2445 }
2446 
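/*
 * Release every request still held in the ELSP ports, both pending and
 * inflight, and point execlists->active back at the (now empty)
 * inflight array; used when unwinding engine state, e.g. for reset.
 */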
2447 static void
2448 cancel_port_requests(struct intel_engine_execlists * const execlists)
2449 {
2450 	struct i915_request * const *port;
2451 
2452 	for (port = execlists->pending; *port; port++)
2453 		execlists_schedule_out(*port);
2454 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2455 
2456 	/* Mark the end of active before we overwrite *active */
2457 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2458 		execlists_schedule_out(*port);
2459 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2460 
2461 	smp_wmb(); /* complete the seqlock for execlists_active() */
2462 	WRITE_ONCE(execlists->active, execlists->inflight);
2463 }
2464 
2465 static inline void
2466 invalidate_csb_entries(const u64 *first, const u64 *last)
2467 {
2468 	clflush((void *)first);
2469 	clflush((void *)last);
2470 }
2471 
2472 /*
2473  * Starting with Gen12, the status has a new format:
2474  *
2475  *     bit  0:     switched to new queue
2476  *     bit  1:     reserved
2477  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2478  *                 switch detail is set to "wait on semaphore"
2479  *     bits 3-5:   engine class
2480  *     bits 6-11:  engine instance
2481  *     bits 12-14: reserved
2482  *     bits 15-25: sw context id of the lrc the GT switched to
2483  *     bits 26-31: sw counter of the lrc the GT switched to
2484  *     bits 32-35: context switch detail
2485  *                  - 0: ctx complete
2486  *                  - 1: wait on sync flip
2487  *                  - 2: wait on vblank
2488  *                  - 3: wait on scanline
2489  *                  - 4: wait on semaphore
2490  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2491  *                       WAIT_FOR_EVENT)
2492  *     bit  36:    reserved
2493  *     bits 37-43: wait detail (for switch detail 1 to 4)
2494  *     bits 44-46: reserved
2495  *     bits 47-57: sw context id of the lrc the GT switched away from
2496  *     bits 58-63: sw counter of the lrc the GT switched away from
2497  */
2498 static inline bool gen12_csb_parse(const u64 *csb)
2499 {
2500 	bool ctx_away_valid;
2501 	bool new_queue;
2502 	u64 entry;
2503 
2504 	/* HSD#22011248461 */
2505 	entry = READ_ONCE(*csb);
2506 	if (unlikely(entry == -1)) {
2507 		preempt_disable();
2508 		if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
2509 			GEM_WARN_ON("50us CSB timeout");
2510 		preempt_enable();
2511 	}
2512 	WRITE_ONCE(*(u64 *)csb, -1);
2513 
2514 	ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
2515 	new_queue =
2516 		lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2517 
2518 	/*
2519 	 * The context switch detail is not guaranteed to be 5 when a preemption
2520 	 * occurs, so we can't just check for that. The check below works for
2521 	 * all the cases we care about, including preemptions of WAIT
2522 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2523 	 * would require some extra handling, but we don't support that.
2524 	 */
2525 	if (!ctx_away_valid || new_queue) {
2526 		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2527 		return true;
2528 	}
2529 
2530 	/*
2531 	 * switch detail = 5 is covered by the case above and we do not expect a
2532 	 * context switch on an unsuccessful wait instruction since we always
2533 	 * use polling mode.
2534 	 */
2535 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2536 	return false;
2537 }
2538 
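/*
 * Prior to Gen12, a CSB event marks a promotion to the new ELSP
 * submission when the engine reports an idle->active transition or a
 * preemption; otherwise it is a context completion event.
 */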
2539 static inline bool gen8_csb_parse(const u64 *csb)
2540 {
2541 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2542 }
2543 
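/*
 * Drain the context status buffer (CSB) written by the command
 * streamer: promote pending[] to active[] when the HW acks a new ELSP
 * submission, and schedule out ports as their contexts complete.
 */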
2544 static void process_csb(struct intel_engine_cs *engine)
2545 {
2546 	struct intel_engine_execlists * const execlists = &engine->execlists;
2547 	const u64 * const buf = execlists->csb_status;
2548 	const u8 num_entries = execlists->csb_size;
2549 	u8 head, tail;
2550 
2551 	/*
2552 	 * As we modify our execlists state tracking we require exclusive
2553 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2554 	 * and we assume that is only inside the reset paths and so serialised.
2555 	 */
2556 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2557 		   !reset_in_progress(execlists));
2558 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2559 
2560 	/*
2561 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2562 	 * When reading from the csb_write mmio register, we have to be
2563 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2564 	 * the low 4bits. As it happens we know the next 4bits are always
2565 	 * the low 4 bits. As it happens we know the next 4 bits are always
2566 	 * zero and so we can simply mask off the low u8 of the register
2567 	 * to use explicit shifting and masking, and probably bifurcating
2568 	 * the code to handle the legacy mmio read).
2569 	 */
2570 	head = execlists->csb_head;
2571 	tail = READ_ONCE(*execlists->csb_write);
2572 	if (unlikely(head == tail))
2573 		return;
2574 
2575 	/*
2576 	 * We will consume all events from HW, or at least pretend to.
2577 	 *
2578 	 * The sequence of events from the HW is deterministic, and derived
2579 	 * from our writes to the ELSP, with a smidgen of variability for
2580 	 * the arrival of the asynchronous requests wrt the inflight
2581 	 * execution. If the HW sends an event that does not correspond with
2582 	 * the one we are expecting, we have to abandon all hope as we lose
2583 	 * all tracking of what the engine is actually executing. We will
2584 	 * only detect we are out of sequence with the HW when we get an
2585 	 * 'impossible' event because we have already drained our own
2586 	 * preemption/promotion queue. If this occurs, we know that we likely
2587 	 * lost track of execution earlier and must unwind and restart, the
2588 	 * lost track of execution earlier and must unwind and restart; the
2589 	 * simplest way is to stop processing the event queue and force the
2590 	 */
2591 	execlists->csb_head = tail;
2592 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2593 
2594 	/*
2595 	 * Hopefully paired with a wmb() in HW!
2596 	 *
2597 	 * We must complete the read of the write pointer before any reads
2598 	 * from the CSB, so that we do not see stale values. Without an rmb
2599 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2600 	 * we perform the READ_ONCE(*csb_write).
2601 	 */
2602 	rmb();
2603 	do {
2604 		bool promote;
2605 
2606 		if (++head == num_entries)
2607 			head = 0;
2608 
2609 		/*
2610 		 * We are flying near dragons again.
2611 		 *
2612 		 * We hold a reference to the request in execlist_port[]
2613 		 * but no more than that. We are operating in softirq
2614 		 * context and so cannot hold any mutex or sleep. That
2615 		 * means we cannot prevent the requests we are processing
2616 		 * in port[] from being retired simultaneously (the
2617 		 * breadcrumb will be complete before we see the
2618 		 * context-switch). As we only hold the reference to the
2619 		 * request, any pointer chasing underneath the request
2620 		 * is subject to a potential use-after-free. Thus we
2621 		 * store all of the bookkeeping within port[] as
2622 		 * required, and avoid using unguarded pointers beneath
2623 		 * request itself. The same applies to the atomic
2624 		 * status notifier.
2625 		 */
2626 
2627 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2628 			     head,
2629 			     upper_32_bits(buf[head]),
2630 			     lower_32_bits(buf[head]));
2631 
2632 		if (INTEL_GEN(engine->i915) >= 12)
2633 			promote = gen12_csb_parse(buf + head);
2634 		else
2635 			promote = gen8_csb_parse(buf + head);
2636 		if (promote) {
2637 			struct i915_request * const *old = execlists->active;
2638 
2639 			if (GEM_WARN_ON(!*execlists->pending)) {
2640 				execlists->error_interrupt |= ERROR_CSB;
2641 				break;
2642 			}
2643 
2644 			ring_set_paused(engine, 0);
2645 
2646 			/* Point active to the new ELSP; prevent overwriting */
2647 			WRITE_ONCE(execlists->active, execlists->pending);
2648 			smp_wmb(); /* notify execlists_active() */
2649 
2650 			/* cancel old inflight, prepare for switch */
2651 			trace_ports(execlists, "preempted", old);
2652 			while (*old)
2653 				execlists_schedule_out(*old++);
2654 
2655 			/* switch pending to inflight */
2656 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2657 			copy_ports(execlists->inflight,
2658 				   execlists->pending,
2659 				   execlists_num_ports(execlists));
2660 			smp_wmb(); /* complete the seqlock */
2661 			WRITE_ONCE(execlists->active, execlists->inflight);
2662 
2663 			/* XXX Magic delay for tgl */
2664 			ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2665 
2666 			WRITE_ONCE(execlists->pending[0], NULL);
2667 		} else {
2668 			if (GEM_WARN_ON(!*execlists->active)) {
2669 				execlists->error_interrupt |= ERROR_CSB;
2670 				break;
2671 			}
2672 
2673 			/* port0 completed, advanced to port1 */
2674 			trace_ports(execlists, "completed", execlists->active);
2675 
2676 			/*
2677 			 * We rely on the hardware being strongly
2678 			 * ordered, that the breadcrumb write is
2679 			 * coherent (visible from the CPU) before the
2680 			 * user interrupt is processed. One might assume
2681 			 * that, since the breadcrumb write lands before the
2682 			 * user interrupt, and that before the CS event for
2683 			 * the context switch, it would therefore be visible
2684 			 * before the CS event itself...
2685 			 */
2686 			if (GEM_SHOW_DEBUG() &&
2687 			    !i915_request_completed(*execlists->active)) {
2688 				struct i915_request *rq = *execlists->active;
2689 				const u32 *regs __maybe_unused =
2690 					rq->context->lrc_reg_state;
2691 
2692 				ENGINE_TRACE(engine,
2693 					     "context completed before request!\n");
2694 				ENGINE_TRACE(engine,
2695 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2696 					     ENGINE_READ(engine, RING_START),
2697 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2698 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2699 					     ENGINE_READ(engine, RING_CTL),
2700 					     ENGINE_READ(engine, RING_MI_MODE));
2701 				ENGINE_TRACE(engine,
2702 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2703 					     i915_ggtt_offset(rq->ring->vma),
2704 					     rq->head, rq->tail,
2705 					     rq->fence.context,
2706 					     lower_32_bits(rq->fence.seqno),
2707 					     hwsp_seqno(rq));
2708 				ENGINE_TRACE(engine,
2709 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2710 					     regs[CTX_RING_START],
2711 					     regs[CTX_RING_HEAD],
2712 					     regs[CTX_RING_TAIL]);
2713 			}
2714 
2715 			execlists_schedule_out(*execlists->active++);
2716 
2717 			GEM_BUG_ON(execlists->active - execlists->inflight >
2718 				   execlists_num_ports(execlists));
2719 		}
2720 	} while (head != tail);
2721 
2722 	set_timeslice(engine);
2723 
2724 	/*
2725 	 * Gen11 has proven to fail wrt global observation point between
2726 	 * entry and tail update, failing on the ordering and thus
2727 	 * we see an old entry in the context status buffer.
2728 	 *
2729 	 * Forcibly evict the entries ahead of the next gpu csb update,
2730 	 * to increase the odds that we get fresh entries even with
2731 	 * non-working hardware. The cost of doing so comes out mostly in
2732 	 * the wash, as the hardware, working or not, will need to do the
2733 	 * invalidation beforehand.
2734 	 */
2735 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2736 }
2737 
2738 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2739 {
2740 	lockdep_assert_held(&engine->active.lock);
2741 	if (!READ_ONCE(engine->execlists.pending[0])) {
2742 		rcu_read_lock(); /* protect peeking at execlists->active */
2743 		execlists_dequeue(engine);
2744 		rcu_read_unlock();
2745 	}
2746 }
2747 
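/*
 * Suspend a request, and any ready same-engine waiters, on the engine's
 * hold list so that it is neither executed nor retired, e.g. while we
 * capture its error state.
 */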
2748 static void __execlists_hold(struct i915_request *rq)
2749 {
2750 	LIST_HEAD(list);
2751 
2752 	do {
2753 		struct i915_dependency *p;
2754 
2755 		if (i915_request_is_active(rq))
2756 			__i915_request_unsubmit(rq);
2757 
2758 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2759 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2760 		i915_request_set_hold(rq);
2761 		RQ_TRACE(rq, "on hold\n");
2762 
2763 		for_each_waiter(p, rq) {
2764 			struct i915_request *w =
2765 				container_of(p->waiter, typeof(*w), sched);
2766 
2767 			/* Leave semaphores spinning on the other engines */
2768 			if (w->engine != rq->engine)
2769 				continue;
2770 
2771 			if (!i915_request_is_ready(w))
2772 				continue;
2773 
2774 			if (i915_request_completed(w))
2775 				continue;
2776 
2777 			if (i915_request_on_hold(w))
2778 				continue;
2779 
2780 			list_move_tail(&w->sched.link, &list);
2781 		}
2782 
2783 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2784 	} while (rq);
2785 }
2786 
2787 static bool execlists_hold(struct intel_engine_cs *engine,
2788 			   struct i915_request *rq)
2789 {
2790 	spin_lock_irq(&engine->active.lock);
2791 
2792 	if (i915_request_completed(rq)) { /* too late! */
2793 		rq = NULL;
2794 		goto unlock;
2795 	}
2796 
2797 	if (rq->engine != engine) { /* preempted virtual engine */
2798 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2799 
2800 		/*
2801 		 * intel_context_inflight() is only protected by virtue
2802 		 * of process_csb() being called only by the tasklet (or
2803 		 * directly from inside reset while the tasklet is suspended).
2804 		 * Assert that neither of those are allowed to run while we
2805 		 * poke at the request queues.
2806 		 */
2807 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2808 
2809 		/*
2810 		 * An unsubmitted request along a virtual engine will
2811 		 * remain on the active (this) engine until we are able
2812 		 * to process the context switch away (and so mark the
2813 		 * context as no longer in flight). That cannot have happened
2814 		 * yet, otherwise we would not be hanging!
2815 		 */
2816 		spin_lock(&ve->base.active.lock);
2817 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2818 		GEM_BUG_ON(ve->request != rq);
2819 		ve->request = NULL;
2820 		spin_unlock(&ve->base.active.lock);
2821 		i915_request_put(rq);
2822 
2823 		rq->engine = engine;
2824 	}
2825 
2826 	/*
2827 	 * Transfer this request onto the hold queue to prevent it
2828 	 * being resubmitted to HW (and potentially completed) before we have
2829 	 * released it. Since we may have already submitted following
2830 	 * requests, we need to remove those as well.
2831 	 */
2832 	GEM_BUG_ON(i915_request_on_hold(rq));
2833 	GEM_BUG_ON(rq->engine != engine);
2834 	__execlists_hold(rq);
2835 	GEM_BUG_ON(list_empty(&engine->active.hold));
2836 
2837 unlock:
2838 	spin_unlock_irq(&engine->active.lock);
2839 	return rq;
2840 }
2841 
2842 static bool hold_request(const struct i915_request *rq)
2843 {
2844 	struct i915_dependency *p;
2845 	bool result = false;
2846 
2847 	/*
2848 	 * If one of our ancestors is on hold, we must also be on hold,
2849 	 * otherwise we will bypass it and execute before it.
2850 	 */
2851 	rcu_read_lock();
2852 	for_each_signaler(p, rq) {
2853 		const struct i915_request *s =
2854 			container_of(p->signaler, typeof(*s), sched);
2855 
2856 		if (s->engine != rq->engine)
2857 			continue;
2858 
2859 		result = i915_request_on_hold(s);
2860 		if (result)
2861 			break;
2862 	}
2863 	rcu_read_unlock();
2864 
2865 	return result;
2866 }
2867 
2868 static void __execlists_unhold(struct i915_request *rq)
2869 {
2870 	LIST_HEAD(list);
2871 
2872 	do {
2873 		struct i915_dependency *p;
2874 
2875 		RQ_TRACE(rq, "hold release\n");
2876 
2877 		GEM_BUG_ON(!i915_request_on_hold(rq));
2878 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2879 
2880 		i915_request_clear_hold(rq);
2881 		list_move_tail(&rq->sched.link,
2882 			       i915_sched_lookup_priolist(rq->engine,
2883 							  rq_prio(rq)));
2884 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2885 
2886 		/* Also release any children on this engine that are ready */
2887 		for_each_waiter(p, rq) {
2888 			struct i915_request *w =
2889 				container_of(p->waiter, typeof(*w), sched);
2890 
2891 			/* Propagate any change in error status */
2892 			if (rq->fence.error)
2893 				i915_request_set_error_once(w, rq->fence.error);
2894 
2895 			if (w->engine != rq->engine)
2896 				continue;
2897 
2898 			if (!i915_request_on_hold(w))
2899 				continue;
2900 
2901 			/* Check that no other parents are also on hold */
2902 			if (hold_request(w))
2903 				continue;
2904 
2905 			list_move_tail(&w->sched.link, &list);
2906 		}
2907 
2908 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2909 	} while (rq);
2910 }
2911 
2912 static void execlists_unhold(struct intel_engine_cs *engine,
2913 			     struct i915_request *rq)
2914 {
2915 	spin_lock_irq(&engine->active.lock);
2916 
2917 	/*
2918 	 * Move this request back to the priority queue, and all of its
2919 	 * children and grandchildren that were suspended along with it.
2920 	 */
2921 	__execlists_unhold(rq);
2922 
2923 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2924 		engine->execlists.queue_priority_hint = rq_prio(rq);
2925 		tasklet_hi_schedule(&engine->execlists.tasklet);
2926 	}
2927 
2928 	spin_unlock_irq(&engine->active.lock);
2929 }
2930 
2931 struct execlists_capture {
2932 	struct work_struct work;
2933 	struct i915_request *rq;
2934 	struct i915_gpu_coredump *error;
2935 };
2936 
2937 static void execlists_capture_work(struct work_struct *work)
2938 {
2939 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2940 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2941 	struct intel_engine_cs *engine = cap->rq->engine;
2942 	struct intel_gt_coredump *gt = cap->error->gt;
2943 	struct intel_engine_capture_vma *vma;
2944 
2945 	/* Compress all the objects attached to the request, slow! */
2946 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2947 	if (vma) {
2948 		struct i915_vma_compress *compress =
2949 			i915_vma_capture_prepare(gt);
2950 
2951 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2952 		i915_vma_capture_finish(gt, compress);
2953 	}
2954 
2955 	gt->simulated = gt->engine->simulated;
2956 	cap->error->simulated = gt->simulated;
2957 
2958 	/* Publish the error state, and announce it to the world */
2959 	i915_error_state_store(cap->error);
2960 	i915_gpu_coredump_put(cap->error);
2961 
2962 	/* Return this request and all that depend upon it for signaling */
2963 	execlists_unhold(engine, cap->rq);
2964 	i915_request_put(cap->rq);
2965 
2966 	kfree(cap);
2967 }
2968 
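/*
 * The capture state must be allocated atomically: we are called from
 * the submission tasklet while delaying the forced preemption reset.
 */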
2969 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2970 {
2971 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2972 	struct execlists_capture *cap;
2973 
2974 	cap = kmalloc(sizeof(*cap), gfp);
2975 	if (!cap)
2976 		return NULL;
2977 
2978 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2979 	if (!cap->error)
2980 		goto err_cap;
2981 
2982 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2983 	if (!cap->error->gt)
2984 		goto err_gpu;
2985 
2986 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2987 	if (!cap->error->gt->engine)
2988 		goto err_gt;
2989 
2990 	return cap;
2991 
2992 err_gt:
2993 	kfree(cap->error->gt);
2994 err_gpu:
2995 	kfree(cap->error);
2996 err_cap:
2997 	kfree(cap);
2998 	return NULL;
2999 }
3000 
3001 static struct i915_request *
3002 active_context(struct intel_engine_cs *engine, u32 ccid)
3003 {
3004 	const struct intel_engine_execlists * const el = &engine->execlists;
3005 	struct i915_request * const *port, *rq;
3006 
3007 	/*
3008 	 * Use the most recent result from process_csb(), but just in case
3009 	 * we trigger an error (via interrupt) before the first CS event has
3010 	 * been written, peek at the next submission.
3011 	 */
3012 
3013 	for (port = el->active; (rq = *port); port++) {
3014 		if (rq->context->lrc.ccid == ccid) {
3015 			ENGINE_TRACE(engine,
3016 				     "ccid found at active:%zd\n",
3017 				     port - el->active);
3018 			return rq;
3019 		}
3020 	}
3021 
3022 	for (port = el->pending; (rq = *port); port++) {
3023 		if (rq->context->lrc.ccid == ccid) {
3024 			ENGINE_TRACE(engine,
3025 				     "ccid found at pending:%zd\n",
3026 				     port - el->pending);
3027 			return rq;
3028 		}
3029 	}
3030 
3031 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3032 	return NULL;
3033 }
3034 
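/* Read the ccid of the running context from the EXECLIST_STATUS register. */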
3035 static u32 active_ccid(struct intel_engine_cs *engine)
3036 {
3037 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3038 }
3039 
3040 static void execlists_capture(struct intel_engine_cs *engine)
3041 {
3042 	struct execlists_capture *cap;
3043 
3044 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3045 		return;
3046 
3047 	/*
3048 	 * We need to _quickly_ capture the engine state before we reset.
3049 	 * We are inside an atomic section (softirq) here and we are delaying
3050 	 * the forced preemption event.
3051 	 */
3052 	cap = capture_regs(engine);
3053 	if (!cap)
3054 		return;
3055 
3056 	spin_lock_irq(&engine->active.lock);
3057 	cap->rq = active_context(engine, active_ccid(engine));
3058 	if (cap->rq) {
3059 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3060 		cap->rq = i915_request_get_rcu(cap->rq);
3061 	}
3062 	spin_unlock_irq(&engine->active.lock);
3063 	if (!cap->rq)
3064 		goto err_free;
3065 
3066 	/*
3067 	 * Remove the request from the execlists queue, and take ownership
3068 	 * of the request. We pass it to our worker who will _slowly_ compress
3069 	 * all the pages the _user_ requested for debugging their batch, after
3070 	 * which we return it to the queue for signaling.
3071 	 *
3072 	 * By removing them from the execlists queue, we also remove the
3073 	 * requests from being processed by __unwind_incomplete_requests()
3074 	 * during the intel_engine_reset(), and so they will *not* be replayed
3075 	 * afterwards.
3076 	 *
3077 	 * Note that because we have not yet reset the engine at this point,
3078 	 * it is possible that the request we have identified as being
3079 	 * guilty did in fact complete, and we will then hit an arbitration
3080 	 * point allowing the outstanding preemption to succeed. The likelihood
3081 	 * of that is very low (as capturing of the engine registers should be
3082 	 * fast enough to run inside an irq-off atomic section!), so we will
3083 	 * simply hold that request accountable for being non-preemptible
3084 	 * long enough to force the reset.
3085 	 */
3086 	if (!execlists_hold(engine, cap->rq))
3087 		goto err_rq;
3088 
3089 	INIT_WORK(&cap->work, execlists_capture_work);
3090 	schedule_work(&cap->work);
3091 	return;
3092 
3093 err_rq:
3094 	i915_request_put(cap->rq);
3095 err_free:
3096 	i915_gpu_coredump_put(cap->error);
3097 	kfree(cap);
3098 }
3099 
3100 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3101 {
3102 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3103 	unsigned long *lock = &engine->gt->reset.flags;
3104 
3105 	if (!intel_has_reset_engine(engine->gt))
3106 		return;
3107 
3108 	if (test_and_set_bit(bit, lock))
3109 		return;
3110 
3111 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3112 
3113 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3114 	tasklet_disable_nosync(&engine->execlists.tasklet);
3115 
3116 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3117 	execlists_capture(engine);
3118 	intel_engine_reset(engine, msg);
3119 
3120 	tasklet_enable(&engine->execlists.tasklet);
3121 	clear_and_wake_up_bit(bit, lock);
3122 }
3123 
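/*
 * A forced preemption has timed out: the preempt timer expired while an
 * ELSP[] submission is still pending acknowledgement from the HW.
 */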
3124 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3125 {
3126 	const struct timer_list *t = &engine->execlists.preempt;
3127 
3128 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3129 		return false;
3130 
3131 	if (!timer_expired(t))
3132 		return false;
3133 
3134 	return READ_ONCE(engine->execlists.pending[0]);
3135 }
3136 
3137 /*
3138  * Check the unread Context Status Buffers and manage the submission of new
3139  * contexts to the ELSP accordingly.
3140  */
3141 static void execlists_submission_tasklet(unsigned long data)
3142 {
3143 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3144 	bool timeout = preempt_timeout(engine);
3145 
3146 	process_csb(engine);
3147 
3148 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3149 		const char *msg;
3150 
3151 		/* Generate the error message in priority wrt the user! */
3152 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3153 			msg = "CS error"; /* thrown by a user payload */
3154 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3155 			msg = "invalid CSB event";
3156 		else
3157 			msg = "internal error";
3158 
3159 		engine->execlists.error_interrupt = 0;
3160 		execlists_reset(engine, msg);
3161 	}
3162 
3163 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3164 		unsigned long flags;
3165 
3166 		spin_lock_irqsave(&engine->active.lock, flags);
3167 		__execlists_submission_tasklet(engine);
3168 		spin_unlock_irqrestore(&engine->active.lock, flags);
3169 
3170 		/* Recheck after serialising with direct-submission */
3171 		if (unlikely(timeout && preempt_timeout(engine)))
3172 			execlists_reset(engine, "preemption time out");
3173 	}
3174 }
3175 
3176 static void __execlists_kick(struct intel_engine_execlists *execlists)
3177 {
3178 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3179 	tasklet_hi_schedule(&execlists->tasklet);
3180 }
3181 
3182 #define execlists_kick(t, member) \
3183 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3184 
3185 static void execlists_timeslice(struct timer_list *timer)
3186 {
3187 	execlists_kick(timer, timer);
3188 }
3189 
3190 static void execlists_preempt(struct timer_list *timer)
3191 {
3192 	execlists_kick(timer, preempt);
3193 }
3194 
3195 static void queue_request(struct intel_engine_cs *engine,
3196 			  struct i915_request *rq)
3197 {
3198 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3199 	list_add_tail(&rq->sched.link,
3200 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3201 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3202 }
3203 
3204 static void __submit_queue_imm(struct intel_engine_cs *engine)
3205 {
3206 	struct intel_engine_execlists * const execlists = &engine->execlists;
3207 
3208 	if (reset_in_progress(execlists))
3209 		return; /* defer until we restart the engine following reset */
3210 
3211 	__execlists_submission_tasklet(engine);
3212 }
3213 
3214 static void submit_queue(struct intel_engine_cs *engine,
3215 			 const struct i915_request *rq)
3216 {
3217 	struct intel_engine_execlists *execlists = &engine->execlists;
3218 
3219 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3220 		return;
3221 
3222 	execlists->queue_priority_hint = rq_prio(rq);
3223 	__submit_queue_imm(engine);
3224 }
3225 
3226 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3227 			     const struct i915_request *rq)
3228 {
3229 	GEM_BUG_ON(i915_request_on_hold(rq));
3230 	return !list_empty(&engine->active.hold) && hold_request(rq);
3231 }
3232 
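/*
 * Opportunistically process any outstanding CSB events before queuing,
 * in the hope of clearing a stale pending[] that would otherwise block
 * immediate submission.
 */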
3233 static void flush_csb(struct intel_engine_cs *engine)
3234 {
3235 	struct intel_engine_execlists *el = &engine->execlists;
3236 
3237 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3238 		if (!reset_in_progress(el))
3239 			process_csb(engine);
3240 		tasklet_unlock(&el->tasklet);
3241 	}
3242 }
3243 
3244 static void execlists_submit_request(struct i915_request *request)
3245 {
3246 	struct intel_engine_cs *engine = request->engine;
3247 	unsigned long flags;
3248 
3249 	/* Hopefully we clear execlists->pending[] to let us through */
3250 	flush_csb(engine);
3251 
3252 	/* Will be called from irq-context when using foreign fences. */
3253 	spin_lock_irqsave(&engine->active.lock, flags);
3254 
3255 	if (unlikely(ancestor_on_hold(engine, request))) {
3256 		RQ_TRACE(request, "ancestor on hold\n");
3257 		list_add_tail(&request->sched.link, &engine->active.hold);
3258 		i915_request_set_hold(request);
3259 	} else {
3260 		queue_request(engine, request);
3261 
3262 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3263 		GEM_BUG_ON(list_empty(&request->sched.link));
3264 
3265 		submit_queue(engine, request);
3266 	}
3267 
3268 	spin_unlock_irqrestore(&engine->active.lock, flags);
3269 }
3270 
3271 static void __execlists_context_fini(struct intel_context *ce)
3272 {
3273 	intel_ring_put(ce->ring);
3274 	i915_vma_put(ce->state);
3275 }
3276 
3277 static void execlists_context_destroy(struct kref *kref)
3278 {
3279 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3280 
3281 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3282 	GEM_BUG_ON(intel_context_is_pinned(ce));
3283 
3284 	if (ce->state)
3285 		__execlists_context_fini(ce);
3286 
3287 	intel_context_fini(ce);
3288 	intel_context_free(ce);
3289 }
3290 
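/*
 * Under CONFIG_DRM_I915_DEBUG_GEM, poison the page after the context
 * image so that check_redzone() can detect anything writing beyond the
 * expected context size.
 */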
3291 static void
3292 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3293 {
3294 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3295 		return;
3296 
3297 	vaddr += engine->context_size;
3298 
3299 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3300 }
3301 
3302 static void
3303 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3304 {
3305 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3306 		return;
3307 
3308 	vaddr += engine->context_size;
3309 
3310 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3311 		drm_err_once(&engine->i915->drm,
3312 			     "%s context redzone overwritten!\n",
3313 			     engine->name);
3314 }
3315 
3316 static void execlists_context_unpin(struct intel_context *ce)
3317 {
3318 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3319 		      ce->engine);
3320 }
3321 
3322 static void execlists_context_post_unpin(struct intel_context *ce)
3323 {
3324 	i915_gem_object_unpin_map(ce->state->obj);
3325 }
3326 
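/*
 * Gen12 per-context indirect (wa_bb) batch helpers: reload
 * CTX_TIMESTAMP and CMD_BUF_CCTL from the saved context image by
 * bouncing each value through CS GPR0, then restore GPR0 itself from
 * its saved slot.
 */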
3327 static u32 *
3328 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3329 {
3330 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3331 		MI_SRM_LRM_GLOBAL_GTT |
3332 		MI_LRI_LRM_CS_MMIO;
3333 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3334 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3335 		CTX_TIMESTAMP * sizeof(u32);
3336 	*cs++ = 0;
3337 
3338 	*cs++ = MI_LOAD_REGISTER_REG |
3339 		MI_LRR_SOURCE_CS_MMIO |
3340 		MI_LRI_LRM_CS_MMIO;
3341 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3342 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3343 
3344 	*cs++ = MI_LOAD_REGISTER_REG |
3345 		MI_LRR_SOURCE_CS_MMIO |
3346 		MI_LRI_LRM_CS_MMIO;
3347 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3348 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3349 
3350 	return cs;
3351 }
3352 
3353 static u32 *
3354 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3355 {
3356 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3357 
3358 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3359 		MI_SRM_LRM_GLOBAL_GTT |
3360 		MI_LRI_LRM_CS_MMIO;
3361 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3362 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3363 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3364 	*cs++ = 0;
3365 
3366 	return cs;
3367 }
3368 
3369 static u32 *
3370 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3371 {
3372 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3373 
3374 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3375 		MI_SRM_LRM_GLOBAL_GTT |
3376 		MI_LRI_LRM_CS_MMIO;
3377 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3378 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3379 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3380 	*cs++ = 0;
3381 
3382 	*cs++ = MI_LOAD_REGISTER_REG |
3383 		MI_LRR_SOURCE_CS_MMIO |
3384 		MI_LRI_LRM_CS_MMIO;
3385 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3386 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3387 
3388 	return cs;
3389 }
3390 
3391 static u32 *
3392 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3393 {
3394 	cs = gen12_emit_timestamp_wa(ce, cs);
3395 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3396 	cs = gen12_emit_restore_scratch(ce, cs);
3397 
3398 	return cs;
3399 }
3400 
3401 static u32 *
3402 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3403 {
3404 	cs = gen12_emit_timestamp_wa(ce, cs);
3405 	cs = gen12_emit_restore_scratch(ce, cs);
3406 
3407 	return cs;
3408 }
3409 
3410 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3411 {
3412 	return PAGE_SIZE * ce->wa_bb_page;
3413 }
3414 
3415 static u32 *context_indirect_bb(const struct intel_context *ce)
3416 {
3417 	void *ptr;
3418 
3419 	GEM_BUG_ON(!ce->wa_bb_page);
3420 
3421 	ptr = ce->lrc_reg_state;
3422 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3423 	ptr += context_wa_bb_offset(ce);
3424 
3425 	return ptr;
3426 }
3427 
3428 static void
3429 setup_indirect_ctx_bb(const struct intel_context *ce,
3430 		      const struct intel_engine_cs *engine,
3431 		      u32 *(*emit)(const struct intel_context *, u32 *))
3432 {
3433 	u32 * const start = context_indirect_bb(ce);
3434 	u32 *cs;
3435 
3436 	cs = emit(ce, start);
3437 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3438 	while ((unsigned long)cs % CACHELINE_BYTES)
3439 		*cs++ = MI_NOOP;
3440 
3441 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3442 				    i915_ggtt_offset(ce->state) +
3443 				    context_wa_bb_offset(ce),
3444 				    (cs - start) * sizeof(*cs));
3445 }
3446 
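/*
 * Rewrite the ring registers (START/HEAD/TAIL/CTL) in the context
 * image, refresh RPCS for the render class and, where a wa_bb page is
 * allocated, re-emit the per-context indirect workaround batch.
 */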
3447 static void
3448 __execlists_update_reg_state(const struct intel_context *ce,
3449 			     const struct intel_engine_cs *engine,
3450 			     u32 head)
3451 {
3452 	struct intel_ring *ring = ce->ring;
3453 	u32 *regs = ce->lrc_reg_state;
3454 
3455 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3456 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3457 
3458 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3459 	regs[CTX_RING_HEAD] = head;
3460 	regs[CTX_RING_TAIL] = ring->tail;
3461 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3462 
3463 	/* RPCS */
3464 	if (engine->class == RENDER_CLASS) {
3465 		regs[CTX_R_PWR_CLK_STATE] =
3466 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3467 
3468 		i915_oa_init_reg_state(ce, engine);
3469 	}
3470 
3471 	if (ce->wa_bb_page) {
3472 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3473 
3474 		fn = gen12_emit_indirect_ctx_xcs;
3475 		if (ce->engine->class == RENDER_CLASS)
3476 			fn = gen12_emit_indirect_ctx_rcs;
3477 
3478 		/* Mutually exclusive wrt the global indirect bb */
3479 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3480 		setup_indirect_ctx_bb(ce, engine, fn);
3481 	}
3482 }
3483 
3484 static int
3485 execlists_context_pre_pin(struct intel_context *ce,
3486 			  struct i915_gem_ww_ctx *ww, void **vaddr)
3487 {
3488 	GEM_BUG_ON(!ce->state);
3489 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3490 
3491 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
3492 					i915_coherent_map_type(ce->engine->i915) |
3493 					I915_MAP_OVERRIDE);
3494 
3495 	return PTR_ERR_OR_ZERO(*vaddr);
3496 }
3497 
3498 static int
3499 __execlists_context_pin(struct intel_context *ce,
3500 			struct intel_engine_cs *engine,
3501 			void *vaddr)
3502 {
3503 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3504 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3505 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3506 
3507 	return 0;
3508 }
3509 
3510 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3511 {
3512 	return __execlists_context_pin(ce, ce->engine, vaddr);
3513 }
3514 
3515 static int execlists_context_alloc(struct intel_context *ce)
3516 {
3517 	return __execlists_context_alloc(ce, ce->engine);
3518 }
3519 
3520 static void execlists_context_reset(struct intel_context *ce)
3521 {
3522 	CE_TRACE(ce, "reset\n");
3523 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3524 
3525 	intel_ring_reset(ce->ring, ce->ring->emit);
3526 
3527 	/* Scrub away the garbage */
3528 	execlists_init_reg_state(ce->lrc_reg_state,
3529 				 ce, ce->engine, ce->ring, true);
3530 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3531 
3532 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3533 }
3534 
3535 static const struct intel_context_ops execlists_context_ops = {
3536 	.alloc = execlists_context_alloc,
3537 
3538 	.pre_pin = execlists_context_pre_pin,
3539 	.pin = execlists_context_pin,
3540 	.unpin = execlists_context_unpin,
3541 	.post_unpin = execlists_context_post_unpin,
3542 
3543 	.enter = intel_context_enter_engine,
3544 	.exit = intel_context_exit_engine,
3545 
3546 	.reset = execlists_context_reset,
3547 	.destroy = execlists_context_destroy,
3548 };
3549 
3550 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3551 {
3552 	u32 *cs;
3553 
3554 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3555 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3556 		return 0;
3557 
3558 	cs = intel_ring_begin(rq, 6);
3559 	if (IS_ERR(cs))
3560 		return PTR_ERR(cs);
3561 
3562 	/*
3563 	 * Check if we have been preempted before we even get started.
3564 	 *
3565 	 * After this point i915_request_started() reports true, even if
3566 	 * we get preempted and so are no longer running.
3567 	 */
3568 	*cs++ = MI_ARB_CHECK;
3569 	*cs++ = MI_NOOP;
3570 
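	/*
	 * Record the request as started by writing seqno-1 into the
	 * timeline HWSP; i915_request_started() compares against this.
	 */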
3571 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3572 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3573 	*cs++ = 0;
3574 	*cs++ = rq->fence.seqno - 1;
3575 
3576 	intel_ring_advance(rq, cs);
3577 
3578 	/* Record the updated position of the request's payload */
3579 	rq->infix = intel_ring_offset(rq, cs);
3580 
3581 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3582 
3583 	return 0;
3584 }
3585 
3586 static int emit_pdps(struct i915_request *rq)
3587 {
3588 	const struct intel_engine_cs * const engine = rq->engine;
3589 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3590 	int err, i;
3591 	u32 *cs;
3592 
3593 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3594 
3595 	/*
3596 	 * Beware ye of the dragons, this sequence is magic!
3597 	 *
3598 	 * Small changes to this sequence can cause anything from
3599 	 * GPU hangs to forcewake errors and machine lockups!
3600 	 */
3601 
3602 	/* Flush any residual operations from the context load */
3603 	err = engine->emit_flush(rq, EMIT_FLUSH);
3604 	if (err)
3605 		return err;
3606 
3607 	/* Magic required to prevent forcewake errors! */
3608 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3609 	if (err)
3610 		return err;
3611 
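	/* One LRI header, 4 dwords per PDP (UDW/LDW reg+value), trailing NOOP */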
3612 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3613 	if (IS_ERR(cs))
3614 		return PTR_ERR(cs);
3615 
3616 	/* Ensure the LRI have landed before we invalidate & continue */
3617 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3618 	for (i = GEN8_3LVL_PDPES; i--; ) {
3619 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3620 		u32 base = engine->mmio_base;
3621 
3622 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3623 		*cs++ = upper_32_bits(pd_daddr);
3624 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3625 		*cs++ = lower_32_bits(pd_daddr);
3626 	}
3627 	*cs++ = MI_NOOP;
3628 
3629 	intel_ring_advance(rq, cs);
3630 
3631 	return 0;
3632 }
3633 
3634 static int execlists_request_alloc(struct i915_request *request)
3635 {
3636 	int ret;
3637 
3638 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3639 
3640 	/*
3641 	 * Flush enough space to reduce the likelihood of waiting after
3642 	 * we start building the request - in which case we will just
3643 	 * have to repeat work.
3644 	 */
3645 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3646 
3647 	/*
3648 	 * Note that after this point, we have committed to using
3649 	 * this request as it is being used to both track the
3650 	 * state of engine initialisation and liveness of the
3651 	 * golden renderstate above. Think twice before you try
3652 	 * to cancel/unwind this request now.
3653 	 */
3654 
3655 	if (!i915_vm_is_4lvl(request->context->vm)) {
3656 		ret = emit_pdps(request);
3657 		if (ret)
3658 			return ret;
3659 	}
3660 
3661 	/* Unconditionally invalidate GPU caches and TLBs. */
3662 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3663 	if (ret)
3664 		return ret;
3665 
3666 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3667 	return 0;
3668 }
3669 
3670 /*
3671  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3672  * PIPE_CONTROL instruction. This is required for the flush to happen
3673  * correctly, but there is a slight complication: the WA batch is only
3674  * initialized once, so we cannot read the register value at the beginning
3675  * and reuse it later. Instead we save its value to memory, upload a
3676  * constant value with bit 21 set, and then restore it from the saved value.
3677  * To simplify the WA, the constant is formed from the default value of the
3678  * register. This shouldn't be a problem because we only modify it for a
3679  * short period and the batch is non-preemptible. We could of course use
3680  * additional instructions that read the actual value of the register at
3681  * that time and set our bit of interest, but that makes the WA complicated.
3682  *
3683  * This WA is also required for Gen9 so extracting as a function avoids
3684  * code duplication.
3685  */
3686 static u32 *
3687 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3688 {
3689 	/* NB no one else is allowed to scribble over scratch + 256! */
3690 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3691 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3692 	*batch++ = intel_gt_scratch_offset(engine->gt,
3693 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3694 	*batch++ = 0;
3695 
3696 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3697 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3698 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3699 
3700 	batch = gen8_emit_pipe_control(batch,
3701 				       PIPE_CONTROL_CS_STALL |
3702 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3703 				       0);
3704 
3705 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3706 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3707 	*batch++ = intel_gt_scratch_offset(engine->gt,
3708 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3709 	*batch++ = 0;
3710 
3711 	return batch;
3712 }
3713 
3714 /*
3715  * Typically we have only one indirect_ctx and per_ctx batch buffer, which are
3716  * initialized at the beginning and shared across all contexts, but this field
3717  * helps us have multiple batches at different offsets and select them based
3718  * on some criterion. At the moment this batch always starts at the beginning
3719  * of the page and we don't have multiple wa_ctx batch buffers.
3720  *
3721  * The number of WAs applied is not known at the beginning; we use this field
3722  * to return the number of DWORDs written.
3723  *
3724  * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds
3725  * NOOPs as padding to make it cacheline aligned.
3726  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and together the
3727  * two make a complete batch buffer.
3728  */
3729 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3730 {
3731 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3732 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3733 
3734 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3735 	if (IS_BROADWELL(engine->i915))
3736 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3737 
3738 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3739 	/* Actual scratch location is at 128 bytes offset */
3740 	batch = gen8_emit_pipe_control(batch,
3741 				       PIPE_CONTROL_FLUSH_L3 |
3742 				       PIPE_CONTROL_STORE_DATA_INDEX |
3743 				       PIPE_CONTROL_CS_STALL |
3744 				       PIPE_CONTROL_QW_WRITE,
3745 				       LRC_PPHWSP_SCRATCH_ADDR);
3746 
3747 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3748 
3749 	/* Pad to end of cacheline */
3750 	while ((unsigned long)batch % CACHELINE_BYTES)
3751 		*batch++ = MI_NOOP;
3752 
3753 	/*
3754 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3755 	 * execution depends on the length specified in terms of cache lines
3756 	 * in the register CTX_RCS_INDIRECT_CTX
3757 	 */
3758 
3759 	return batch;
3760 }
3761 
3762 struct lri {
3763 	i915_reg_t reg;
3764 	u32 value;
3765 };
3766 
3767 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3768 {
3769 	GEM_BUG_ON(!count || count > 63);
3770 
3771 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3772 	do {
3773 		*batch++ = i915_mmio_reg_offset(lri->reg);
3774 		*batch++ = lri->value;
3775 	} while (lri++, --count);
3776 	*batch++ = MI_NOOP;
3777 
3778 	return batch;
3779 }
3780 
3781 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3782 {
3783 	static const struct lri lri[] = {
3784 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3785 		{
3786 			COMMON_SLICE_CHICKEN2,
3787 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3788 				       0),
3789 		},
3790 
3791 		/* BSpec: 11391 */
3792 		{
3793 			FF_SLICE_CHICKEN,
3794 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3795 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3796 		},
3797 
3798 		/* BSpec: 11299 */
3799 		{
3800 			_3D_CHICKEN3,
3801 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3802 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3803 		}
3804 	};
3805 
3806 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3807 
3808 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3809 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3810 
3811 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3812 	batch = gen8_emit_pipe_control(batch,
3813 				       PIPE_CONTROL_FLUSH_L3 |
3814 				       PIPE_CONTROL_STORE_DATA_INDEX |
3815 				       PIPE_CONTROL_CS_STALL |
3816 				       PIPE_CONTROL_QW_WRITE,
3817 				       LRC_PPHWSP_SCRATCH_ADDR);
3818 
3819 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3820 
3821 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3822 	if (HAS_POOLED_EU(engine->i915)) {
3823 		/*
3824 		 * EU pool configuration is set up along with the golden
3825 		 * context during context initialization. The value depends
3826 		 * on the device type (2x6 or 3x6) and needs to be updated
3827 		 * based on which subslice is disabled, especially for 2x6
3828 		 * devices. However, it is safe to load the default 3x6
3829 		 * configuration instead of masking off the corresponding
3830 		 * bits, because the HW ignores the bits of a disabled
3831 		 * subslice and drops down to the appropriate config. See
3832 		 * render_state_setup() in i915_gem_render_state.c for the
3833 		 * possible configurations; to avoid duplication they are
3834 		 * not shown here again.
3835 		 */
3836 		*batch++ = GEN9_MEDIA_POOL_STATE;
3837 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3838 		*batch++ = 0x00777000;
3839 		*batch++ = 0;
3840 		*batch++ = 0;
3841 		*batch++ = 0;
3842 	}
3843 
3844 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3845 
3846 	/* Pad to end of cacheline */
3847 	while ((unsigned long)batch % CACHELINE_BYTES)
3848 		*batch++ = MI_NOOP;
3849 
3850 	return batch;
3851 }
3852 
3853 static u32 *
3854 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3855 {
3856 	int i;
3857 
3858 	/*
3859 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3860 	 *
3861 	 * Ensure the engine is idle prior to programming a
3862 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3863 	 */
3864 	batch = gen8_emit_pipe_control(batch,
3865 				       PIPE_CONTROL_CS_STALL,
3866 				       0);
3867 	/*
3868 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3869 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3870 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3871 	 * confusing. Since gen8_emit_pipe_control() already advances the
3872 	 * batch by 6 dwords, we advance the other 10 here, completing a
3873 	 * cacheline. It's not clear if the workaround requires this padding
3874 	 * before other commands, or if it's just the regular padding we would
3875 	 * already have for the workaround bb, so leave it here for now.
3876 	 */
3877 	for (i = 0; i < 10; i++)
3878 		*batch++ = MI_NOOP;
3879 
3880 	/* Pad to end of cacheline */
3881 	while ((unsigned long)batch % CACHELINE_BYTES)
3882 		*batch++ = MI_NOOP;
3883 
3884 	return batch;
3885 }
3886 
3887 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3888 
3889 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3890 {
3891 	struct drm_i915_gem_object *obj;
3892 	struct i915_vma *vma;
3893 	int err;
3894 
3895 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3896 	if (IS_ERR(obj))
3897 		return PTR_ERR(obj);
3898 
3899 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3900 	if (IS_ERR(vma)) {
3901 		err = PTR_ERR(vma);
3902 		goto err;
3903 	}
3904 
3905 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3906 	if (err)
3907 		goto err;
3908 
3909 	engine->wa_ctx.vma = vma;
3910 	return 0;
3911 
3912 err:
3913 	i915_gem_object_put(obj);
3914 	return err;
3915 }
3916 
3917 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3918 {
3919 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3920 }
3921 
3922 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3923 
3924 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3925 {
3926 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3927 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3928 					    &wa_ctx->per_ctx };
3929 	wa_bb_func_t wa_bb_fn[2];
3930 	void *batch, *batch_ptr;
3931 	unsigned int i;
3932 	int ret;
3933 
3934 	if (engine->class != RENDER_CLASS)
3935 		return 0;
3936 
3937 	switch (INTEL_GEN(engine->i915)) {
3938 	case 12:
3939 	case 11:
3940 		return 0;
3941 	case 10:
3942 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3943 		wa_bb_fn[1] = NULL;
3944 		break;
3945 	case 9:
3946 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3947 		wa_bb_fn[1] = NULL;
3948 		break;
3949 	case 8:
3950 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3951 		wa_bb_fn[1] = NULL;
3952 		break;
3953 	default:
3954 		MISSING_CASE(INTEL_GEN(engine->i915));
3955 		return 0;
3956 	}
3957 
3958 	ret = lrc_setup_wa_ctx(engine);
3959 	if (ret) {
3960 		drm_dbg(&engine->i915->drm,
3961 			"Failed to setup context WA page: %d\n", ret);
3962 		return ret;
3963 	}
3964 
3965 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3966 
3967 	/*
3968 	 * Emit the two workaround batch buffers, recording the offset from the
3969 	 * start of the workaround batch buffer object for each and their
3970 	 * respective sizes.
3971 	 */
3972 	batch_ptr = batch;
3973 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3974 		wa_bb[i]->offset = batch_ptr - batch;
3975 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3976 						  CACHELINE_BYTES))) {
3977 			ret = -EINVAL;
3978 			break;
3979 		}
3980 		if (wa_bb_fn[i])
3981 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3982 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3983 	}
3984 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3985 
3986 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
3987 	__i915_gem_object_release_map(wa_ctx->vma->obj);
3988 	if (ret)
3989 		lrc_destroy_wa_ctx(engine);
3990 
3991 	return ret;
3992 }
3993 
3994 static void reset_csb_pointers(struct intel_engine_cs *engine)
3995 {
3996 	struct intel_engine_execlists * const execlists = &engine->execlists;
3997 	const unsigned int reset_value = execlists->csb_size - 1;
3998 
3999 	ring_set_paused(engine, 0);
4000 
4001 	/*
4002 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4003 	 * Bludgeon them with a mmio update to be sure.
4004 	 */
4005 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4006 		     0xffff << 16 | reset_value << 8 | reset_value);
4007 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4008 
4009 	/*
4010 	 * After a reset, the HW starts writing into CSB entry [0]. We
4011 	 * therefore have to set our HEAD pointer back one entry so that
4012 	 * the *first* entry we check is entry 0. To complicate this further,
4013 	 * as we don't wait for the first interrupt after reset, we have to
4014 	 * fake the HW write to point back to the last entry so that our
4015 	 * inline comparison of our cached head position against the last HW
4016 	 * write works even before the first interrupt.
4017 	 */
4018 	execlists->csb_head = reset_value;
4019 	WRITE_ONCE(*execlists->csb_write, reset_value);
4020 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4021 
4022 	/* Check that the GPU does indeed update the CSB entries! */
4023 	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4024 	invalidate_csb_entries(&execlists->csb_status[0],
4025 			       &execlists->csb_status[reset_value]);
4026 
4027 	/* Once more for luck and our trusty paranoia */
4028 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4029 		     0xffff << 16 | reset_value << 8 | reset_value);
4030 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4031 
4032 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4033 }
4034 
4035 static void execlists_sanitize(struct intel_engine_cs *engine)
4036 {
4037 	/*
4038 	 * Poison residual state on resume, in case the suspend didn't!
4039 	 *
4040 	 * We have to assume that across suspend/resume (or other loss
4041 	 * of control) the contents of our pinned buffers have been
4042 	 * lost, replaced by garbage. Since this doesn't always happen,
4043 	 * let's poison such state so that we more quickly spot when
4044 	 * we falsely assume it has been preserved.
4045 	 */
4046 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4047 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4048 
4049 	reset_csb_pointers(engine);
4050 
4051 	/*
4052 	 * The kernel_context HWSP is stored in the status_page. As above,
4053 	 * that may be lost on resume/initialisation, and so we need to
4054 	 * reset the value in the HWSP.
4055 	 */
4056 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4057 
4058 	/* And scrub the dirty cachelines for the HWSP */
4059 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4060 }
4061 
4062 static void enable_error_interrupt(struct intel_engine_cs *engine)
4063 {
4064 	u32 status;
4065 
4066 	engine->execlists.error_interrupt = 0;
4067 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4068 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4069 
4070 	status = ENGINE_READ(engine, RING_ESR);
4071 	if (unlikely(status)) {
4072 		drm_err(&engine->i915->drm,
4073 			"engine '%s' resumed still in error: %08x\n",
4074 			engine->name, status);
4075 		__intel_gt_reset(engine->gt, engine->mask);
4076 	}
4077 
4078 	/*
4079 	 * On current gen8+, we have 2 signals to play with
4080 	 *
4081 	 * - I915_ERROR_INSTRUCTION (bit 0)
4082 	 *
4083 	 *    Generate an error if the command parser encounters an invalid
4084 	 *    instruction
4085 	 *
4086 	 *    This is a fatal error.
4087 	 *
4088 	 * - CP_PRIV (bit 2)
4089 	 *
4090 	 *    Generate an error on privilege violation (where the CP replaces
4091 	 *    the instruction with a no-op). This also fires for writes into
4092 	 *    read-only scratch pages.
4093 	 *
4094 	 *    This is a non-fatal error, parsing continues.
4095 	 *
4096 	 * * there are a few others defined for odd HW that we do not use
4097 	 *
4098 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4099 	 * error (as the HW is validating and suppressing the mistakes), we
4100 	 * only unmask the instruction error bit.
4101 	 */
4102 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4103 }
4104 
4105 static void enable_execlists(struct intel_engine_cs *engine)
4106 {
4107 	u32 mode;
4108 
4109 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4110 
4111 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4112 
4113 	if (INTEL_GEN(engine->i915) >= 11)
4114 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4115 	else
4116 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4117 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4118 
4119 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4120 
4121 	ENGINE_WRITE_FW(engine,
4122 			RING_HWS_PGA,
4123 			i915_ggtt_offset(engine->status_page.vma));
4124 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4125 
4126 	enable_error_interrupt(engine);
4127 
4128 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4129 }
4130 
4131 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4132 {
4133 	bool unexpected = false;
4134 
4135 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4136 		drm_dbg(&engine->i915->drm,
4137 			"STOP_RING still set in RING_MI_MODE\n");
4138 		unexpected = true;
4139 	}
4140 
4141 	return unexpected;
4142 }
4143 
4144 static int execlists_resume(struct intel_engine_cs *engine)
4145 {
4146 	intel_mocs_init_engine(engine);
4147 
4148 	intel_breadcrumbs_reset(engine->breadcrumbs);
4149 
4150 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4151 		struct drm_printer p = drm_debug_printer(__func__);
4152 
4153 		intel_engine_dump(engine, &p, NULL);
4154 	}
4155 
4156 	enable_execlists(engine);
4157 
4158 	return 0;
4159 }
4160 
4161 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4162 {
4163 	struct intel_engine_execlists * const execlists = &engine->execlists;
4164 	unsigned long flags;
4165 
4166 	ENGINE_TRACE(engine, "depth<-%d\n",
4167 		     atomic_read(&execlists->tasklet.count));
4168 
4169 	/*
4170 	 * Prevent request submission to the hardware until we have
4171 	 * completed the reset in i915_gem_reset_finish(). If a request
4172 	 * is completed by one engine, it may then queue a request
4173 	 * to a second via its execlists->tasklet *just* as we are
4174 	 * calling engine->resume() and also writing the ELSP.
4175 	 * Turning off the execlists->tasklet until the reset is over
4176 	 * prevents the race.
4177 	 */
4178 	__tasklet_disable_sync_once(&execlists->tasklet);
4179 	GEM_BUG_ON(!reset_in_progress(execlists));
4180 
4181 	/* And flush any current direct submission. */
4182 	spin_lock_irqsave(&engine->active.lock, flags);
4183 	spin_unlock_irqrestore(&engine->active.lock, flags);
4184 
4185 	/*
4186 	 * We stop the engines, otherwise we might get a failed reset and a
4187 	 * dead GPU (on elk). Even a GPU as modern as kbl can suffer a
4188 	 * system hang if a batchbuffer is progressing when the reset is
4189 	 * issued, regardless of the READY_TO_RESET ack. Thus we assume it
4190 	 * is best to stop the engines on all gens where we have a GPU
4191 	 * reset.
4192 	 *
4193 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4194 	 *
4195 	 * FIXME: Wa for more modern gens needs to be validated
4196 	 */
4197 	ring_set_paused(engine, 1);
4198 	intel_engine_stop_cs(engine);
4199 
4200 	engine->execlists.reset_ccid = active_ccid(engine);
4201 }
4202 
4203 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4204 {
4205 	int x;
4206 
4207 	x = lrc_ring_mi_mode(engine);
4208 	if (x != -1) {
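		/*
		 * RING_MI_MODE is a masked register: set the mask bit in the
		 * upper half so that the cleared STOP_RING bit is applied on
		 * the next context restore.
		 */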
4209 		regs[x + 1] &= ~STOP_RING;
4210 		regs[x + 1] |= STOP_RING << 16;
4211 	}
4212 }
4213 
4214 static void __execlists_reset_reg_state(const struct intel_context *ce,
4215 					const struct intel_engine_cs *engine)
4216 {
4217 	u32 *regs = ce->lrc_reg_state;
4218 
4219 	__reset_stop_ring(regs, engine);
4220 }
4221 
4222 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4223 {
4224 	struct intel_engine_execlists * const execlists = &engine->execlists;
4225 	struct intel_context *ce;
4226 	struct i915_request *rq;
4227 	u32 head;
4228 
4229 	mb(); /* paranoia: read the CSB pointers from after the reset */
4230 	clflush(execlists->csb_write);
4231 	mb();
4232 
4233 	process_csb(engine); /* drain preemption events */
4234 
4235 	/* Following the reset, we need to reload the CSB read/write pointers */
4236 	reset_csb_pointers(engine);
4237 
4238 	/*
4239 	 * Save the currently executing context, even if we completed
4240 	 * its request, it was still running at the time of the
4241 	 * reset and will have been clobbered.
4242 	 */
4243 	rq = active_context(engine, engine->execlists.reset_ccid);
4244 	if (!rq)
4245 		goto unwind;
4246 
4247 	ce = rq->context;
4248 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4249 
4250 	if (i915_request_completed(rq)) {
4251 		/* Idle context; tidy up the ring so we can restart afresh */
4252 		head = intel_ring_wrap(ce->ring, rq->tail);
4253 		goto out_replay;
4254 	}
4255 
4256 	/* We still have requests in-flight; the engine should be active */
4257 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4258 
4259 	/* Context has requests still in-flight; it should not be idle! */
4260 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4261 
4262 	rq = active_request(ce->timeline, rq);
4263 	head = intel_ring_wrap(ce->ring, rq->head);
4264 	GEM_BUG_ON(head == ce->ring->tail);
4265 
4266 	/*
4267 	 * If this request hasn't started yet, e.g. it is waiting on a
4268 	 * semaphore, we need to avoid skipping the request or else we
4269 	 * break the signaling chain. However, if the context is corrupt
4270 	 * the request will not restart and we will be stuck with a wedged
4271 	 * device. It is quite often the case that if we issue a reset
4272 	 * while the GPU is loading the context image, that the context
4273 	 * image becomes corrupt.
4274 	 *
4275 	 * Otherwise, if we have not started yet, the request should replay
4276 	 * perfectly and we do not need to flag the result as being erroneous.
4277 	 */
4278 	if (!i915_request_started(rq))
4279 		goto out_replay;
4280 
4281 	/*
4282 	 * If the request was innocent, we leave the request in the ELSP
4283 	 * and will try to replay it on restarting. The context image may
4284 	 * have been corrupted by the reset, in which case we may have
4285 	 * to service a new GPU hang, but more likely we can continue on
4286 	 * without impact.
4287 	 *
4288 	 * If the request was guilty, we presume the context is corrupt
4289 	 * and have to at least restore the RING register in the context
4290 	 * image back to the expected values to skip over the guilty request.
4291 	 */
4292 	__i915_request_reset(rq, stalled);
4293 
4294 	/*
4295 	 * We want a simple context + ring to execute the breadcrumb update.
4296 	 * We cannot rely on the context being intact across the GPU hang,
4297 	 * so clear it and rebuild just what we need for the breadcrumb.
4298 	 * All pending requests for this context will be zapped, and any
4299 	 * future request will be after userspace has had the opportunity
4300 	 * to recreate its own state.
4301 	 */
4302 out_replay:
4303 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4304 		     head, ce->ring->tail);
4305 	__execlists_reset_reg_state(ce, engine);
4306 	__execlists_update_reg_state(ce, engine, head);
4307 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4308 
4309 unwind:
4310 	/* Push back any incomplete requests for replay after the reset. */
4311 	cancel_port_requests(execlists);
4312 	__unwind_incomplete_requests(engine);
4313 }
4314 
4315 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4316 {
4317 	unsigned long flags;
4318 
4319 	ENGINE_TRACE(engine, "\n");
4320 
4321 	spin_lock_irqsave(&engine->active.lock, flags);
4322 
4323 	__execlists_reset(engine, stalled);
4324 
4325 	spin_unlock_irqrestore(&engine->active.lock, flags);
4326 }
4327 
4328 static void nop_submission_tasklet(unsigned long data)
4329 {
4330 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4331 
4332 	/* The driver is wedged; don't process any more events. */
4333 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4334 }
4335 
4336 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4337 {
4338 	struct intel_engine_execlists * const execlists = &engine->execlists;
4339 	struct i915_request *rq, *rn;
4340 	struct rb_node *rb;
4341 	unsigned long flags;
4342 
4343 	ENGINE_TRACE(engine, "\n");
4344 
4345 	/*
4346 	 * Before we call engine->cancel_requests(), we should have exclusive
4347 	 * access to the submission state. This is arranged for us by the
4348 	 * caller disabling the interrupt generation, the tasklet and other
4349 	 * threads that may then access the same state, giving us a free hand
4350 	 * to reset state. However, we still need to let lockdep be aware that
4351 	 * we know this state may be accessed in hardirq context, so we
4352 	 * disable the irq around this manipulation and we want to keep
4353 	 * the spinlock focused on its duties and not accidentally conflate
4354 	 * coverage to the submission's irq state. (Similarly, although we
4355 	 * shouldn't need to disable irq around the manipulation of the
4356 	 * submission's irq state, we also wish to remind ourselves that
4357 	 * it is irq state.)
4358 	 */
4359 	spin_lock_irqsave(&engine->active.lock, flags);
4360 
4361 	__execlists_reset(engine, true);
4362 
4363 	/* Mark all executing requests as skipped. */
4364 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4365 		mark_eio(rq);
4366 
4367 	/* Flush the queued requests to the timeline list (for retiring). */
4368 	while ((rb = rb_first_cached(&execlists->queue))) {
4369 		struct i915_priolist *p = to_priolist(rb);
4370 		int i;
4371 
4372 		priolist_for_each_request_consume(rq, rn, p, i) {
4373 			mark_eio(rq);
4374 			__i915_request_submit(rq);
4375 		}
4376 
4377 		rb_erase_cached(&p->node, &execlists->queue);
4378 		i915_priolist_free(p);
4379 	}
4380 
4381 	/* On-hold requests will be flushed to timeline upon their release */
4382 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4383 		mark_eio(rq);
4384 
4385 	/* Cancel all attached virtual engines */
4386 	while ((rb = rb_first_cached(&execlists->virtual))) {
4387 		struct virtual_engine *ve =
4388 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4389 
4390 		rb_erase_cached(rb, &execlists->virtual);
4391 		RB_CLEAR_NODE(rb);
4392 
4393 		spin_lock(&ve->base.active.lock);
4394 		rq = fetch_and_zero(&ve->request);
4395 		if (rq) {
4396 			mark_eio(rq);
4397 
4398 			rq->engine = engine;
4399 			__i915_request_submit(rq);
4400 			i915_request_put(rq);
4401 
4402 			ve->base.execlists.queue_priority_hint = INT_MIN;
4403 		}
4404 		spin_unlock(&ve->base.active.lock);
4405 	}
4406 
4407 	/* Remaining _unready_ requests will be nop'ed when submitted */
4408 
4409 	execlists->queue_priority_hint = INT_MIN;
4410 	execlists->queue = RB_ROOT_CACHED;
4411 
4412 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4413 	execlists->tasklet.func = nop_submission_tasklet;
4414 
4415 	spin_unlock_irqrestore(&engine->active.lock, flags);
4416 }
4417 
4418 static void execlists_reset_finish(struct intel_engine_cs *engine)
4419 {
4420 	struct intel_engine_execlists * const execlists = &engine->execlists;
4421 
4422 	/*
4423 	 * After a GPU reset, we may have requests to replay. Do so now while
4424 	 * we still have the forcewake to be sure that the GPU is not allowed
4425 	 * to sleep before we restart and reload a context.
4426 	 */
4427 	GEM_BUG_ON(!reset_in_progress(execlists));
4428 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4429 		execlists->tasklet.func(execlists->tasklet.data);
4430 
4431 	if (__tasklet_enable(&execlists->tasklet))
4432 		/* And kick in case we missed a new request submission. */
4433 		tasklet_hi_schedule(&execlists->tasklet);
4434 	ENGINE_TRACE(engine, "depth->%d\n",
4435 		     atomic_read(&execlists->tasklet.count));
4436 }
4437 
4438 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4439 				    u64 offset, u32 len,
4440 				    const unsigned int flags)
4441 {
4442 	u32 *cs;
4443 
4444 	cs = intel_ring_begin(rq, 4);
4445 	if (IS_ERR(cs))
4446 		return PTR_ERR(cs);
4447 
4448 	/*
4449 	 * WaDisableCtxRestoreArbitration:bdw,chv
4450 	 *
4451 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4452 	 * particular on all the gens that do not need the w/a at all!); if
4453 	 * we took care to make sure that on every switch into this context
4454 	 * (both ordinary and for preemption) arbitration was enabled, we
4455 	 * would be fine. However, for gen8 there is another w/a that
4456 	 * requires us to not preempt inside GPGPU execution, so we keep
4457 	 * arbitration disabled for gen8 batches. Arbitration will be
4458 	 * re-enabled before we close the request
4459 	 * (engine->emit_fini_breadcrumb).
4460 	 */
4461 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4462 
4463 	/* FIXME(BDW+): Address space and security selectors. */
4464 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4465 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4466 	*cs++ = lower_32_bits(offset);
4467 	*cs++ = upper_32_bits(offset);
4468 
4469 	intel_ring_advance(rq, cs);
4470 
4471 	return 0;
4472 }
4473 
4474 static int gen8_emit_bb_start(struct i915_request *rq,
4475 			      u64 offset, u32 len,
4476 			      const unsigned int flags)
4477 {
4478 	u32 *cs;
4479 
4480 	cs = intel_ring_begin(rq, 6);
4481 	if (IS_ERR(cs))
4482 		return PTR_ERR(cs);
4483 
4484 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4485 
4486 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4487 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4488 	*cs++ = lower_32_bits(offset);
4489 	*cs++ = upper_32_bits(offset);
4490 
4491 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4492 	*cs++ = MI_NOOP;
4493 
4494 	intel_ring_advance(rq, cs);
4495 
4496 	return 0;
4497 }
4498 
4499 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4500 {
4501 	ENGINE_WRITE(engine, RING_IMR,
4502 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4503 	ENGINE_POSTING_READ(engine, RING_IMR);
4504 }
4505 
4506 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4507 {
4508 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4509 }
4510 
4511 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4512 {
4513 	u32 cmd, *cs;
4514 
4515 	cs = intel_ring_begin(request, 4);
4516 	if (IS_ERR(cs))
4517 		return PTR_ERR(cs);
4518 
4519 	cmd = MI_FLUSH_DW + 1;
4520 
4521 	/* We always require a command barrier so that subsequent
4522 	 * commands, such as breadcrumb interrupts, are strictly ordered
4523 	 * wrt the contents of the write cache being flushed to memory
4524 	 * (and thus being coherent from the CPU).
4525 	 */
4526 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4527 
4528 	if (mode & EMIT_INVALIDATE) {
4529 		cmd |= MI_INVALIDATE_TLB;
4530 		if (request->engine->class == VIDEO_DECODE_CLASS)
4531 			cmd |= MI_INVALIDATE_BSD;
4532 	}
4533 
4534 	*cs++ = cmd;
4535 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4536 	*cs++ = 0; /* upper addr */
4537 	*cs++ = 0; /* value */
4538 	intel_ring_advance(request, cs);
4539 
4540 	return 0;
4541 }
4542 
4543 static int gen8_emit_flush_render(struct i915_request *request,
4544 				  u32 mode)
4545 {
4546 	bool vf_flush_wa = false, dc_flush_wa = false;
4547 	u32 *cs, flags = 0;
4548 	int len;
4549 
4550 	flags |= PIPE_CONTROL_CS_STALL;
4551 
4552 	if (mode & EMIT_FLUSH) {
4553 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4554 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4555 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4556 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4557 	}
4558 
4559 	if (mode & EMIT_INVALIDATE) {
4560 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4561 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4562 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4563 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4564 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4565 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4566 		flags |= PIPE_CONTROL_QW_WRITE;
4567 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4568 
4569 		/*
4570 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4571 		 * pipe control.
4572 		 */
4573 		if (IS_GEN(request->engine->i915, 9))
4574 			vf_flush_wa = true;
4575 
4576 		/* WaForGAMHang:kbl */
4577 		if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4578 			dc_flush_wa = true;
4579 	}
4580 
4581 	len = 6;
4582 
4583 	if (vf_flush_wa)
4584 		len += 6;
4585 
4586 	if (dc_flush_wa)
4587 		len += 12;
4588 
4589 	cs = intel_ring_begin(request, len);
4590 	if (IS_ERR(cs))
4591 		return PTR_ERR(cs);
4592 
4593 	if (vf_flush_wa)
4594 		cs = gen8_emit_pipe_control(cs, 0, 0);
4595 
4596 	if (dc_flush_wa)
4597 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4598 					    0);
4599 
4600 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4601 
4602 	if (dc_flush_wa)
4603 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4604 
4605 	intel_ring_advance(request, cs);
4606 
4607 	return 0;
4608 }
4609 
4610 static int gen11_emit_flush_render(struct i915_request *request,
4611 				   u32 mode)
4612 {
4613 	if (mode & EMIT_FLUSH) {
4614 		u32 *cs;
4615 		u32 flags = 0;
4616 
4617 		flags |= PIPE_CONTROL_CS_STALL;
4618 
4619 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4620 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4621 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4622 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4623 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4624 		flags |= PIPE_CONTROL_QW_WRITE;
4625 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4626 
4627 		cs = intel_ring_begin(request, 6);
4628 		if (IS_ERR(cs))
4629 			return PTR_ERR(cs);
4630 
4631 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4632 		intel_ring_advance(request, cs);
4633 	}
4634 
4635 	if (mode & EMIT_INVALIDATE) {
4636 		u32 *cs;
4637 		u32 flags = 0;
4638 
4639 		flags |= PIPE_CONTROL_CS_STALL;
4640 
4641 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4642 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4643 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4644 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4645 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4646 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4647 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4648 		flags |= PIPE_CONTROL_QW_WRITE;
4649 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4650 
4651 		cs = intel_ring_begin(request, 6);
4652 		if (IS_ERR(cs))
4653 			return PTR_ERR(cs);
4654 
4655 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4656 		intel_ring_advance(request, cs);
4657 	}
4658 
4659 	return 0;
4660 }
4661 
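/*
 * On gen12, MI_ARB_CHECK doubles as the pre-parser toggle: bit 8 is the
 * mask enabling the update and bit 0 carries the requested disable state.
 */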
4662 static u32 preparser_disable(bool state)
4663 {
4664 	return MI_ARB_CHECK | 1 << 8 | state;
4665 }
4666 
4667 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4668 {
4669 	static const i915_reg_t vd[] = {
4670 		GEN12_VD0_AUX_NV,
4671 		GEN12_VD1_AUX_NV,
4672 		GEN12_VD2_AUX_NV,
4673 		GEN12_VD3_AUX_NV,
4674 	};
4675 
4676 	static const i915_reg_t ve[] = {
4677 		GEN12_VE0_AUX_NV,
4678 		GEN12_VE1_AUX_NV,
4679 	};
4680 
4681 	if (engine->class == VIDEO_DECODE_CLASS)
4682 		return vd[engine->instance];
4683 
4684 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4685 		return ve[engine->instance];
4686 
4687 	GEM_BUG_ON("unknown aux_inv_reg\n");
4688 
4689 	return INVALID_MMIO_REG;
4690 }
4691 
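/*
 * Invalidate the AUX table (compression metadata) by writing AUX_INV into
 * the engine's AUX_NV register with an LRI; see hsdes: 1809175790 at the
 * call sites.
 */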
4692 static u32 *
4693 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4694 {
4695 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4696 	*cs++ = i915_mmio_reg_offset(inv_reg);
4697 	*cs++ = AUX_INV;
4698 	*cs++ = MI_NOOP;
4699 
4700 	return cs;
4701 }
4702 
4703 static int gen12_emit_flush_render(struct i915_request *request,
4704 				   u32 mode)
4705 {
4706 	if (mode & EMIT_FLUSH) {
4707 		u32 flags = 0;
4708 		u32 *cs;
4709 
4710 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4711 		flags |= PIPE_CONTROL_FLUSH_L3;
4712 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4713 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4714 		/* Wa_1409600907:tgl */
4715 		flags |= PIPE_CONTROL_DEPTH_STALL;
4716 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4717 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4718 
4719 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4720 		flags |= PIPE_CONTROL_QW_WRITE;
4721 
4722 		flags |= PIPE_CONTROL_CS_STALL;
4723 
4724 		cs = intel_ring_begin(request, 6);
4725 		if (IS_ERR(cs))
4726 			return PTR_ERR(cs);
4727 
4728 		cs = gen12_emit_pipe_control(cs,
4729 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4730 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4731 		intel_ring_advance(request, cs);
4732 	}
4733 
4734 	if (mode & EMIT_INVALIDATE) {
4735 		u32 flags = 0;
4736 		u32 *cs;
4737 
4738 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4739 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4740 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4741 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4742 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4743 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4744 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4745 
4746 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4747 		flags |= PIPE_CONTROL_QW_WRITE;
4748 
4749 		flags |= PIPE_CONTROL_CS_STALL;
4750 
4751 		cs = intel_ring_begin(request, 8 + 4);
4752 		if (IS_ERR(cs))
4753 			return PTR_ERR(cs);
4754 
4755 		/*
4756 		 * Prevent the pre-parser from skipping past the TLB
4757 		 * invalidate and loading a stale page for the batch
4758 		 * buffer / request payload.
4759 		 */
4760 		*cs++ = preparser_disable(true);
4761 
4762 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4763 
4764 		/* hsdes: 1809175790 */
4765 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4766 
4767 		*cs++ = preparser_disable(false);
4768 		intel_ring_advance(request, cs);
4769 	}
4770 
4771 	return 0;
4772 }
4773 
4774 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4775 {
4776 	intel_engine_mask_t aux_inv = 0;
4777 	u32 cmd, *cs;
4778 
4779 	cmd = 4;
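	/*
	 * Reserve ring space: 4 dwords for MI_FLUSH_DW, 2 for the pre-parser
	 * toggles around an invalidate, plus an LRI header, a reg/value pair
	 * per engine and a trailing NOOP for the AUX invalidations.
	 */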
4780 	if (mode & EMIT_INVALIDATE)
4781 		cmd += 2;
4782 	if (mode & EMIT_INVALIDATE)
4783 		aux_inv = request->engine->mask & ~BIT(BCS0);
4784 	if (aux_inv)
4785 		cmd += 2 * hweight8(aux_inv) + 2;
4786 
4787 	cs = intel_ring_begin(request, cmd);
4788 	if (IS_ERR(cs))
4789 		return PTR_ERR(cs);
4790 
4791 	if (mode & EMIT_INVALIDATE)
4792 		*cs++ = preparser_disable(true);
4793 
4794 	cmd = MI_FLUSH_DW + 1;
4795 
4796 	/* We always require a command barrier so that subsequent
4797 	 * commands, such as breadcrumb interrupts, are strictly ordered
4798 	 * wrt the contents of the write cache being flushed to memory
4799 	 * (and thus being coherent from the CPU).
4800 	 */
4801 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4802 
4803 	if (mode & EMIT_INVALIDATE) {
4804 		cmd |= MI_INVALIDATE_TLB;
4805 		if (request->engine->class == VIDEO_DECODE_CLASS)
4806 			cmd |= MI_INVALIDATE_BSD;
4807 	}
4808 
4809 	*cs++ = cmd;
4810 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4811 	*cs++ = 0; /* upper addr */
4812 	*cs++ = 0; /* value */
4813 
4814 	if (aux_inv) { /* hsdes: 1809175790 */
4815 		struct intel_engine_cs *engine;
4816 		unsigned int tmp;
4817 
4818 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4819 		for_each_engine_masked(engine, request->engine->gt,
4820 				       aux_inv, tmp) {
4821 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4822 			*cs++ = AUX_INV;
4823 		}
4824 		*cs++ = MI_NOOP;
4825 	}
4826 
4827 	if (mode & EMIT_INVALIDATE)
4828 		*cs++ = preparser_disable(false);
4829 
4830 	intel_ring_advance(request, cs);
4831 
4832 	return 0;
4833 }
4834 
4835 static void assert_request_valid(struct i915_request *rq)
4836 {
4837 	struct intel_ring *ring __maybe_unused = rq->ring;
4838 
4839 	/* Can we unwind this request without appearing to go forwards? */
4840 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4841 }
4842 
4843 /*
4844  * Reserve space for 2 NOOPs at the end of each request to be
4845  * used as a workaround for not being allowed to do lite
4846  * restore with HEAD==TAIL (WaIdleLiteRestore).
4847  */
4848 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4849 {
4850 	/* Ensure there's always at least one preemption point per-request. */
4851 	*cs++ = MI_ARB_CHECK;
4852 	*cs++ = MI_NOOP;
4853 	request->wa_tail = intel_ring_offset(request, cs);
4854 
4855 	/* Check that entire request is less than half the ring */
4856 	assert_request_valid(request);
4857 
4858 	return cs;
4859 }
4860 
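/*
 * Preempt-to-busy: after the breadcrumb, spin on the per-engine preemption
 * semaphore (see ring_set_paused()) so that the context does not run ahead
 * into the next request while the ELSP is being rearranged.
 */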
4861 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4862 {
4863 	*cs++ = MI_SEMAPHORE_WAIT |
4864 		MI_SEMAPHORE_GLOBAL_GTT |
4865 		MI_SEMAPHORE_POLL |
4866 		MI_SEMAPHORE_SAD_EQ_SDD;
4867 	*cs++ = 0;
4868 	*cs++ = intel_hws_preempt_address(request->engine);
4869 	*cs++ = 0;
4870 
4871 	return cs;
4872 }
4873 
4874 static __always_inline u32*
4875 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4876 {
4877 	*cs++ = MI_USER_INTERRUPT;
4878 
4879 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4880 	if (intel_engine_has_semaphores(request->engine))
4881 		cs = emit_preempt_busywait(request, cs);
4882 
4883 	request->tail = intel_ring_offset(request, cs);
4884 	assert_ring_tail_valid(request->ring, request->tail);
4885 
4886 	return gen8_emit_wa_tail(request, cs);
4887 }
4888 
4889 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4890 {
4891 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4892 
4893 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4894 }
4895 
4896 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4897 {
4898 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4899 }
4900 
4901 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4902 {
4903 	cs = gen8_emit_pipe_control(cs,
4904 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4905 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4906 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4907 				    0);
4908 
4909 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4910 	cs = gen8_emit_ggtt_write_rcs(cs,
4911 				      request->fence.seqno,
4912 				      i915_request_active_timeline(request)->hwsp_offset,
4913 				      PIPE_CONTROL_FLUSH_ENABLE |
4914 				      PIPE_CONTROL_CS_STALL);
4915 
4916 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4917 }
4918 
4919 static u32 *
4920 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4921 {
4922 	cs = gen8_emit_ggtt_write_rcs(cs,
4923 				      request->fence.seqno,
4924 				      i915_request_active_timeline(request)->hwsp_offset,
4925 				      PIPE_CONTROL_CS_STALL |
4926 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4927 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4928 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4929 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4930 				      PIPE_CONTROL_FLUSH_ENABLE);
4931 
4932 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4933 }
4934 
4935 /*
4936  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4937  * flush and will continue pre-fetching the instructions after it before the
4938  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4939  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4940  * of the next request before the memory has been flushed, we're guaranteed that
4941  * we won't access the batch itself too early.
4942  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4943  * so, if the current request is modifying an instruction in the next request on
4944  * the same intel_context, we might pre-fetch and then execute the pre-update
4945  * instruction. To avoid this, the users of self-modifying code should either
4946  * disable the parser around the code emitting the memory writes, via a new flag
4947  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4948  * the in-kernel use-cases we've opted to use a separate context, see
4949  * reloc_gpu() as an example.
4950  * All the above applies only to the instructions themselves. Non-inline data
4951  * used by the instructions is not pre-fetched.
4952  */
4953 
4954 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4955 {
4956 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4957 		MI_SEMAPHORE_GLOBAL_GTT |
4958 		MI_SEMAPHORE_POLL |
4959 		MI_SEMAPHORE_SAD_EQ_SDD;
4960 	*cs++ = 0;
4961 	*cs++ = intel_hws_preempt_address(request->engine);
4962 	*cs++ = 0;
4963 	*cs++ = 0;
4964 	*cs++ = MI_NOOP;
4965 
4966 	return cs;
4967 }
4968 
4969 static __always_inline u32*
4970 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4971 {
4972 	*cs++ = MI_USER_INTERRUPT;
4973 
4974 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4975 	if (intel_engine_has_semaphores(request->engine))
4976 		cs = gen12_emit_preempt_busywait(request, cs);
4977 
4978 	request->tail = intel_ring_offset(request, cs);
4979 	assert_ring_tail_valid(request->ring, request->tail);
4980 
4981 	return gen8_emit_wa_tail(request, cs);
4982 }
4983 
4984 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4985 {
4986 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4987 }
4988 
4989 static u32 *
4990 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4991 {
4992 	cs = gen12_emit_ggtt_write_rcs(cs,
4993 				       request->fence.seqno,
4994 				       i915_request_active_timeline(request)->hwsp_offset,
4995 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4996 				       PIPE_CONTROL_CS_STALL |
4997 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4998 				       PIPE_CONTROL_FLUSH_L3 |
4999 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5000 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5001 				       /* Wa_1409600907:tgl */
5002 				       PIPE_CONTROL_DEPTH_STALL |
5003 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
5004 				       PIPE_CONTROL_FLUSH_ENABLE);
5005 
5006 	return gen12_emit_fini_breadcrumb_tail(request, cs);
5007 }
5008 
5009 static void execlists_park(struct intel_engine_cs *engine)
5010 {
5011 	cancel_timer(&engine->execlists.timer);
5012 	cancel_timer(&engine->execlists.preempt);
5013 }
5014 
5015 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5016 {
5017 	engine->submit_request = execlists_submit_request;
5018 	engine->schedule = i915_schedule;
5019 	engine->execlists.tasklet.func = execlists_submission_tasklet;
5020 
5021 	engine->reset.prepare = execlists_reset_prepare;
5022 	engine->reset.rewind = execlists_reset_rewind;
5023 	engine->reset.cancel = execlists_reset_cancel;
5024 	engine->reset.finish = execlists_reset_finish;
5025 
5026 	engine->park = execlists_park;
5027 	engine->unpark = NULL;
5028 
5029 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5030 	if (!intel_vgpu_active(engine->i915)) {
5031 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5032 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5033 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5034 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5035 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5036 		}
5037 	}
5038 
5039 	if (INTEL_GEN(engine->i915) >= 12)
5040 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5041 
5042 	if (intel_engine_has_preemption(engine))
5043 		engine->emit_bb_start = gen8_emit_bb_start;
5044 	else
5045 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5046 }
5047 
5048 static void execlists_shutdown(struct intel_engine_cs *engine)
5049 {
5050 	/* Synchronise with residual timers and any softirq they raise */
5051 	del_timer_sync(&engine->execlists.timer);
5052 	del_timer_sync(&engine->execlists.preempt);
5053 	tasklet_kill(&engine->execlists.tasklet);
5054 }
5055 
5056 static void execlists_release(struct intel_engine_cs *engine)
5057 {
5058 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5059 
5060 	execlists_shutdown(engine);
5061 
5062 	intel_engine_cleanup_common(engine);
5063 	lrc_destroy_wa_ctx(engine);
5064 }
5065 
5066 static void
5067 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5068 {
5069 	/* Default vfuncs which can be overridden by each engine. */
5070 
5071 	engine->resume = execlists_resume;
5072 
5073 	engine->cops = &execlists_context_ops;
5074 	engine->request_alloc = execlists_request_alloc;
5075 
5076 	engine->emit_flush = gen8_emit_flush;
5077 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5078 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5079 	if (INTEL_GEN(engine->i915) >= 12) {
5080 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5081 		engine->emit_flush = gen12_emit_flush;
5082 	}
5083 	engine->set_default_submission = intel_execlists_set_default_submission;
5084 
5085 	if (INTEL_GEN(engine->i915) < 11) {
5086 		engine->irq_enable = gen8_logical_ring_enable_irq;
5087 		engine->irq_disable = gen8_logical_ring_disable_irq;
5088 	} else {
5089 		/*
5090 		 * TODO: On Gen11 the interrupt masks need to be clear
5091 		 * to allow C6 entry. Keep the interrupts enabled and
5092 		 * take the hit of generating extra interrupts until a
5093 		 * more refined solution exists.
5094 		 */
5095 	}
5096 }
5097 
5098 static inline void
5099 logical_ring_default_irqs(struct intel_engine_cs *engine)
5100 {
5101 	unsigned int shift = 0;
5102 
5103 	if (INTEL_GEN(engine->i915) < 11) {
5104 		const u8 irq_shifts[] = {
5105 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5106 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5107 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5108 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5109 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5110 		};
5111 
5112 		shift = irq_shifts[engine->id];
5113 	}
5114 
5115 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5116 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5117 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5118 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5119 }
5120 
5121 static void rcs_submission_override(struct intel_engine_cs *engine)
5122 {
5123 	switch (INTEL_GEN(engine->i915)) {
5124 	case 12:
5125 		engine->emit_flush = gen12_emit_flush_render;
5126 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5127 		break;
5128 	case 11:
5129 		engine->emit_flush = gen11_emit_flush_render;
5130 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5131 		break;
5132 	default:
5133 		engine->emit_flush = gen8_emit_flush_render;
5134 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5135 		break;
5136 	}
5137 }
5138 
5139 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5140 {
5141 	struct intel_engine_execlists * const execlists = &engine->execlists;
5142 	struct drm_i915_private *i915 = engine->i915;
5143 	struct intel_uncore *uncore = engine->uncore;
5144 	u32 base = engine->mmio_base;
5145 
5146 	tasklet_init(&engine->execlists.tasklet,
5147 		     execlists_submission_tasklet, (unsigned long)engine);
5148 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5149 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5150 
5151 	logical_ring_default_vfuncs(engine);
5152 	logical_ring_default_irqs(engine);
5153 
5154 	if (engine->class == RENDER_CLASS)
5155 		rcs_submission_override(engine);
5156 
5157 	if (intel_init_workaround_bb(engine))
5158 		/*
5159 		 * We continue even if we fail to initialize the WA batch,
5160 		 * because we only expect rare glitches and nothing critical
5161 		 * enough to prevent us from using the GPU.
5162 		 */
5163 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5164 
5165 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5166 		execlists->submit_reg = uncore->regs +
5167 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5168 		execlists->ctrl_reg = uncore->regs +
5169 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5170 	} else {
5171 		execlists->submit_reg = uncore->regs +
5172 			i915_mmio_reg_offset(RING_ELSP(base));
5173 	}
5174 
5175 	execlists->csb_status =
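	/* Each CSB event is a pair of dwords, read together as a single u64 */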
5176 		(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5177 
5178 	execlists->csb_write =
5179 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5180 
5181 	if (INTEL_GEN(i915) < 11)
5182 		execlists->csb_size = GEN8_CSB_ENTRIES;
5183 	else
5184 		execlists->csb_size = GEN11_CSB_ENTRIES;
5185 
5186 	if (INTEL_GEN(engine->i915) >= 11) {
5187 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5188 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5189 	}
5190 
5191 	/* Finally, take ownership and responsibility for cleanup! */
5192 	engine->sanitize = execlists_sanitize;
5193 	engine->release = execlists_release;
5194 
5195 	return 0;
5196 }
5197 
5198 static void init_common_reg_state(u32 * const regs,
5199 				  const struct intel_engine_cs *engine,
5200 				  const struct intel_ring *ring,
5201 				  bool inhibit)
5202 {
5203 	u32 ctl;
5204 
5205 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5206 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5207 	if (inhibit)
5208 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5209 	if (INTEL_GEN(engine->i915) < 11)
5210 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5211 					   CTX_CTRL_RS_CTX_ENABLE);
5212 	regs[CTX_CONTEXT_CONTROL] = ctl;
5213 
5214 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5215 	regs[CTX_TIMESTAMP] = 0;
5216 }
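
/*
 * The CTX_CONTEXT_CONTROL value above uses the usual masked-bit encoding:
 * the upper 16 bits select which bits the write may change, the lower 16
 * bits carry the new values. For example,
 *
 *   _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH)
 *
 * sets both the mask bit and the value bit, so only that single control
 * bit is updated when the register is written.
 */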
5217 
5218 static void init_wa_bb_reg_state(u32 * const regs,
5219 				 const struct intel_engine_cs *engine)
5220 {
5221 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5222 
5223 	if (wa_ctx->per_ctx.size) {
5224 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5225 
5226 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5227 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5228 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5229 	}
5230 
5231 	if (wa_ctx->indirect_ctx.size) {
5232 		lrc_ring_setup_indirect_ctx(regs, engine,
5233 					    i915_ggtt_offset(wa_ctx->vma) +
5234 					    wa_ctx->indirect_ctx.offset,
5235 					    wa_ctx->indirect_ctx.size);
5236 	}
5237 }
5238 
5239 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5240 {
5241 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5242 		/* 64b PPGTT (48-bit canonical):
5243 		 * PDP0_DESCRIPTOR contains the base address of the PML4;
5244 		 * the other PDP descriptors are ignored.
5245 		 */
5246 		ASSIGN_CTX_PML4(ppgtt, regs);
5247 	} else {
5248 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5249 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5250 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5251 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5252 	}
5253 }
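
/*
 * In other words: with a 4-level PPGTT a single pointer (the PML4 address
 * in PDP0) is enough to describe the whole address space, whereas the
 * 3-level layout loads all four page-directory pointers (PDP3..PDP0) into
 * the context image.
 */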
5254 
5255 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5256 {
5257 	if (i915_is_ggtt(vm))
5258 		return i915_vm_to_ggtt(vm)->alias;
5259 	else
5260 		return i915_vm_to_ppgtt(vm);
5261 }
5262 
5263 static void execlists_init_reg_state(u32 *regs,
5264 				     const struct intel_context *ce,
5265 				     const struct intel_engine_cs *engine,
5266 				     const struct intel_ring *ring,
5267 				     bool inhibit)
5268 {
5269 	/*
5270 	 * A context is actually a big batch buffer with several
5271 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5272 	 * values we are setting here are only for the first context restore:
5273 	 * on a subsequent save, the GPU will recreate this batch buffer with new
5274 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5275 	 * we are not initializing here).
5276 	 *
5277 	 * Must keep consistent with virtual_update_register_offsets().
5278 	 */
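	/*
	 * Schematically (illustrative, not the exact image contents), the
	 * state written by set_offsets() looks like:
	 *
	 *   MI_LOAD_REGISTER_IMM(N)
	 *   <reg offset 0> <value 0>
	 *   ...
	 *   <reg offset N-1> <value N-1>
	 *
	 * with the helpers below filling in the few values we care about
	 * (ring control, PPGTT pointers, workaround batch pointers).
	 */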
5279 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5280 
5281 	init_common_reg_state(regs, engine, ring, inhibit);
5282 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5283 
5284 	init_wa_bb_reg_state(regs, engine);
5285 
5286 	__reset_stop_ring(regs, engine);
5287 }
5288 
5289 static int
5290 populate_lr_context(struct intel_context *ce,
5291 		    struct drm_i915_gem_object *ctx_obj,
5292 		    struct intel_engine_cs *engine,
5293 		    struct intel_ring *ring)
5294 {
5295 	bool inhibit = true;
5296 	void *vaddr;
5297 
5298 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5299 	if (IS_ERR(vaddr)) {
5300 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5301 		return PTR_ERR(vaddr);
5302 	}
5303 
5304 	set_redzone(vaddr, engine);
5305 
5306 	if (engine->default_state) {
5307 		shmem_read(engine->default_state, 0,
5308 			   vaddr, engine->context_size);
5309 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5310 		inhibit = false;
5311 	}
5312 
5313 	/* Clear the ppHWSP (inc. per-context counters) */
5314 	memset(vaddr, 0, PAGE_SIZE);
5315 
5316 	/*
5317 	 * The second page of the context object contains some registers which
5318 	 * must be set up prior to the first execution.
5319 	 */
5320 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5321 				 ce, engine, ring, inhibit);
5322 
5323 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5324 	i915_gem_object_unpin_map(ctx_obj);
5325 	return 0;
5326 }
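
/*
 * Resulting layout of the context object as populated above (sketch):
 *
 *   page 0               : per-process HWSP, cleared here
 *   LRC_STATE_OFFSET ... : register state initialized by
 *                          execlists_init_reg_state()
 *   [debug builds]       : trailing redzone written by set_redzone()
 */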
5327 
5328 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5329 {
5330 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5331 
5332 	return intel_timeline_create_from_engine(ce->engine,
5333 						 page_unmask_bits(tl));
5334 }
5335 
5336 static int __execlists_context_alloc(struct intel_context *ce,
5337 				     struct intel_engine_cs *engine)
5338 {
5339 	struct drm_i915_gem_object *ctx_obj;
5340 	struct intel_ring *ring;
5341 	struct i915_vma *vma;
5342 	u32 context_size;
5343 	int ret;
5344 
5345 	GEM_BUG_ON(ce->state);
5346 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5347 
5348 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5349 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5350 
5351 	if (INTEL_GEN(engine->i915) == 12) {
5352 		ce->wa_bb_page = context_size / PAGE_SIZE;
5353 		context_size += PAGE_SIZE;
5354 	}
5355 
5356 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5357 	if (IS_ERR(ctx_obj))
5358 		return PTR_ERR(ctx_obj);
5359 
5360 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5361 	if (IS_ERR(vma)) {
5362 		ret = PTR_ERR(vma);
5363 		goto error_deref_obj;
5364 	}
5365 
5366 	if (!page_mask_bits(ce->timeline)) {
5367 		struct intel_timeline *tl;
5368 
5369 		/*
5370 		 * Use the static global HWSP for the kernel context, and
5371 		 * a dynamically allocated cacheline for everyone else.
5372 		 */
5373 		if (unlikely(ce->timeline))
5374 			tl = pinned_timeline(ce);
5375 		else
5376 			tl = intel_timeline_create(engine->gt);
5377 		if (IS_ERR(tl)) {
5378 			ret = PTR_ERR(tl);
5379 			goto error_deref_obj;
5380 		}
5381 
5382 		ce->timeline = tl;
5383 	}
5384 
5385 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5386 	if (IS_ERR(ring)) {
5387 		ret = PTR_ERR(ring);
5388 		goto error_deref_obj;
5389 	}
5390 
5391 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5392 	if (ret) {
5393 		drm_dbg(&engine->i915->drm,
5394 			"Failed to populate LRC: %d\n", ret);
5395 		goto error_ring_free;
5396 	}
5397 
5398 	ce->ring = ring;
5399 	ce->state = vma;
5400 
5401 	return 0;
5402 
5403 error_ring_free:
5404 	intel_ring_put(ring);
5405 error_deref_obj:
5406 	i915_gem_object_put(ctx_obj);
5407 	return ret;
5408 }
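
/*
 * Sizing sketch for the allocation above: engine->context_size rounded up
 * to I915_GTT_PAGE_SIZE, plus one extra page for the redzone on
 * CONFIG_DRM_I915_DEBUG_GEM builds, plus one more page on Gen12 for the
 * per-context workaround batch (its page index is remembered in
 * ce->wa_bb_page before the size is bumped).
 */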
5409 
5410 static struct list_head *virtual_queue(struct virtual_engine *ve)
5411 {
5412 	return &ve->base.execlists.default_priolist.requests[0];
5413 }
5414 
5415 static void virtual_context_destroy(struct kref *kref)
5416 {
5417 	struct virtual_engine *ve =
5418 		container_of(kref, typeof(*ve), context.ref);
5419 	unsigned int n;
5420 
5421 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5422 	GEM_BUG_ON(ve->request);
5423 	GEM_BUG_ON(ve->context.inflight);
5424 
5425 	for (n = 0; n < ve->num_siblings; n++) {
5426 		struct intel_engine_cs *sibling = ve->siblings[n];
5427 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5428 		unsigned long flags;
5429 
5430 		if (RB_EMPTY_NODE(node))
5431 			continue;
5432 
5433 		spin_lock_irqsave(&sibling->active.lock, flags);
5434 
5435 		/* Detachment is lazily performed in the execlists tasklet */
5436 		if (!RB_EMPTY_NODE(node))
5437 			rb_erase_cached(node, &sibling->execlists.virtual);
5438 
5439 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5440 	}
5441 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5442 
5443 	if (ve->context.state)
5444 		__execlists_context_fini(&ve->context);
5445 	intel_context_fini(&ve->context);
5446 
5447 	intel_engine_free_request_pool(&ve->base);
5448 
5449 	kfree(ve->bonds);
5450 	kfree(ve);
5451 }
5452 
5453 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5454 {
5455 	int swp;
5456 
5457 	/*
5458 	 * Pick a random sibling when starting, to help spread the load around.
5459 	 *
5460 	 * New contexts are typically created with exactly the same order
5461 	 * of siblings, and often started in batches. Due to the way we iterate
5462 	 * the array of siblings when submitting requests, sibling[0] is
5463 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5464 	 * randomised across the system, we also help spread the load by the
5465 	 * first engine we inspect being different each time.
5466 	 *
5467 	 * NB: this does not force us to execute on this engine; it will just
5468 	 * typically be the first one we inspect for submission.
5469 	 */
5470 	swp = prandom_u32_max(ve->num_siblings);
5471 	if (swp)
5472 		swap(ve->siblings[swp], ve->siblings[0]);
5473 }
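
/*
 * Example: with 4 siblings, prandom_u32_max(4) yields a value in [0, 4);
 * 0 leaves the original order intact, while any other value swaps that
 * sibling into slot 0 so that different contexts start probing different
 * physical engines.
 */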
5474 
5475 static int virtual_context_alloc(struct intel_context *ce)
5476 {
5477 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5478 
5479 	return __execlists_context_alloc(ce, ve->siblings[0]);
5480 }
5481 
5482 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5483 {
5484 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5485 
5486 	/* Note: we must use a real engine class for setting up reg state */
5487 	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5488 }
5489 
5490 static void virtual_context_enter(struct intel_context *ce)
5491 {
5492 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5493 	unsigned int n;
5494 
5495 	for (n = 0; n < ve->num_siblings; n++)
5496 		intel_engine_pm_get(ve->siblings[n]);
5497 
5498 	intel_timeline_enter(ce->timeline);
5499 }
5500 
5501 static void virtual_context_exit(struct intel_context *ce)
5502 {
5503 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5504 	unsigned int n;
5505 
5506 	intel_timeline_exit(ce->timeline);
5507 
5508 	for (n = 0; n < ve->num_siblings; n++)
5509 		intel_engine_pm_put(ve->siblings[n]);
5510 }
5511 
5512 static const struct intel_context_ops virtual_context_ops = {
5513 	.alloc = virtual_context_alloc,
5514 
5515 	.pre_pin = execlists_context_pre_pin,
5516 	.pin = virtual_context_pin,
5517 	.unpin = execlists_context_unpin,
5518 	.post_unpin = execlists_context_post_unpin,
5519 
5520 	.enter = virtual_context_enter,
5521 	.exit = virtual_context_exit,
5522 
5523 	.destroy = virtual_context_destroy,
5524 };
5525 
5526 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5527 {
5528 	struct i915_request *rq;
5529 	intel_engine_mask_t mask;
5530 
5531 	rq = READ_ONCE(ve->request);
5532 	if (!rq)
5533 		return 0;
5534 
5535 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5536 	mask = rq->execution_mask;
5537 	if (unlikely(!mask)) {
5538 		/* Invalid selection, submit to a random engine in error */
5539 		i915_request_set_error_once(rq, -ENODEV);
5540 		mask = ve->siblings[0]->mask;
5541 	}
5542 
5543 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5544 		     rq->fence.context, rq->fence.seqno,
5545 		     mask, ve->base.execlists.queue_priority_hint);
5546 
5547 	return mask;
5548 }
5549 
5550 static void virtual_submission_tasklet(unsigned long data)
5551 {
5552 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5553 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5554 	intel_engine_mask_t mask;
5555 	unsigned int n;
5556 
5557 	rcu_read_lock();
5558 	mask = virtual_submission_mask(ve);
5559 	rcu_read_unlock();
5560 	if (unlikely(!mask))
5561 		return;
5562 
5563 	local_irq_disable();
5564 	for (n = 0; n < ve->num_siblings; n++) {
5565 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5566 		struct ve_node * const node = &ve->nodes[sibling->id];
5567 		struct rb_node **parent, *rb;
5568 		bool first;
5569 
5570 		if (!READ_ONCE(ve->request))
5571 			break; /* already handled by a sibling's tasklet */
5572 
5573 		if (unlikely(!(mask & sibling->mask))) {
5574 			if (!RB_EMPTY_NODE(&node->rb)) {
5575 				spin_lock(&sibling->active.lock);
5576 				rb_erase_cached(&node->rb,
5577 						&sibling->execlists.virtual);
5578 				RB_CLEAR_NODE(&node->rb);
5579 				spin_unlock(&sibling->active.lock);
5580 			}
5581 			continue;
5582 		}
5583 
5584 		spin_lock(&sibling->active.lock);
5585 
5586 		if (!RB_EMPTY_NODE(&node->rb)) {
5587 			/*
5588 			 * Cheat and avoid rebalancing the tree if we can
5589 			 * reuse this node in situ.
5590 			 */
5591 			first = rb_first_cached(&sibling->execlists.virtual) ==
5592 				&node->rb;
5593 			if (prio == node->prio || (prio > node->prio && first))
5594 				goto submit_engine;
5595 
5596 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5597 		}
5598 
5599 		rb = NULL;
5600 		first = true;
5601 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5602 		while (*parent) {
5603 			struct ve_node *other;
5604 
5605 			rb = *parent;
5606 			other = rb_entry(rb, typeof(*other), rb);
5607 			if (prio > other->prio) {
5608 				parent = &rb->rb_left;
5609 			} else {
5610 				parent = &rb->rb_right;
5611 				first = false;
5612 			}
5613 		}
5614 
5615 		rb_link_node(&node->rb, rb, parent);
5616 		rb_insert_color_cached(&node->rb,
5617 				       &sibling->execlists.virtual,
5618 				       first);
5619 
5620 submit_engine:
5621 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5622 		node->prio = prio;
5623 		if (first && prio > sibling->execlists.queue_priority_hint)
5624 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5625 
5626 		spin_unlock(&sibling->active.lock);
5627 	}
5628 	local_irq_enable();
5629 }
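
/*
 * Note on the tree manipulation above: each sibling keeps a cached rbtree
 * of virtual engines ordered by priority, with higher priority to the
 * left, so rb_first_cached() is always the most urgent candidate. "first"
 * tracks whether our node became (or remained) that leftmost entry, in
 * which case the sibling's tasklet is kicked if we now beat its
 * queue_priority_hint.
 */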
5630 
5631 static void virtual_submit_request(struct i915_request *rq)
5632 {
5633 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5634 	struct i915_request *old;
5635 	unsigned long flags;
5636 
5637 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5638 		     rq->fence.context,
5639 		     rq->fence.seqno);
5640 
5641 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5642 
5643 	spin_lock_irqsave(&ve->base.active.lock, flags);
5644 
5645 	old = ve->request;
5646 	if (old) { /* background completion event from preempt-to-busy */
5647 		GEM_BUG_ON(!i915_request_completed(old));
5648 		__i915_request_submit(old);
5649 		i915_request_put(old);
5650 	}
5651 
5652 	if (i915_request_completed(rq)) {
5653 		__i915_request_submit(rq);
5654 
5655 		ve->base.execlists.queue_priority_hint = INT_MIN;
5656 		ve->request = NULL;
5657 	} else {
5658 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5659 		ve->request = i915_request_get(rq);
5660 
5661 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5662 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5663 
5664 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5665 	}
5666 
5667 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5668 }
5669 
5670 static struct ve_bond *
5671 virtual_find_bond(struct virtual_engine *ve,
5672 		  const struct intel_engine_cs *master)
5673 {
5674 	int i;
5675 
5676 	for (i = 0; i < ve->num_bonds; i++) {
5677 		if (ve->bonds[i].master == master)
5678 			return &ve->bonds[i];
5679 	}
5680 
5681 	return NULL;
5682 }
5683 
5684 static void
5685 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5686 {
5687 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5688 	intel_engine_mask_t allowed, exec;
5689 	struct ve_bond *bond;
5690 
5691 	allowed = ~to_request(signal)->engine->mask;
5692 
5693 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5694 	if (bond)
5695 		allowed &= bond->sibling_mask;
5696 
5697 	/* Restrict the bonded request to run on only the available engines */
5698 	exec = READ_ONCE(rq->execution_mask);
5699 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5700 		;
5701 
5702 	/* Prevent the master from being re-run on the bonded engines */
5703 	to_request(signal)->execution_mask &= ~allowed;
5704 }
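
/*
 * Worked example with illustrative masks: suppose the master ran on an
 * engine with mask 0x4 and the bond for that master allows siblings 0x3.
 * Then allowed = ~0x4 & 0x3 = 0x3, the bonded request's execution_mask is
 * narrowed to exec & 0x3, and the master's own execution_mask has 0x3
 * cleared so it cannot be re-run on those bonded engines.
 */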
5705 
5706 struct intel_context *
5707 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5708 			       unsigned int count)
5709 {
5710 	struct virtual_engine *ve;
5711 	unsigned int n;
5712 	int err;
5713 
5714 	if (count == 0)
5715 		return ERR_PTR(-EINVAL);
5716 
5717 	if (count == 1)
5718 		return intel_context_create(siblings[0]);
5719 
5720 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5721 	if (!ve)
5722 		return ERR_PTR(-ENOMEM);
5723 
5724 	ve->base.i915 = siblings[0]->i915;
5725 	ve->base.gt = siblings[0]->gt;
5726 	ve->base.uncore = siblings[0]->uncore;
5727 	ve->base.id = -1;
5728 
5729 	ve->base.class = OTHER_CLASS;
5730 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5731 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5732 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5733 
5734 	/*
5735 	 * The decision on whether to submit a request using semaphores
5736 	 * depends on the saturated state of the engine. We only compute
5737 	 * this during HW submission of the request, and we need this
5738 	 * state to be applied globally to all requests being submitted
5739 	 * to this engine. Virtual engines encompass more than one physical
5740 	 * engine and so we cannot accurately tell in advance if one of those
5741 	 * engines is already saturated and so cannot afford to use a semaphore
5742 	 * and be pessimized in priority for doing so -- if we are the only
5743 	 * context using semaphores after all other clients have stopped, we
5744 	 * will be starved on the saturated system. Such a global switch for
5745 	 * semaphores is less than ideal, but alas is the current compromise.
5746 	 */
5747 	ve->base.saturated = ALL_ENGINES;
5748 
5749 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5750 
5751 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5752 	intel_engine_init_execlists(&ve->base);
5753 
5754 	ve->base.cops = &virtual_context_ops;
5755 	ve->base.request_alloc = execlists_request_alloc;
5756 
5757 	ve->base.schedule = i915_schedule;
5758 	ve->base.submit_request = virtual_submit_request;
5759 	ve->base.bond_execute = virtual_bond_execute;
5760 
5761 	INIT_LIST_HEAD(virtual_queue(ve));
5762 	ve->base.execlists.queue_priority_hint = INT_MIN;
5763 	tasklet_init(&ve->base.execlists.tasklet,
5764 		     virtual_submission_tasklet,
5765 		     (unsigned long)ve);
5766 
5767 	intel_context_init(&ve->context, &ve->base);
5768 
5769 	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5770 	if (!ve->base.breadcrumbs) {
5771 		err = -ENOMEM;
5772 		goto err_put;
5773 	}
5774 
5775 	for (n = 0; n < count; n++) {
5776 		struct intel_engine_cs *sibling = siblings[n];
5777 
5778 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5779 		if (sibling->mask & ve->base.mask) {
5780 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5781 				  sibling->name);
5782 			err = -EINVAL;
5783 			goto err_put;
5784 		}
5785 
5786 		/*
5787 		 * The virtual engine implementation is tightly coupled to
5788 	 * the execlists backend -- we push requests directly
5789 		 * into a tree inside each physical engine. We could support
5790 		 * layering if we handle cloning of the requests and
5791 		 * submitting a copy into each backend.
5792 		 */
5793 		if (sibling->execlists.tasklet.func !=
5794 		    execlists_submission_tasklet) {
5795 			err = -ENODEV;
5796 			goto err_put;
5797 		}
5798 
5799 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5800 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5801 
5802 		ve->siblings[ve->num_siblings++] = sibling;
5803 		ve->base.mask |= sibling->mask;
5804 
5805 		/*
5806 		 * All physical engines must be compatible for their emission
5807 		 * functions (as we build the instructions during request
5808 		 * construction and do not alter them before submission
5809 		 * on the physical engine). We use the engine class as a guide
5810 		 * here, although that could be refined.
5811 		 */
5812 		if (ve->base.class != OTHER_CLASS) {
5813 			if (ve->base.class != sibling->class) {
5814 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5815 					  sibling->class, ve->base.class);
5816 				err = -EINVAL;
5817 				goto err_put;
5818 			}
5819 			continue;
5820 		}
5821 
5822 		ve->base.class = sibling->class;
5823 		ve->base.uabi_class = sibling->uabi_class;
5824 		snprintf(ve->base.name, sizeof(ve->base.name),
5825 			 "v%dx%d", ve->base.class, count);
5826 		ve->base.context_size = sibling->context_size;
5827 
5828 		ve->base.emit_bb_start = sibling->emit_bb_start;
5829 		ve->base.emit_flush = sibling->emit_flush;
5830 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5831 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5832 		ve->base.emit_fini_breadcrumb_dw =
5833 			sibling->emit_fini_breadcrumb_dw;
5834 
5835 		ve->base.flags = sibling->flags;
5836 	}
5837 
5838 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5839 
5840 	virtual_engine_initial_hint(ve);
5841 	return &ve->context;
5842 
5843 err_put:
5844 	intel_context_put(&ve->context);
5845 	return ERR_PTR(err);
5846 }
5847 
5848 struct intel_context *
5849 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5850 {
5851 	struct virtual_engine *se = to_virtual_engine(src);
5852 	struct intel_context *dst;
5853 
5854 	dst = intel_execlists_create_virtual(se->siblings,
5855 					     se->num_siblings);
5856 	if (IS_ERR(dst))
5857 		return dst;
5858 
5859 	if (se->num_bonds) {
5860 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5861 
5862 		de->bonds = kmemdup(se->bonds,
5863 				    sizeof(*se->bonds) * se->num_bonds,
5864 				    GFP_KERNEL);
5865 		if (!de->bonds) {
5866 			intel_context_put(dst);
5867 			return ERR_PTR(-ENOMEM);
5868 		}
5869 
5870 		de->num_bonds = se->num_bonds;
5871 	}
5872 
5873 	return dst;
5874 }
5875 
5876 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5877 				     const struct intel_engine_cs *master,
5878 				     const struct intel_engine_cs *sibling)
5879 {
5880 	struct virtual_engine *ve = to_virtual_engine(engine);
5881 	struct ve_bond *bond;
5882 	int n;
5883 
5884 	/* Sanity check the sibling is part of the virtual engine */
5885 	for (n = 0; n < ve->num_siblings; n++)
5886 		if (sibling == ve->siblings[n])
5887 			break;
5888 	if (n == ve->num_siblings)
5889 		return -EINVAL;
5890 
5891 	bond = virtual_find_bond(ve, master);
5892 	if (bond) {
5893 		bond->sibling_mask |= sibling->mask;
5894 		return 0;
5895 	}
5896 
5897 	bond = krealloc(ve->bonds,
5898 			sizeof(*bond) * (ve->num_bonds + 1),
5899 			GFP_KERNEL);
5900 	if (!bond)
5901 		return -ENOMEM;
5902 
5903 	bond[ve->num_bonds].master = master;
5904 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5905 
5906 	ve->bonds = bond;
5907 	ve->num_bonds++;
5908 
5909 	return 0;
5910 }
5911 
5912 struct intel_engine_cs *
5913 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5914 				 unsigned int sibling)
5915 {
5916 	struct virtual_engine *ve = to_virtual_engine(engine);
5917 
5918 	if (sibling >= ve->num_siblings)
5919 		return NULL;
5920 
5921 	return ve->siblings[sibling];
5922 }
5923 
5924 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5925 				   struct drm_printer *m,
5926 				   void (*show_request)(struct drm_printer *m,
5927 							struct i915_request *rq,
5928 							const char *prefix),
5929 				   unsigned int max)
5930 {
5931 	const struct intel_engine_execlists *execlists = &engine->execlists;
5932 	struct i915_request *rq, *last;
5933 	unsigned long flags;
5934 	unsigned int count;
5935 	struct rb_node *rb;
5936 
5937 	spin_lock_irqsave(&engine->active.lock, flags);
5938 
5939 	last = NULL;
5940 	count = 0;
5941 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5942 		if (count++ < max - 1)
5943 			show_request(m, rq, "\t\tE ");
5944 		else
5945 			last = rq;
5946 	}
5947 	if (last) {
5948 		if (count > max) {
5949 			drm_printf(m,
5950 				   "\t\t...skipping %d executing requests...\n",
5951 				   count - max);
5952 		}
5953 		show_request(m, last, "\t\tE ");
5954 	}
5955 
5956 	if (execlists->switch_priority_hint != INT_MIN)
5957 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5958 			   READ_ONCE(execlists->switch_priority_hint));
5959 	if (execlists->queue_priority_hint != INT_MIN)
5960 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5961 			   READ_ONCE(execlists->queue_priority_hint));
5962 
5963 	last = NULL;
5964 	count = 0;
5965 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5966 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5967 		int i;
5968 
5969 		priolist_for_each_request(rq, p, i) {
5970 			if (count++ < max - 1)
5971 				show_request(m, rq, "\t\tQ ");
5972 			else
5973 				last = rq;
5974 		}
5975 	}
5976 	if (last) {
5977 		if (count > max) {
5978 			drm_printf(m,
5979 				   "\t\t...skipping %d queued requests...\n",
5980 				   count - max);
5981 		}
5982 		show_request(m, last, "\t\tQ ");
5983 	}
5984 
5985 	last = NULL;
5986 	count = 0;
5987 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5988 		struct virtual_engine *ve =
5989 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5990 		struct i915_request *rq = READ_ONCE(ve->request);
5991 
5992 		if (rq) {
5993 			if (count++ < max - 1)
5994 				show_request(m, rq, "\t\tV ");
5995 			else
5996 				last = rq;
5997 		}
5998 	}
5999 	if (last) {
6000 		if (count > max) {
6001 			drm_printf(m,
6002 				   "\t\t...skipping %d virtual requests...\n",
6003 				   count - max);
6004 		}
6005 		show_request(m, last, "\t\tV ");
6006 	}
6007 
6008 	spin_unlock_irqrestore(&engine->active.lock, flags);
6009 }
6010 
6011 void intel_lr_context_reset(struct intel_engine_cs *engine,
6012 			    struct intel_context *ce,
6013 			    u32 head,
6014 			    bool scrub)
6015 {
6016 	GEM_BUG_ON(!intel_context_is_pinned(ce));
6017 
6018 	/*
6019 	 * We want a simple context + ring to execute the breadcrumb update.
6020 	 * We cannot rely on the context being intact across the GPU hang,
6021 	 * so clear it and rebuild just what we need for the breadcrumb.
6022 	 * All pending requests for this context will be zapped, and any
6023 	 * future request will be after userspace has had the opportunity
6024 	 * to recreate its own state.
6025 	 */
6026 	if (scrub)
6027 		restore_default_state(ce, engine);
6028 
6029 	/* Rerun the request; its payload has been neutered (if guilty). */
6030 	__execlists_update_reg_state(ce, engine, head);
6031 }
6032 
6033 bool
6034 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6035 {
6036 	return engine->set_default_submission ==
6037 	       intel_execlists_set_default_submission;
6038 }
6039 
6040 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6041 #include "selftest_lrc.c"
6042 #endif
6043