xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 2208f39c)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a contexts is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bits submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one requests is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152 
153 #define RING_EXECLIST_QFULL		(1 << 0x2)
154 #define RING_EXECLIST1_VALID		(1 << 0x3)
155 #define RING_EXECLIST0_VALID		(1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
159 
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
166 
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169 
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171 
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID		0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
178 
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon on our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
244 {
245 	if (INTEL_GEN(engine->i915) >= 12)
246 		return 0x60;
247 	else if (INTEL_GEN(engine->i915) >= 9)
248 		return 0x54;
249 	else if (engine->class == RENDER_CLASS)
250 		return 0x58;
251 	else
252 		return -1;
253 }
254 
255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
256 {
257 	if (INTEL_GEN(engine->i915) >= 12)
258 		return 0x74;
259 	else if (INTEL_GEN(engine->i915) >= 9)
260 		return 0x68;
261 	else if (engine->class == RENDER_CLASS)
262 		return 0xd8;
263 	else
264 		return -1;
265 }
266 
267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
268 {
269 	if (INTEL_GEN(engine->i915) >= 12)
270 		return 0x12;
271 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
272 		return 0x18;
273 	else
274 		return -1;
275 }
276 
277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
278 {
279 	int x;
280 
281 	x = lrc_ring_wa_bb_per_ctx(engine);
282 	if (x < 0)
283 		return x;
284 
285 	return x + 2;
286 }
287 
288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
289 {
290 	int x;
291 
292 	x = lrc_ring_indirect_ptr(engine);
293 	if (x < 0)
294 		return x;
295 
296 	return x + 2;
297 }
298 
299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
300 {
301 	if (engine->class != RENDER_CLASS)
302 		return -1;
303 
304 	if (INTEL_GEN(engine->i915) >= 12)
305 		return 0xb6;
306 	else if (INTEL_GEN(engine->i915) >= 11)
307 		return 0xaa;
308 	else
309 		return -1;
310 }
311 
312 static u32
313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
314 {
315 	switch (INTEL_GEN(engine->i915)) {
316 	default:
317 		MISSING_CASE(INTEL_GEN(engine->i915));
318 		fallthrough;
319 	case 12:
320 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
321 	case 11:
322 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
323 	case 10:
324 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
325 	case 9:
326 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
327 	case 8:
328 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
329 	}
330 }
331 
332 static void
333 lrc_ring_setup_indirect_ctx(u32 *regs,
334 			    const struct intel_engine_cs *engine,
335 			    u32 ctx_bb_ggtt_addr,
336 			    u32 size)
337 {
338 	GEM_BUG_ON(!size);
339 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
340 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
341 	regs[lrc_ring_indirect_ptr(engine) + 1] =
342 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
343 
344 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
345 	regs[lrc_ring_indirect_offset(engine) + 1] =
346 		lrc_ring_indirect_offset_default(engine) << 6;
347 }
348 
349 static u32 intel_context_get_runtime(const struct intel_context *ce)
350 {
351 	/*
352 	 * We can use either ppHWSP[16] which is recorded before the context
353 	 * switch (and so excludes the cost of context switches) or use the
354 	 * value from the context image itself, which is saved/restored earlier
355 	 * and so includes the cost of the save.
356 	 */
357 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
358 }
359 
360 static void mark_eio(struct i915_request *rq)
361 {
362 	if (i915_request_completed(rq))
363 		return;
364 
365 	GEM_BUG_ON(i915_request_signaled(rq));
366 
367 	i915_request_set_error_once(rq, -EIO);
368 	i915_request_mark_complete(rq);
369 }
370 
371 static struct i915_request *
372 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
373 {
374 	struct i915_request *active = rq;
375 
376 	rcu_read_lock();
377 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
378 		if (i915_request_completed(rq))
379 			break;
380 
381 		active = rq;
382 	}
383 	rcu_read_unlock();
384 
385 	return active;
386 }
387 
388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
389 {
390 	return (i915_ggtt_offset(engine->status_page.vma) +
391 		I915_GEM_HWS_PREEMPT_ADDR);
392 }
393 
394 static inline void
395 ring_set_paused(const struct intel_engine_cs *engine, int state)
396 {
397 	/*
398 	 * We inspect HWS_PREEMPT with a semaphore inside
399 	 * engine->emit_fini_breadcrumb. If the dword is true,
400 	 * the ring is paused as the semaphore will busywait
401 	 * until the dword is false.
402 	 */
403 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
404 	if (state)
405 		wmb();
406 }
407 
408 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
409 {
410 	return rb_entry(rb, struct i915_priolist, node);
411 }
412 
413 static inline int rq_prio(const struct i915_request *rq)
414 {
415 	return READ_ONCE(rq->sched.attr.priority);
416 }
417 
418 static int effective_prio(const struct i915_request *rq)
419 {
420 	int prio = rq_prio(rq);
421 
422 	/*
423 	 * If this request is special and must not be interrupted at any
424 	 * cost, so be it. Note we are only checking the most recent request
425 	 * in the context and so may be masking an earlier vip request. It
426 	 * is hoped that under the conditions where nopreempt is used, this
427 	 * will not matter (i.e. all requests to that context will be
428 	 * nopreempt for as long as desired).
429 	 */
430 	if (i915_request_has_nopreempt(rq))
431 		prio = I915_PRIORITY_UNPREEMPTABLE;
432 
433 	return prio;
434 }
435 
436 static int queue_prio(const struct intel_engine_execlists *execlists)
437 {
438 	struct i915_priolist *p;
439 	struct rb_node *rb;
440 
441 	rb = rb_first_cached(&execlists->queue);
442 	if (!rb)
443 		return INT_MIN;
444 
445 	/*
446 	 * As the priolist[] are inverted, with the highest priority in [0],
447 	 * we have to flip the index value to become priority.
448 	 */
449 	p = to_priolist(rb);
450 	if (!I915_USER_PRIORITY_SHIFT)
451 		return p->priority;
452 
453 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
454 }
455 
456 static inline bool need_preempt(const struct intel_engine_cs *engine,
457 				const struct i915_request *rq,
458 				struct rb_node *rb)
459 {
460 	int last_prio;
461 
462 	if (!intel_engine_has_semaphores(engine))
463 		return false;
464 
465 	/*
466 	 * Check if the current priority hint merits a preemption attempt.
467 	 *
468 	 * We record the highest value priority we saw during rescheduling
469 	 * prior to this dequeue, therefore we know that if it is strictly
470 	 * less than the current tail of ESLP[0], we do not need to force
471 	 * a preempt-to-idle cycle.
472 	 *
473 	 * However, the priority hint is a mere hint that we may need to
474 	 * preempt. If that hint is stale or we may be trying to preempt
475 	 * ourselves, ignore the request.
476 	 *
477 	 * More naturally we would write
478 	 *      prio >= max(0, last);
479 	 * except that we wish to prevent triggering preemption at the same
480 	 * priority level: the task that is running should remain running
481 	 * to preserve FIFO ordering of dependencies.
482 	 */
483 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
484 	if (engine->execlists.queue_priority_hint <= last_prio)
485 		return false;
486 
487 	/*
488 	 * Check against the first request in ELSP[1], it will, thanks to the
489 	 * power of PI, be the highest priority of that context.
490 	 */
491 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
492 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
493 		return true;
494 
495 	if (rb) {
496 		struct virtual_engine *ve =
497 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
498 		bool preempt = false;
499 
500 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
501 			struct i915_request *next;
502 
503 			rcu_read_lock();
504 			next = READ_ONCE(ve->request);
505 			if (next)
506 				preempt = rq_prio(next) > last_prio;
507 			rcu_read_unlock();
508 		}
509 
510 		if (preempt)
511 			return preempt;
512 	}
513 
514 	/*
515 	 * If the inflight context did not trigger the preemption, then maybe
516 	 * it was the set of queued requests? Pick the highest priority in
517 	 * the queue (the first active priolist) and see if it deserves to be
518 	 * running instead of ELSP[0].
519 	 *
520 	 * The highest priority request in the queue can not be either
521 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
522 	 * context, it's priority would not exceed ELSP[0] aka last_prio.
523 	 */
524 	return queue_prio(&engine->execlists) > last_prio;
525 }
526 
527 __maybe_unused static inline bool
528 assert_priority_queue(const struct i915_request *prev,
529 		      const struct i915_request *next)
530 {
531 	/*
532 	 * Without preemption, the prev may refer to the still active element
533 	 * which we refuse to let go.
534 	 *
535 	 * Even with preemption, there are times when we think it is better not
536 	 * to preempt and leave an ostensibly lower priority request in flight.
537 	 */
538 	if (i915_request_is_active(prev))
539 		return true;
540 
541 	return rq_prio(prev) >= rq_prio(next);
542 }
543 
544 /*
545  * The context descriptor encodes various attributes of a context,
546  * including its GTT address and some flags. Because it's fairly
547  * expensive to calculate, we'll just do it once and cache the result,
548  * which remains valid until the context is unpinned.
549  *
550  * This is what a descriptor looks like, from LSB to MSB::
551  *
552  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
553  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
554  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
555  *      bits 53-54:    mbz, reserved for use by hardware
556  *      bits 55-63:    group ID, currently unused and set to 0
557  *
558  * Starting from Gen11, the upper dword of the descriptor has a new format:
559  *
560  *      bits 32-36:    reserved
561  *      bits 37-47:    SW context ID
562  *      bits 48:53:    engine instance
563  *      bit 54:        mbz, reserved for use by hardware
564  *      bits 55-60:    SW counter
565  *      bits 61-63:    engine class
566  *
567  * engine info, SW context ID and SW counter need to form a unique number
568  * (Context ID) per lrc.
569  */
570 static u32
571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
572 {
573 	u32 desc;
574 
575 	desc = INTEL_LEGACY_32B_CONTEXT;
576 	if (i915_vm_is_4lvl(ce->vm))
577 		desc = INTEL_LEGACY_64B_CONTEXT;
578 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
579 
580 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
581 	if (IS_GEN(engine->i915, 8))
582 		desc |= GEN8_CTX_L3LLC_COHERENT;
583 
584 	return i915_ggtt_offset(ce->state) | desc;
585 }
586 
587 static inline unsigned int dword_in_page(void *addr)
588 {
589 	return offset_in_page(addr) / sizeof(u32);
590 }
591 
592 static void set_offsets(u32 *regs,
593 			const u8 *data,
594 			const struct intel_engine_cs *engine,
595 			bool clear)
596 #define NOP(x) (BIT(7) | (x))
597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
598 #define POSTED BIT(0)
599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
600 #define REG16(x) \
601 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
602 	(((x) >> 2) & 0x7f)
603 #define END(total_state_size) 0, (total_state_size)
604 {
605 	const u32 base = engine->mmio_base;
606 
607 	while (*data) {
608 		u8 count, flags;
609 
610 		if (*data & BIT(7)) { /* skip */
611 			count = *data++ & ~BIT(7);
612 			if (clear)
613 				memset32(regs, MI_NOOP, count);
614 			regs += count;
615 			continue;
616 		}
617 
618 		count = *data & 0x3f;
619 		flags = *data >> 6;
620 		data++;
621 
622 		*regs = MI_LOAD_REGISTER_IMM(count);
623 		if (flags & POSTED)
624 			*regs |= MI_LRI_FORCE_POSTED;
625 		if (INTEL_GEN(engine->i915) >= 11)
626 			*regs |= MI_LRI_LRM_CS_MMIO;
627 		regs++;
628 
629 		GEM_BUG_ON(!count);
630 		do {
631 			u32 offset = 0;
632 			u8 v;
633 
634 			do {
635 				v = *data++;
636 				offset <<= 7;
637 				offset |= v & ~BIT(7);
638 			} while (v & BIT(7));
639 
640 			regs[0] = base + (offset << 2);
641 			if (clear)
642 				regs[1] = 0;
643 			regs += 2;
644 		} while (--count);
645 	}
646 
647 	if (clear) {
648 		u8 count = *++data;
649 
650 		/* Clear past the tail for HW access */
651 		GEM_BUG_ON(dword_in_page(regs) > count);
652 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
653 
654 		/* Close the batch; used mainly by live_lrc_layout() */
655 		*regs = MI_BATCH_BUFFER_END;
656 		if (INTEL_GEN(engine->i915) >= 10)
657 			*regs |= BIT(0);
658 	}
659 }
660 
661 static const u8 gen8_xcs_offsets[] = {
662 	NOP(1),
663 	LRI(11, 0),
664 	REG16(0x244),
665 	REG(0x034),
666 	REG(0x030),
667 	REG(0x038),
668 	REG(0x03c),
669 	REG(0x168),
670 	REG(0x140),
671 	REG(0x110),
672 	REG(0x11c),
673 	REG(0x114),
674 	REG(0x118),
675 
676 	NOP(9),
677 	LRI(9, 0),
678 	REG16(0x3a8),
679 	REG16(0x28c),
680 	REG16(0x288),
681 	REG16(0x284),
682 	REG16(0x280),
683 	REG16(0x27c),
684 	REG16(0x278),
685 	REG16(0x274),
686 	REG16(0x270),
687 
688 	NOP(13),
689 	LRI(2, 0),
690 	REG16(0x200),
691 	REG(0x028),
692 
693 	END(80)
694 };
695 
696 static const u8 gen9_xcs_offsets[] = {
697 	NOP(1),
698 	LRI(14, POSTED),
699 	REG16(0x244),
700 	REG(0x034),
701 	REG(0x030),
702 	REG(0x038),
703 	REG(0x03c),
704 	REG(0x168),
705 	REG(0x140),
706 	REG(0x110),
707 	REG(0x11c),
708 	REG(0x114),
709 	REG(0x118),
710 	REG(0x1c0),
711 	REG(0x1c4),
712 	REG(0x1c8),
713 
714 	NOP(3),
715 	LRI(9, POSTED),
716 	REG16(0x3a8),
717 	REG16(0x28c),
718 	REG16(0x288),
719 	REG16(0x284),
720 	REG16(0x280),
721 	REG16(0x27c),
722 	REG16(0x278),
723 	REG16(0x274),
724 	REG16(0x270),
725 
726 	NOP(13),
727 	LRI(1, POSTED),
728 	REG16(0x200),
729 
730 	NOP(13),
731 	LRI(44, POSTED),
732 	REG(0x028),
733 	REG(0x09c),
734 	REG(0x0c0),
735 	REG(0x178),
736 	REG(0x17c),
737 	REG16(0x358),
738 	REG(0x170),
739 	REG(0x150),
740 	REG(0x154),
741 	REG(0x158),
742 	REG16(0x41c),
743 	REG16(0x600),
744 	REG16(0x604),
745 	REG16(0x608),
746 	REG16(0x60c),
747 	REG16(0x610),
748 	REG16(0x614),
749 	REG16(0x618),
750 	REG16(0x61c),
751 	REG16(0x620),
752 	REG16(0x624),
753 	REG16(0x628),
754 	REG16(0x62c),
755 	REG16(0x630),
756 	REG16(0x634),
757 	REG16(0x638),
758 	REG16(0x63c),
759 	REG16(0x640),
760 	REG16(0x644),
761 	REG16(0x648),
762 	REG16(0x64c),
763 	REG16(0x650),
764 	REG16(0x654),
765 	REG16(0x658),
766 	REG16(0x65c),
767 	REG16(0x660),
768 	REG16(0x664),
769 	REG16(0x668),
770 	REG16(0x66c),
771 	REG16(0x670),
772 	REG16(0x674),
773 	REG16(0x678),
774 	REG16(0x67c),
775 	REG(0x068),
776 
777 	END(176)
778 };
779 
780 static const u8 gen12_xcs_offsets[] = {
781 	NOP(1),
782 	LRI(13, POSTED),
783 	REG16(0x244),
784 	REG(0x034),
785 	REG(0x030),
786 	REG(0x038),
787 	REG(0x03c),
788 	REG(0x168),
789 	REG(0x140),
790 	REG(0x110),
791 	REG(0x1c0),
792 	REG(0x1c4),
793 	REG(0x1c8),
794 	REG(0x180),
795 	REG16(0x2b4),
796 
797 	NOP(5),
798 	LRI(9, POSTED),
799 	REG16(0x3a8),
800 	REG16(0x28c),
801 	REG16(0x288),
802 	REG16(0x284),
803 	REG16(0x280),
804 	REG16(0x27c),
805 	REG16(0x278),
806 	REG16(0x274),
807 	REG16(0x270),
808 
809 	END(80)
810 };
811 
812 static const u8 gen8_rcs_offsets[] = {
813 	NOP(1),
814 	LRI(14, POSTED),
815 	REG16(0x244),
816 	REG(0x034),
817 	REG(0x030),
818 	REG(0x038),
819 	REG(0x03c),
820 	REG(0x168),
821 	REG(0x140),
822 	REG(0x110),
823 	REG(0x11c),
824 	REG(0x114),
825 	REG(0x118),
826 	REG(0x1c0),
827 	REG(0x1c4),
828 	REG(0x1c8),
829 
830 	NOP(3),
831 	LRI(9, POSTED),
832 	REG16(0x3a8),
833 	REG16(0x28c),
834 	REG16(0x288),
835 	REG16(0x284),
836 	REG16(0x280),
837 	REG16(0x27c),
838 	REG16(0x278),
839 	REG16(0x274),
840 	REG16(0x270),
841 
842 	NOP(13),
843 	LRI(1, 0),
844 	REG(0x0c8),
845 
846 	END(80)
847 };
848 
849 static const u8 gen9_rcs_offsets[] = {
850 	NOP(1),
851 	LRI(14, POSTED),
852 	REG16(0x244),
853 	REG(0x34),
854 	REG(0x30),
855 	REG(0x38),
856 	REG(0x3c),
857 	REG(0x168),
858 	REG(0x140),
859 	REG(0x110),
860 	REG(0x11c),
861 	REG(0x114),
862 	REG(0x118),
863 	REG(0x1c0),
864 	REG(0x1c4),
865 	REG(0x1c8),
866 
867 	NOP(3),
868 	LRI(9, POSTED),
869 	REG16(0x3a8),
870 	REG16(0x28c),
871 	REG16(0x288),
872 	REG16(0x284),
873 	REG16(0x280),
874 	REG16(0x27c),
875 	REG16(0x278),
876 	REG16(0x274),
877 	REG16(0x270),
878 
879 	NOP(13),
880 	LRI(1, 0),
881 	REG(0xc8),
882 
883 	NOP(13),
884 	LRI(44, POSTED),
885 	REG(0x28),
886 	REG(0x9c),
887 	REG(0xc0),
888 	REG(0x178),
889 	REG(0x17c),
890 	REG16(0x358),
891 	REG(0x170),
892 	REG(0x150),
893 	REG(0x154),
894 	REG(0x158),
895 	REG16(0x41c),
896 	REG16(0x600),
897 	REG16(0x604),
898 	REG16(0x608),
899 	REG16(0x60c),
900 	REG16(0x610),
901 	REG16(0x614),
902 	REG16(0x618),
903 	REG16(0x61c),
904 	REG16(0x620),
905 	REG16(0x624),
906 	REG16(0x628),
907 	REG16(0x62c),
908 	REG16(0x630),
909 	REG16(0x634),
910 	REG16(0x638),
911 	REG16(0x63c),
912 	REG16(0x640),
913 	REG16(0x644),
914 	REG16(0x648),
915 	REG16(0x64c),
916 	REG16(0x650),
917 	REG16(0x654),
918 	REG16(0x658),
919 	REG16(0x65c),
920 	REG16(0x660),
921 	REG16(0x664),
922 	REG16(0x668),
923 	REG16(0x66c),
924 	REG16(0x670),
925 	REG16(0x674),
926 	REG16(0x678),
927 	REG16(0x67c),
928 	REG(0x68),
929 
930 	END(176)
931 };
932 
933 static const u8 gen11_rcs_offsets[] = {
934 	NOP(1),
935 	LRI(15, POSTED),
936 	REG16(0x244),
937 	REG(0x034),
938 	REG(0x030),
939 	REG(0x038),
940 	REG(0x03c),
941 	REG(0x168),
942 	REG(0x140),
943 	REG(0x110),
944 	REG(0x11c),
945 	REG(0x114),
946 	REG(0x118),
947 	REG(0x1c0),
948 	REG(0x1c4),
949 	REG(0x1c8),
950 	REG(0x180),
951 
952 	NOP(1),
953 	LRI(9, POSTED),
954 	REG16(0x3a8),
955 	REG16(0x28c),
956 	REG16(0x288),
957 	REG16(0x284),
958 	REG16(0x280),
959 	REG16(0x27c),
960 	REG16(0x278),
961 	REG16(0x274),
962 	REG16(0x270),
963 
964 	LRI(1, POSTED),
965 	REG(0x1b0),
966 
967 	NOP(10),
968 	LRI(1, 0),
969 	REG(0x0c8),
970 
971 	END(80)
972 };
973 
974 static const u8 gen12_rcs_offsets[] = {
975 	NOP(1),
976 	LRI(13, POSTED),
977 	REG16(0x244),
978 	REG(0x034),
979 	REG(0x030),
980 	REG(0x038),
981 	REG(0x03c),
982 	REG(0x168),
983 	REG(0x140),
984 	REG(0x110),
985 	REG(0x1c0),
986 	REG(0x1c4),
987 	REG(0x1c8),
988 	REG(0x180),
989 	REG16(0x2b4),
990 
991 	NOP(5),
992 	LRI(9, POSTED),
993 	REG16(0x3a8),
994 	REG16(0x28c),
995 	REG16(0x288),
996 	REG16(0x284),
997 	REG16(0x280),
998 	REG16(0x27c),
999 	REG16(0x278),
1000 	REG16(0x274),
1001 	REG16(0x270),
1002 
1003 	LRI(3, POSTED),
1004 	REG(0x1b0),
1005 	REG16(0x5a8),
1006 	REG16(0x5ac),
1007 
1008 	NOP(6),
1009 	LRI(1, 0),
1010 	REG(0x0c8),
1011 	NOP(3 + 9 + 1),
1012 
1013 	LRI(51, POSTED),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG16(0x588),
1020 	REG(0x028),
1021 	REG(0x09c),
1022 	REG(0x0c0),
1023 	REG(0x178),
1024 	REG(0x17c),
1025 	REG16(0x358),
1026 	REG(0x170),
1027 	REG(0x150),
1028 	REG(0x154),
1029 	REG(0x158),
1030 	REG16(0x41c),
1031 	REG16(0x600),
1032 	REG16(0x604),
1033 	REG16(0x608),
1034 	REG16(0x60c),
1035 	REG16(0x610),
1036 	REG16(0x614),
1037 	REG16(0x618),
1038 	REG16(0x61c),
1039 	REG16(0x620),
1040 	REG16(0x624),
1041 	REG16(0x628),
1042 	REG16(0x62c),
1043 	REG16(0x630),
1044 	REG16(0x634),
1045 	REG16(0x638),
1046 	REG16(0x63c),
1047 	REG16(0x640),
1048 	REG16(0x644),
1049 	REG16(0x648),
1050 	REG16(0x64c),
1051 	REG16(0x650),
1052 	REG16(0x654),
1053 	REG16(0x658),
1054 	REG16(0x65c),
1055 	REG16(0x660),
1056 	REG16(0x664),
1057 	REG16(0x668),
1058 	REG16(0x66c),
1059 	REG16(0x670),
1060 	REG16(0x674),
1061 	REG16(0x678),
1062 	REG16(0x67c),
1063 	REG(0x068),
1064 	REG(0x084),
1065 	NOP(1),
1066 
1067 	END(192)
1068 };
1069 
1070 #undef END
1071 #undef REG16
1072 #undef REG
1073 #undef LRI
1074 #undef NOP
1075 
1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1077 {
1078 	/*
1079 	 * The gen12+ lists only have the registers we program in the basic
1080 	 * default state. We rely on the context image using relative
1081 	 * addressing to automatic fixup the register state between the
1082 	 * physical engines for virtual engine.
1083 	 */
1084 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1085 		   !intel_engine_has_relative_mmio(engine));
1086 
1087 	if (engine->class == RENDER_CLASS) {
1088 		if (INTEL_GEN(engine->i915) >= 12)
1089 			return gen12_rcs_offsets;
1090 		else if (INTEL_GEN(engine->i915) >= 11)
1091 			return gen11_rcs_offsets;
1092 		else if (INTEL_GEN(engine->i915) >= 9)
1093 			return gen9_rcs_offsets;
1094 		else
1095 			return gen8_rcs_offsets;
1096 	} else {
1097 		if (INTEL_GEN(engine->i915) >= 12)
1098 			return gen12_xcs_offsets;
1099 		else if (INTEL_GEN(engine->i915) >= 9)
1100 			return gen9_xcs_offsets;
1101 		else
1102 			return gen8_xcs_offsets;
1103 	}
1104 }
1105 
1106 static struct i915_request *
1107 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 {
1109 	struct i915_request *rq, *rn, *active = NULL;
1110 	struct list_head *pl;
1111 	int prio = I915_PRIORITY_INVALID;
1112 
1113 	lockdep_assert_held(&engine->active.lock);
1114 
1115 	list_for_each_entry_safe_reverse(rq, rn,
1116 					 &engine->active.requests,
1117 					 sched.link) {
1118 		if (i915_request_completed(rq))
1119 			continue; /* XXX */
1120 
1121 		__i915_request_unsubmit(rq);
1122 
1123 		/*
1124 		 * Push the request back into the queue for later resubmission.
1125 		 * If this request is not native to this physical engine (i.e.
1126 		 * it came from a virtual source), push it back onto the virtual
1127 		 * engine so that it can be moved across onto another physical
1128 		 * engine as load dictates.
1129 		 */
1130 		if (likely(rq->execution_mask == engine->mask)) {
1131 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1132 			if (rq_prio(rq) != prio) {
1133 				prio = rq_prio(rq);
1134 				pl = i915_sched_lookup_priolist(engine, prio);
1135 			}
1136 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137 
1138 			list_move(&rq->sched.link, pl);
1139 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140 
1141 			/* Check in case we rollback so far we wrap [size/2] */
1142 			if (intel_ring_direction(rq->ring,
1143 						 rq->tail,
1144 						 rq->ring->tail + 8) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			WRITE_ONCE(rq->engine, owner);
1152 			owner->submit_request(rq);
1153 			active = NULL;
1154 		}
1155 	}
1156 
1157 	return active;
1158 }
1159 
1160 struct i915_request *
1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1162 {
1163 	struct intel_engine_cs *engine =
1164 		container_of(execlists, typeof(*engine), execlists);
1165 
1166 	return __unwind_incomplete_requests(engine);
1167 }
1168 
1169 static inline void
1170 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1171 {
1172 	/*
1173 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1174 	 * The compiler should eliminate this function as dead-code.
1175 	 */
1176 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1177 		return;
1178 
1179 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1180 				   status, rq);
1181 }
1182 
1183 static void intel_engine_context_in(struct intel_engine_cs *engine)
1184 {
1185 	unsigned long flags;
1186 
1187 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1188 		return;
1189 
1190 	write_seqlock_irqsave(&engine->stats.lock, flags);
1191 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1192 		engine->stats.start = ktime_get();
1193 		atomic_inc(&engine->stats.active);
1194 	}
1195 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1196 }
1197 
1198 static void intel_engine_context_out(struct intel_engine_cs *engine)
1199 {
1200 	unsigned long flags;
1201 
1202 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1203 
1204 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1205 		return;
1206 
1207 	write_seqlock_irqsave(&engine->stats.lock, flags);
1208 	if (atomic_dec_and_test(&engine->stats.active)) {
1209 		engine->stats.total =
1210 			ktime_add(engine->stats.total,
1211 				  ktime_sub(ktime_get(), engine->stats.start));
1212 	}
1213 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1214 }
1215 
1216 static void
1217 execlists_check_context(const struct intel_context *ce,
1218 			const struct intel_engine_cs *engine)
1219 {
1220 	const struct intel_ring *ring = ce->ring;
1221 	u32 *regs = ce->lrc_reg_state;
1222 	bool valid = true;
1223 	int x;
1224 
1225 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1226 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1227 		       engine->name,
1228 		       regs[CTX_RING_START],
1229 		       i915_ggtt_offset(ring->vma));
1230 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1231 		valid = false;
1232 	}
1233 
1234 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1235 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1236 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1237 		       engine->name,
1238 		       regs[CTX_RING_CTL],
1239 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1240 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1241 		valid = false;
1242 	}
1243 
1244 	x = lrc_ring_mi_mode(engine);
1245 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1246 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1247 		       engine->name, regs[x + 1]);
1248 		regs[x + 1] &= ~STOP_RING;
1249 		regs[x + 1] |= STOP_RING << 16;
1250 		valid = false;
1251 	}
1252 
1253 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1254 }
1255 
1256 static void restore_default_state(struct intel_context *ce,
1257 				  struct intel_engine_cs *engine)
1258 {
1259 	u32 *regs;
1260 
1261 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1262 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1263 
1264 	ce->runtime.last = intel_context_get_runtime(ce);
1265 }
1266 
1267 static void reset_active(struct i915_request *rq,
1268 			 struct intel_engine_cs *engine)
1269 {
1270 	struct intel_context * const ce = rq->context;
1271 	u32 head;
1272 
1273 	/*
1274 	 * The executing context has been cancelled. We want to prevent
1275 	 * further execution along this context and propagate the error on
1276 	 * to anything depending on its results.
1277 	 *
1278 	 * In __i915_request_submit(), we apply the -EIO and remove the
1279 	 * requests' payloads for any banned requests. But first, we must
1280 	 * rewind the context back to the start of the incomplete request so
1281 	 * that we do not jump back into the middle of the batch.
1282 	 *
1283 	 * We preserve the breadcrumbs and semaphores of the incomplete
1284 	 * requests so that inter-timeline dependencies (i.e other timelines)
1285 	 * remain correctly ordered. And we defer to __i915_request_submit()
1286 	 * so that all asynchronous waits are correctly handled.
1287 	 */
1288 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1289 		     rq->fence.context, rq->fence.seqno);
1290 
1291 	/* On resubmission of the active request, payload will be scrubbed */
1292 	if (i915_request_completed(rq))
1293 		head = rq->tail;
1294 	else
1295 		head = active_request(ce->timeline, rq)->head;
1296 	head = intel_ring_wrap(ce->ring, head);
1297 
1298 	/* Scrub the context image to prevent replaying the previous batch */
1299 	restore_default_state(ce, engine);
1300 	__execlists_update_reg_state(ce, engine, head);
1301 
1302 	/* We've switched away, so this should be a no-op, but intent matters */
1303 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1304 }
1305 
1306 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1307 {
1308 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1309 	ce->runtime.num_underflow += dt < 0;
1310 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1311 #endif
1312 }
1313 
1314 static void intel_context_update_runtime(struct intel_context *ce)
1315 {
1316 	u32 old;
1317 	s32 dt;
1318 
1319 	if (intel_context_is_barrier(ce))
1320 		return;
1321 
1322 	old = ce->runtime.last;
1323 	ce->runtime.last = intel_context_get_runtime(ce);
1324 	dt = ce->runtime.last - old;
1325 
1326 	if (unlikely(dt <= 0)) {
1327 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1328 			 old, ce->runtime.last, dt);
1329 		st_update_runtime_underflow(ce, dt);
1330 		return;
1331 	}
1332 
1333 	ewma_runtime_add(&ce->runtime.avg, dt);
1334 	ce->runtime.total += dt;
1335 }
1336 
1337 static inline struct intel_engine_cs *
1338 __execlists_schedule_in(struct i915_request *rq)
1339 {
1340 	struct intel_engine_cs * const engine = rq->engine;
1341 	struct intel_context * const ce = rq->context;
1342 
1343 	intel_context_get(ce);
1344 
1345 	if (unlikely(intel_context_is_banned(ce)))
1346 		reset_active(rq, engine);
1347 
1348 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1349 		execlists_check_context(ce, engine);
1350 
1351 	if (ce->tag) {
1352 		/* Use a fixed tag for OA and friends */
1353 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1354 		ce->lrc.ccid = ce->tag;
1355 	} else {
1356 		/* We don't need a strict matching tag, just different values */
1357 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1358 
1359 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1360 		clear_bit(tag - 1, &engine->context_tag);
1361 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1362 
1363 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1364 	}
1365 
1366 	ce->lrc.ccid |= engine->execlists.ccid;
1367 
1368 	__intel_gt_pm_get(engine->gt);
1369 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1370 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1371 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1372 	intel_engine_context_in(engine);
1373 
1374 	return engine;
1375 }
1376 
1377 static inline struct i915_request *
1378 execlists_schedule_in(struct i915_request *rq, int idx)
1379 {
1380 	struct intel_context * const ce = rq->context;
1381 	struct intel_engine_cs *old;
1382 
1383 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1384 	trace_i915_request_in(rq, idx);
1385 
1386 	old = READ_ONCE(ce->inflight);
1387 	do {
1388 		if (!old) {
1389 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1390 			break;
1391 		}
1392 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1393 
1394 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1395 	return i915_request_get(rq);
1396 }
1397 
1398 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1399 {
1400 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1401 	struct i915_request *next = READ_ONCE(ve->request);
1402 
1403 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1404 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1405 }
1406 
1407 static inline void
1408 __execlists_schedule_out(struct i915_request *rq,
1409 			 struct intel_engine_cs * const engine,
1410 			 unsigned int ccid)
1411 {
1412 	struct intel_context * const ce = rq->context;
1413 
1414 	/*
1415 	 * NB process_csb() is not under the engine->active.lock and hence
1416 	 * schedule_out can race with schedule_in meaning that we should
1417 	 * refrain from doing non-trivial work here.
1418 	 */
1419 
1420 	/*
1421 	 * If we have just completed this context, the engine may now be
1422 	 * idle and we want to re-enter powersaving.
1423 	 */
1424 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1425 	    i915_request_completed(rq))
1426 		intel_engine_add_retire(engine, ce->timeline);
1427 
1428 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1429 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1430 	if (ccid < BITS_PER_LONG) {
1431 		GEM_BUG_ON(ccid == 0);
1432 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1433 		set_bit(ccid - 1, &engine->context_tag);
1434 	}
1435 
1436 	intel_context_update_runtime(ce);
1437 	intel_engine_context_out(engine);
1438 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1439 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1440 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1441 	intel_gt_pm_put_async(engine->gt);
1442 
1443 	/*
1444 	 * If this is part of a virtual engine, its next request may
1445 	 * have been blocked waiting for access to the active context.
1446 	 * We have to kick all the siblings again in case we need to
1447 	 * switch (e.g. the next request is not runnable on this
1448 	 * engine). Hopefully, we will already have submitted the next
1449 	 * request before the tasklet runs and do not need to rebuild
1450 	 * each virtual tree and kick everyone again.
1451 	 */
1452 	if (ce->engine != engine)
1453 		kick_siblings(rq, ce);
1454 
1455 	intel_context_put(ce);
1456 }
1457 
1458 static inline void
1459 execlists_schedule_out(struct i915_request *rq)
1460 {
1461 	struct intel_context * const ce = rq->context;
1462 	struct intel_engine_cs *cur, *old;
1463 	u32 ccid;
1464 
1465 	trace_i915_request_out(rq);
1466 
1467 	ccid = rq->context->lrc.ccid;
1468 	old = READ_ONCE(ce->inflight);
1469 	do
1470 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1471 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1472 	if (!cur)
1473 		__execlists_schedule_out(rq, old, ccid);
1474 
1475 	i915_request_put(rq);
1476 }
1477 
1478 static u64 execlists_update_context(struct i915_request *rq)
1479 {
1480 	struct intel_context *ce = rq->context;
1481 	u64 desc = ce->lrc.desc;
1482 	u32 tail, prev;
1483 
1484 	/*
1485 	 * WaIdleLiteRestore:bdw,skl
1486 	 *
1487 	 * We should never submit the context with the same RING_TAIL twice
1488 	 * just in case we submit an empty ring, which confuses the HW.
1489 	 *
1490 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1491 	 * the normal request to be able to always advance the RING_TAIL on
1492 	 * subsequent resubmissions (for lite restore). Should that fail us,
1493 	 * and we try and submit the same tail again, force the context
1494 	 * reload.
1495 	 *
1496 	 * If we need to return to a preempted context, we need to skip the
1497 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1498 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1499 	 * an earlier request.
1500 	 */
1501 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1502 	prev = rq->ring->tail;
1503 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1504 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1505 		desc |= CTX_DESC_FORCE_RESTORE;
1506 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1507 	rq->tail = rq->wa_tail;
1508 
1509 	/*
1510 	 * Make sure the context image is complete before we submit it to HW.
1511 	 *
1512 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1513 	 * an uncached write such as our mmio register access, the empirical
1514 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1515 	 * may not be visible to the HW prior to the completion of the UC
1516 	 * register write and that we may begin execution from the context
1517 	 * before its image is complete leading to invalid PD chasing.
1518 	 */
1519 	wmb();
1520 
1521 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1522 	return desc;
1523 }
1524 
1525 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1526 {
1527 	if (execlists->ctrl_reg) {
1528 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1529 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1530 	} else {
1531 		writel(upper_32_bits(desc), execlists->submit_reg);
1532 		writel(lower_32_bits(desc), execlists->submit_reg);
1533 	}
1534 }
1535 
1536 static __maybe_unused char *
1537 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1538 {
1539 	if (!rq)
1540 		return "";
1541 
1542 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1543 		 prefix,
1544 		 rq->context->lrc.ccid,
1545 		 rq->fence.context, rq->fence.seqno,
1546 		 i915_request_completed(rq) ? "!" :
1547 		 i915_request_started(rq) ? "*" :
1548 		 "",
1549 		 rq_prio(rq));
1550 
1551 	return buf;
1552 }
1553 
1554 static __maybe_unused void
1555 trace_ports(const struct intel_engine_execlists *execlists,
1556 	    const char *msg,
1557 	    struct i915_request * const *ports)
1558 {
1559 	const struct intel_engine_cs *engine =
1560 		container_of(execlists, typeof(*engine), execlists);
1561 	char __maybe_unused p0[40], p1[40];
1562 
1563 	if (!ports[0])
1564 		return;
1565 
1566 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1567 		     dump_port(p0, sizeof(p0), "", ports[0]),
1568 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1569 }
1570 
1571 static inline bool
1572 reset_in_progress(const struct intel_engine_execlists *execlists)
1573 {
1574 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1575 }
1576 
1577 static __maybe_unused bool
1578 assert_pending_valid(const struct intel_engine_execlists *execlists,
1579 		     const char *msg)
1580 {
1581 	struct intel_engine_cs *engine =
1582 		container_of(execlists, typeof(*engine), execlists);
1583 	struct i915_request * const *port, *rq;
1584 	struct intel_context *ce = NULL;
1585 	bool sentinel = false;
1586 	u32 ccid = -1;
1587 
1588 	trace_ports(execlists, msg, execlists->pending);
1589 
1590 	/* We may be messing around with the lists during reset, lalala */
1591 	if (reset_in_progress(execlists))
1592 		return true;
1593 
1594 	if (!execlists->pending[0]) {
1595 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1596 			      engine->name);
1597 		return false;
1598 	}
1599 
1600 	if (execlists->pending[execlists_num_ports(execlists)]) {
1601 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1602 			      engine->name, execlists_num_ports(execlists));
1603 		return false;
1604 	}
1605 
1606 	for (port = execlists->pending; (rq = *port); port++) {
1607 		unsigned long flags;
1608 		bool ok = true;
1609 
1610 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1611 		GEM_BUG_ON(!i915_request_is_active(rq));
1612 
1613 		if (ce == rq->context) {
1614 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1615 				      engine->name,
1616 				      ce->timeline->fence_context,
1617 				      port - execlists->pending);
1618 			return false;
1619 		}
1620 		ce = rq->context;
1621 
1622 		if (ccid == ce->lrc.ccid) {
1623 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1624 				      engine->name,
1625 				      ccid, ce->timeline->fence_context,
1626 				      port - execlists->pending);
1627 			return false;
1628 		}
1629 		ccid = ce->lrc.ccid;
1630 
1631 		/*
1632 		 * Sentinels are supposed to be the last request so they flush
1633 		 * the current execution off the HW. Check that they are the only
1634 		 * request in the pending submission.
1635 		 */
1636 		if (sentinel) {
1637 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1638 				      engine->name,
1639 				      ce->timeline->fence_context,
1640 				      port - execlists->pending);
1641 			return false;
1642 		}
1643 		sentinel = i915_request_has_sentinel(rq);
1644 
1645 		/* Hold tightly onto the lock to prevent concurrent retires! */
1646 		if (!spin_trylock_irqsave(&rq->lock, flags))
1647 			continue;
1648 
1649 		if (i915_request_completed(rq))
1650 			goto unlock;
1651 
1652 		if (i915_active_is_idle(&ce->active) &&
1653 		    !intel_context_is_barrier(ce)) {
1654 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1655 				      engine->name,
1656 				      ce->timeline->fence_context,
1657 				      port - execlists->pending);
1658 			ok = false;
1659 			goto unlock;
1660 		}
1661 
1662 		if (!i915_vma_is_pinned(ce->state)) {
1663 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1664 				      engine->name,
1665 				      ce->timeline->fence_context,
1666 				      port - execlists->pending);
1667 			ok = false;
1668 			goto unlock;
1669 		}
1670 
1671 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1672 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1673 				      engine->name,
1674 				      ce->timeline->fence_context,
1675 				      port - execlists->pending);
1676 			ok = false;
1677 			goto unlock;
1678 		}
1679 
1680 unlock:
1681 		spin_unlock_irqrestore(&rq->lock, flags);
1682 		if (!ok)
1683 			return false;
1684 	}
1685 
1686 	return ce;
1687 }
1688 
1689 static void execlists_submit_ports(struct intel_engine_cs *engine)
1690 {
1691 	struct intel_engine_execlists *execlists = &engine->execlists;
1692 	unsigned int n;
1693 
1694 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1695 
1696 	/*
1697 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1698 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1699 	 * not be relinquished until the device is idle (see
1700 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1701 	 * that all ELSP are drained i.e. we have processed the CSB,
1702 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1703 	 */
1704 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1705 
1706 	/*
1707 	 * ELSQ note: the submit queue is not cleared after being submitted
1708 	 * to the HW so we need to make sure we always clean it up. This is
1709 	 * currently ensured by the fact that we always write the same number
1710 	 * of elsq entries, keep this in mind before changing the loop below.
1711 	 */
1712 	for (n = execlists_num_ports(execlists); n--; ) {
1713 		struct i915_request *rq = execlists->pending[n];
1714 
1715 		write_desc(execlists,
1716 			   rq ? execlists_update_context(rq) : 0,
1717 			   n);
1718 	}
1719 
1720 	/* we need to manually load the submit queue */
1721 	if (execlists->ctrl_reg)
1722 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1723 }
1724 
1725 static bool ctx_single_port_submission(const struct intel_context *ce)
1726 {
1727 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1728 		intel_context_force_single_submission(ce));
1729 }
1730 
1731 static bool can_merge_ctx(const struct intel_context *prev,
1732 			  const struct intel_context *next)
1733 {
1734 	if (prev != next)
1735 		return false;
1736 
1737 	if (ctx_single_port_submission(prev))
1738 		return false;
1739 
1740 	return true;
1741 }
1742 
1743 static unsigned long i915_request_flags(const struct i915_request *rq)
1744 {
1745 	return READ_ONCE(rq->fence.flags);
1746 }
1747 
1748 static bool can_merge_rq(const struct i915_request *prev,
1749 			 const struct i915_request *next)
1750 {
1751 	GEM_BUG_ON(prev == next);
1752 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1753 
1754 	/*
1755 	 * We do not submit known completed requests. Therefore if the next
1756 	 * request is already completed, we can pretend to merge it in
1757 	 * with the previous context (and we will skip updating the ELSP
1758 	 * and tracking). Thus hopefully keeping the ELSP full with active
1759 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1760 	 * us.
1761 	 */
1762 	if (i915_request_completed(next))
1763 		return true;
1764 
1765 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1766 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1767 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1768 		return false;
1769 
1770 	if (!can_merge_ctx(prev->context, next->context))
1771 		return false;
1772 
1773 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1774 	return true;
1775 }
1776 
1777 static void virtual_update_register_offsets(u32 *regs,
1778 					    struct intel_engine_cs *engine)
1779 {
1780 	set_offsets(regs, reg_offsets(engine), engine, false);
1781 }
1782 
1783 static bool virtual_matches(const struct virtual_engine *ve,
1784 			    const struct i915_request *rq,
1785 			    const struct intel_engine_cs *engine)
1786 {
1787 	const struct intel_engine_cs *inflight;
1788 
1789 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1790 		return false;
1791 
1792 	/*
1793 	 * We track when the HW has completed saving the context image
1794 	 * (i.e. when we have seen the final CS event switching out of
1795 	 * the context) and must not overwrite the context image before
1796 	 * then. This restricts us to only using the active engine
1797 	 * while the previous virtualized request is inflight (so
1798 	 * we reuse the register offsets). This is a very small
1799 	 * hystersis on the greedy seelction algorithm.
1800 	 */
1801 	inflight = intel_context_inflight(&ve->context);
1802 	if (inflight && inflight != engine)
1803 		return false;
1804 
1805 	return true;
1806 }
1807 
1808 static void virtual_xfer_context(struct virtual_engine *ve,
1809 				 struct intel_engine_cs *engine)
1810 {
1811 	unsigned int n;
1812 
1813 	if (likely(engine == ve->siblings[0]))
1814 		return;
1815 
1816 	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1817 	if (!intel_engine_has_relative_mmio(engine))
1818 		virtual_update_register_offsets(ve->context.lrc_reg_state,
1819 						engine);
1820 
1821 	/*
1822 	 * Move the bound engine to the top of the list for
1823 	 * future execution. We then kick this tasklet first
1824 	 * before checking others, so that we preferentially
1825 	 * reuse this set of bound registers.
1826 	 */
1827 	for (n = 1; n < ve->num_siblings; n++) {
1828 		if (ve->siblings[n] == engine) {
1829 			swap(ve->siblings[n], ve->siblings[0]);
1830 			break;
1831 		}
1832 	}
1833 }
1834 
1835 #define for_each_waiter(p__, rq__) \
1836 	list_for_each_entry_lockless(p__, \
1837 				     &(rq__)->sched.waiters_list, \
1838 				     wait_link)
1839 
1840 #define for_each_signaler(p__, rq__) \
1841 	list_for_each_entry_rcu(p__, \
1842 				&(rq__)->sched.signalers_list, \
1843 				signal_link)
1844 
1845 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1846 {
1847 	LIST_HEAD(list);
1848 
1849 	/*
1850 	 * We want to move the interrupted request to the back of
1851 	 * the round-robin list (i.e. its priority level), but
1852 	 * in doing so, we must then move all requests that were in
1853 	 * flight and were waiting for the interrupted request to
1854 	 * be run after it again.
1855 	 */
1856 	do {
1857 		struct i915_dependency *p;
1858 
1859 		GEM_BUG_ON(i915_request_is_active(rq));
1860 		list_move_tail(&rq->sched.link, pl);
1861 
1862 		for_each_waiter(p, rq) {
1863 			struct i915_request *w =
1864 				container_of(p->waiter, typeof(*w), sched);
1865 
1866 			if (p->flags & I915_DEPENDENCY_WEAK)
1867 				continue;
1868 
1869 			/* Leave semaphores spinning on the other engines */
1870 			if (w->engine != rq->engine)
1871 				continue;
1872 
1873 			/* No waiter should start before its signaler */
1874 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1875 				   i915_request_started(w) &&
1876 				   !i915_request_completed(rq));
1877 
1878 			GEM_BUG_ON(i915_request_is_active(w));
1879 			if (!i915_request_is_ready(w))
1880 				continue;
1881 
1882 			if (rq_prio(w) < rq_prio(rq))
1883 				continue;
1884 
1885 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1886 			list_move_tail(&w->sched.link, &list);
1887 		}
1888 
1889 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1890 	} while (rq);
1891 }
1892 
1893 static void defer_active(struct intel_engine_cs *engine)
1894 {
1895 	struct i915_request *rq;
1896 
1897 	rq = __unwind_incomplete_requests(engine);
1898 	if (!rq)
1899 		return;
1900 
1901 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1902 }
1903 
1904 static bool
1905 need_timeslice(const struct intel_engine_cs *engine,
1906 	       const struct i915_request *rq,
1907 	       const struct rb_node *rb)
1908 {
1909 	int hint;
1910 
1911 	if (!intel_engine_has_timeslices(engine))
1912 		return false;
1913 
1914 	hint = engine->execlists.queue_priority_hint;
1915 
1916 	if (rb) {
1917 		const struct virtual_engine *ve =
1918 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1919 		const struct intel_engine_cs *inflight =
1920 			intel_context_inflight(&ve->context);
1921 
1922 		if (!inflight || inflight == engine) {
1923 			struct i915_request *next;
1924 
1925 			rcu_read_lock();
1926 			next = READ_ONCE(ve->request);
1927 			if (next)
1928 				hint = max(hint, rq_prio(next));
1929 			rcu_read_unlock();
1930 		}
1931 	}
1932 
1933 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1934 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1935 
1936 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1937 	return hint >= effective_prio(rq);
1938 }
1939 
1940 static bool
1941 timeslice_yield(const struct intel_engine_execlists *el,
1942 		const struct i915_request *rq)
1943 {
1944 	/*
1945 	 * Once bitten, forever smitten!
1946 	 *
1947 	 * If the active context ever busy-waited on a semaphore,
1948 	 * it will be treated as a hog until the end of its timeslice (i.e.
1949 	 * until it is scheduled out and replaced by a new submission,
1950 	 * possibly even its own lite-restore). The HW only sends an interrupt
1951 	 * on the first miss, and we do know if that semaphore has been
1952 	 * signaled, or even if it is now stuck on another semaphore. Play
1953 	 * safe, yield if it might be stuck -- it will be given a fresh
1954 	 * timeslice in the near future.
1955 	 */
1956 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1957 }
1958 
1959 static bool
1960 timeslice_expired(const struct intel_engine_execlists *el,
1961 		  const struct i915_request *rq)
1962 {
1963 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1964 }
1965 
1966 static int
1967 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1968 {
1969 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1970 		return engine->execlists.queue_priority_hint;
1971 
1972 	return rq_prio(list_next_entry(rq, sched.link));
1973 }
1974 
1975 static inline unsigned long
1976 timeslice(const struct intel_engine_cs *engine)
1977 {
1978 	return READ_ONCE(engine->props.timeslice_duration_ms);
1979 }
1980 
1981 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1982 {
1983 	const struct intel_engine_execlists *execlists = &engine->execlists;
1984 	const struct i915_request *rq = *execlists->active;
1985 
1986 	if (!rq || i915_request_completed(rq))
1987 		return 0;
1988 
1989 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1990 		return 0;
1991 
1992 	return timeslice(engine);
1993 }
1994 
1995 static void set_timeslice(struct intel_engine_cs *engine)
1996 {
1997 	unsigned long duration;
1998 
1999 	if (!intel_engine_has_timeslices(engine))
2000 		return;
2001 
2002 	duration = active_timeslice(engine);
2003 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2004 
2005 	set_timer_ms(&engine->execlists.timer, duration);
2006 }
2007 
2008 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2009 {
2010 	struct intel_engine_execlists *execlists = &engine->execlists;
2011 	unsigned long duration;
2012 
2013 	if (!intel_engine_has_timeslices(engine))
2014 		return;
2015 
2016 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2017 	if (prio == INT_MIN)
2018 		return;
2019 
2020 	if (timer_pending(&execlists->timer))
2021 		return;
2022 
2023 	duration = timeslice(engine);
2024 	ENGINE_TRACE(engine,
2025 		     "start timeslicing, prio:%d, interval:%lu",
2026 		     prio, duration);
2027 
2028 	set_timer_ms(&execlists->timer, duration);
2029 }
2030 
2031 static void record_preemption(struct intel_engine_execlists *execlists)
2032 {
2033 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2034 }
2035 
2036 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2037 					    const struct i915_request *rq)
2038 {
2039 	if (!rq)
2040 		return 0;
2041 
2042 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2043 	if (unlikely(intel_context_is_banned(rq->context)))
2044 		return 1;
2045 
2046 	return READ_ONCE(engine->props.preempt_timeout_ms);
2047 }
2048 
2049 static void set_preempt_timeout(struct intel_engine_cs *engine,
2050 				const struct i915_request *rq)
2051 {
2052 	if (!intel_engine_has_preempt_reset(engine))
2053 		return;
2054 
2055 	set_timer_ms(&engine->execlists.preempt,
2056 		     active_preempt_timeout(engine, rq));
2057 }
2058 
2059 static inline void clear_ports(struct i915_request **ports, int count)
2060 {
2061 	memset_p((void **)ports, NULL, count);
2062 }
2063 
2064 static inline void
2065 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2066 {
2067 	/* A memcpy_p() would be very useful here! */
2068 	while (count--)
2069 		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2070 }
2071 
2072 static void execlists_dequeue(struct intel_engine_cs *engine)
2073 {
2074 	struct intel_engine_execlists * const execlists = &engine->execlists;
2075 	struct i915_request **port = execlists->pending;
2076 	struct i915_request ** const last_port = port + execlists->port_mask;
2077 	struct i915_request * const *active;
2078 	struct i915_request *last;
2079 	struct rb_node *rb;
2080 	bool submit = false;
2081 
2082 	/*
2083 	 * Hardware submission is through 2 ports. Conceptually each port
2084 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2085 	 * static for a context, and unique to each, so we only execute
2086 	 * requests belonging to a single context from each ring. RING_HEAD
2087 	 * is maintained by the CS in the context image, it marks the place
2088 	 * where it got up to last time, and through RING_TAIL we tell the CS
2089 	 * where we want to execute up to this time.
2090 	 *
2091 	 * In this list the requests are in order of execution. Consecutive
2092 	 * requests from the same context are adjacent in the ringbuffer. We
2093 	 * can combine these requests into a single RING_TAIL update:
2094 	 *
2095 	 *              RING_HEAD...req1...req2
2096 	 *                                    ^- RING_TAIL
2097 	 * since to execute req2 the CS must first execute req1.
2098 	 *
2099 	 * Our goal then is to point each port to the end of a consecutive
2100 	 * sequence of requests as being the most optimal (fewest wake ups
2101 	 * and context switches) submission.
2102 	 */
2103 
2104 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2105 		struct virtual_engine *ve =
2106 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2107 		struct i915_request *rq = READ_ONCE(ve->request);
2108 
2109 		if (!rq) { /* lazily cleanup after another engine handled rq */
2110 			rb_erase_cached(rb, &execlists->virtual);
2111 			RB_CLEAR_NODE(rb);
2112 			rb = rb_first_cached(&execlists->virtual);
2113 			continue;
2114 		}
2115 
2116 		if (!virtual_matches(ve, rq, engine)) {
2117 			rb = rb_next(rb);
2118 			continue;
2119 		}
2120 
2121 		break;
2122 	}
2123 
2124 	/*
2125 	 * If the queue is higher priority than the last
2126 	 * request in the currently active context, submit afresh.
2127 	 * We will resubmit again afterwards in case we need to split
2128 	 * the active context to interject the preemption request,
2129 	 * i.e. we will retrigger preemption following the ack in case
2130 	 * of trouble.
2131 	 */
2132 	active = READ_ONCE(execlists->active);
2133 
2134 	/*
2135 	 * In theory we can skip over completed contexts that have not
2136 	 * yet been processed by events (as those events are in flight):
2137 	 *
2138 	 * while ((last = *active) && i915_request_completed(last))
2139 	 *	active++;
2140 	 *
2141 	 * However, the GPU cannot handle this as it will ultimately
2142 	 * find itself trying to jump back into a context it has just
2143 	 * completed and barf.
2144 	 */
2145 
2146 	if ((last = *active)) {
2147 		if (need_preempt(engine, last, rb)) {
2148 			if (i915_request_completed(last)) {
2149 				tasklet_hi_schedule(&execlists->tasklet);
2150 				return;
2151 			}
2152 
2153 			ENGINE_TRACE(engine,
2154 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2155 				     last->fence.context,
2156 				     last->fence.seqno,
2157 				     last->sched.attr.priority,
2158 				     execlists->queue_priority_hint);
2159 			record_preemption(execlists);
2160 
2161 			/*
2162 			 * Don't let the RING_HEAD advance past the breadcrumb
2163 			 * as we unwind (and until we resubmit) so that we do
2164 			 * not accidentally tell it to go backwards.
2165 			 */
2166 			ring_set_paused(engine, 1);
2167 
2168 			/*
2169 			 * Note that we have not stopped the GPU at this point,
2170 			 * so we are unwinding the incomplete requests as they
2171 			 * remain inflight and so by the time we do complete
2172 			 * the preemption, some of the unwound requests may
2173 			 * complete!
2174 			 */
2175 			__unwind_incomplete_requests(engine);
2176 
2177 			last = NULL;
2178 		} else if (need_timeslice(engine, last, rb) &&
2179 			   timeslice_expired(execlists, last)) {
2180 			if (i915_request_completed(last)) {
2181 				tasklet_hi_schedule(&execlists->tasklet);
2182 				return;
2183 			}
2184 
2185 			ENGINE_TRACE(engine,
2186 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2187 				     last->fence.context,
2188 				     last->fence.seqno,
2189 				     last->sched.attr.priority,
2190 				     execlists->queue_priority_hint,
2191 				     yesno(timeslice_yield(execlists, last)));
2192 
2193 			ring_set_paused(engine, 1);
2194 			defer_active(engine);
2195 
2196 			/*
2197 			 * Unlike for preemption, if we rewind and continue
2198 			 * executing the same context as previously active,
2199 			 * the order of execution will remain the same and
2200 			 * the tail will only advance. We do not need to
2201 			 * force a full context restore, as a lite-restore
2202 			 * is sufficient to resample the monotonic TAIL.
2203 			 *
2204 			 * If we switch to any other context, similarly we
2205 			 * will not rewind TAIL of current context, and
2206 			 * normal save/restore will preserve state and allow
2207 			 * us to later continue executing the same request.
2208 			 */
2209 			last = NULL;
2210 		} else {
2211 			/*
2212 			 * Otherwise if we already have a request pending
2213 			 * for execution after the current one, we can
2214 			 * just wait until the next CS event before
2215 			 * queuing more. In either case we will force a
2216 			 * lite-restore preemption event, but if we wait
2217 			 * we hopefully coalesce several updates into a single
2218 			 * submission.
2219 			 */
2220 			if (!list_is_last(&last->sched.link,
2221 					  &engine->active.requests)) {
2222 				/*
2223 				 * Even if ELSP[1] is occupied and not worthy
2224 				 * of timeslices, our queue might be.
2225 				 */
2226 				start_timeslice(engine, queue_prio(execlists));
2227 				return;
2228 			}
2229 		}
2230 	}
2231 
2232 	while (rb) { /* XXX virtual is always taking precedence */
2233 		struct virtual_engine *ve =
2234 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2235 		struct i915_request *rq;
2236 
2237 		spin_lock(&ve->base.active.lock);
2238 
2239 		rq = ve->request;
2240 		if (unlikely(!rq)) { /* lost the race to a sibling */
2241 			spin_unlock(&ve->base.active.lock);
2242 			rb_erase_cached(rb, &execlists->virtual);
2243 			RB_CLEAR_NODE(rb);
2244 			rb = rb_first_cached(&execlists->virtual);
2245 			continue;
2246 		}
2247 
2248 		GEM_BUG_ON(rq != ve->request);
2249 		GEM_BUG_ON(rq->engine != &ve->base);
2250 		GEM_BUG_ON(rq->context != &ve->context);
2251 
2252 		if (rq_prio(rq) >= queue_prio(execlists)) {
2253 			if (!virtual_matches(ve, rq, engine)) {
2254 				spin_unlock(&ve->base.active.lock);
2255 				rb = rb_next(rb);
2256 				continue;
2257 			}
2258 
2259 			if (last && !can_merge_rq(last, rq)) {
2260 				spin_unlock(&ve->base.active.lock);
2261 				start_timeslice(engine, rq_prio(rq));
2262 				return; /* leave this for another sibling */
2263 			}
2264 
2265 			ENGINE_TRACE(engine,
2266 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2267 				     rq->fence.context,
2268 				     rq->fence.seqno,
2269 				     i915_request_completed(rq) ? "!" :
2270 				     i915_request_started(rq) ? "*" :
2271 				     "",
2272 				     yesno(engine != ve->siblings[0]));
2273 
2274 			WRITE_ONCE(ve->request, NULL);
2275 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2276 				   INT_MIN);
2277 			rb_erase_cached(rb, &execlists->virtual);
2278 			RB_CLEAR_NODE(rb);
2279 
2280 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2281 			WRITE_ONCE(rq->engine, engine);
2282 
2283 			if (__i915_request_submit(rq)) {
2284 				/*
2285 				 * Only after we confirm that we will submit
2286 				 * this request (i.e. it has not already
2287 				 * completed), do we want to update the context.
2288 				 *
2289 				 * This serves two purposes. It avoids
2290 				 * unnecessary work if we are resubmitting an
2291 				 * already completed request after timeslicing.
2292 				 * But more importantly, it prevents us altering
2293 				 * ve->siblings[] on an idle context, where
2294 				 * we may be using ve->siblings[] in
2295 				 * virtual_context_enter / virtual_context_exit.
2296 				 */
2297 				virtual_xfer_context(ve, engine);
2298 				GEM_BUG_ON(ve->siblings[0] != engine);
2299 
2300 				submit = true;
2301 				last = rq;
2302 			}
2303 			i915_request_put(rq);
2304 
2305 			/*
2306 			 * Hmm, we have a bunch of virtual engine requests,
2307 			 * but the first one was already completed (thanks
2308 			 * preempt-to-busy!). Keep looking at the veng queue
2309 			 * until we have no more relevant requests (i.e.
2310 			 * the normal submit queue has higher priority).
2311 			 */
2312 			if (!submit) {
2313 				spin_unlock(&ve->base.active.lock);
2314 				rb = rb_first_cached(&execlists->virtual);
2315 				continue;
2316 			}
2317 		}
2318 
2319 		spin_unlock(&ve->base.active.lock);
2320 		break;
2321 	}
2322 
2323 	while ((rb = rb_first_cached(&execlists->queue))) {
2324 		struct i915_priolist *p = to_priolist(rb);
2325 		struct i915_request *rq, *rn;
2326 		int i;
2327 
2328 		priolist_for_each_request_consume(rq, rn, p, i) {
2329 			bool merge = true;
2330 
2331 			/*
2332 			 * Can we combine this request with the current port?
2333 			 * It has to be the same context/ringbuffer and not
2334 			 * have any exceptions (e.g. GVT saying never to
2335 			 * combine contexts).
2336 			 *
2337 			 * If we can combine the requests, we can execute both
2338 			 * by updating the RING_TAIL to point to the end of the
2339 			 * second request, and so we never need to tell the
2340 			 * hardware about the first.
2341 			 */
2342 			if (last && !can_merge_rq(last, rq)) {
2343 				/*
2344 				 * If we are on the second port and cannot
2345 				 * combine this request with the last, then we
2346 				 * are done.
2347 				 */
2348 				if (port == last_port)
2349 					goto done;
2350 
2351 				/*
2352 				 * We must not populate both ELSP[] with the
2353 				 * same LRCA, i.e. we must submit 2 different
2354 				 * contexts if we submit 2 ELSP.
2355 				 */
2356 				if (last->context == rq->context)
2357 					goto done;
2358 
2359 				if (i915_request_has_sentinel(last))
2360 					goto done;
2361 
2362 				/*
2363 				 * If GVT overrides us we only ever submit
2364 				 * port[0], leaving port[1] empty. Note that we
2365 				 * also have to be careful that we don't queue
2366 				 * the same context (even though a different
2367 				 * request) to the second port.
2368 				 */
2369 				if (ctx_single_port_submission(last->context) ||
2370 				    ctx_single_port_submission(rq->context))
2371 					goto done;
2372 
2373 				merge = false;
2374 			}
2375 
2376 			if (__i915_request_submit(rq)) {
2377 				if (!merge) {
2378 					*port = execlists_schedule_in(last, port - execlists->pending);
2379 					port++;
2380 					last = NULL;
2381 				}
2382 
2383 				GEM_BUG_ON(last &&
2384 					   !can_merge_ctx(last->context,
2385 							  rq->context));
2386 				GEM_BUG_ON(last &&
2387 					   i915_seqno_passed(last->fence.seqno,
2388 							     rq->fence.seqno));
2389 
2390 				submit = true;
2391 				last = rq;
2392 			}
2393 		}
2394 
2395 		rb_erase_cached(&p->node, &execlists->queue);
2396 		i915_priolist_free(p);
2397 	}
2398 
2399 done:
2400 	/*
2401 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2402 	 *
2403 	 * We choose the priority hint such that if we add a request of greater
2404 	 * priority than this, we kick the submission tasklet to decide on
2405 	 * the right order of submitting the requests to hardware. We must
2406 	 * also be prepared to reorder requests as they are in-flight on the
2407 	 * HW. We derive the priority hint then as the first "hole" in
2408 	 * the HW submission ports and if there are no available slots,
2409 	 * the priority of the lowest executing request, i.e. last.
2410 	 *
2411 	 * When we do receive a higher priority request ready to run from the
2412 	 * user, see queue_request(), the priority hint is bumped to that
2413 	 * request triggering preemption on the next dequeue (or subsequent
2414 	 * interrupt for secondary ports).
2415 	 */
2416 	execlists->queue_priority_hint = queue_prio(execlists);
2417 
2418 	if (submit) {
2419 		*port = execlists_schedule_in(last, port - execlists->pending);
2420 		execlists->switch_priority_hint =
2421 			switch_prio(engine, *execlists->pending);
2422 
2423 		/*
2424 		 * Skip if we ended up with exactly the same set of requests,
2425 		 * e.g. trying to timeslice a pair of ordered contexts
2426 		 */
2427 		if (!memcmp(active, execlists->pending,
2428 			    (port - execlists->pending + 1) * sizeof(*port))) {
2429 			do
2430 				execlists_schedule_out(fetch_and_zero(port));
2431 			while (port-- != execlists->pending);
2432 
2433 			goto skip_submit;
2434 		}
2435 		clear_ports(port + 1, last_port - port);
2436 
2437 		WRITE_ONCE(execlists->yield, -1);
2438 		set_preempt_timeout(engine, *active);
2439 		execlists_submit_ports(engine);
2440 	} else {
2441 		start_timeslice(engine, execlists->queue_priority_hint);
2442 skip_submit:
2443 		ring_set_paused(engine, 0);
2444 	}
2445 }
2446 
2447 static void
2448 cancel_port_requests(struct intel_engine_execlists * const execlists)
2449 {
2450 	struct i915_request * const *port;
2451 
2452 	for (port = execlists->pending; *port; port++)
2453 		execlists_schedule_out(*port);
2454 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2455 
2456 	/* Mark the end of active before we overwrite *active */
2457 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2458 		execlists_schedule_out(*port);
2459 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2460 
2461 	smp_wmb(); /* complete the seqlock for execlists_active() */
2462 	WRITE_ONCE(execlists->active, execlists->inflight);
2463 }
2464 
2465 static inline void
2466 invalidate_csb_entries(const u64 *first, const u64 *last)
2467 {
2468 	clflush((void *)first);
2469 	clflush((void *)last);
2470 }
2471 
2472 /*
2473  * Starting with Gen12, the status has a new format:
2474  *
2475  *     bit  0:     switched to new queue
2476  *     bit  1:     reserved
2477  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2478  *                 switch detail is set to "wait on semaphore"
2479  *     bits 3-5:   engine class
2480  *     bits 6-11:  engine instance
2481  *     bits 12-14: reserved
2482  *     bits 15-25: sw context id of the lrc the GT switched to
2483  *     bits 26-31: sw counter of the lrc the GT switched to
2484  *     bits 32-35: context switch detail
2485  *                  - 0: ctx complete
2486  *                  - 1: wait on sync flip
2487  *                  - 2: wait on vblank
2488  *                  - 3: wait on scanline
2489  *                  - 4: wait on semaphore
2490  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2491  *                       WAIT_FOR_EVENT)
2492  *     bit  36:    reserved
2493  *     bits 37-43: wait detail (for switch detail 1 to 4)
2494  *     bits 44-46: reserved
2495  *     bits 47-57: sw context id of the lrc the GT switched away from
2496  *     bits 58-63: sw counter of the lrc the GT switched away from
2497  */
2498 static inline bool gen12_csb_parse(const u64 *csb)
2499 {
2500 	bool ctx_away_valid;
2501 	bool new_queue;
2502 	u64 entry;
2503 
2504 	/* HSD#22011248461 */
2505 	entry = READ_ONCE(*csb);
2506 	if (unlikely(entry == -1)) {
2507 		preempt_disable();
2508 		if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
2509 			GEM_WARN_ON("50us CSB timeout");
2510 		preempt_enable();
2511 	}
2512 	WRITE_ONCE(*(u64 *)csb, -1);
2513 
2514 	ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
2515 	new_queue =
2516 		lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2517 
2518 	/*
2519 	 * The context switch detail is not guaranteed to be 5 when a preemption
2520 	 * occurs, so we can't just check for that. The check below works for
2521 	 * all the cases we care about, including preemptions of WAIT
2522 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2523 	 * would require some extra handling, but we don't support that.
2524 	 */
2525 	if (!ctx_away_valid || new_queue) {
2526 		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2527 		return true;
2528 	}
2529 
2530 	/*
2531 	 * switch detail = 5 is covered by the case above and we do not expect a
2532 	 * context switch on an unsuccessful wait instruction since we always
2533 	 * use polling mode.
2534 	 */
2535 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2536 	return false;
2537 }
2538 
2539 static inline bool gen8_csb_parse(const u64 *csb)
2540 {
2541 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2542 }
2543 
2544 static void process_csb(struct intel_engine_cs *engine)
2545 {
2546 	struct intel_engine_execlists * const execlists = &engine->execlists;
2547 	const u64 * const buf = execlists->csb_status;
2548 	const u8 num_entries = execlists->csb_size;
2549 	u8 head, tail;
2550 
2551 	/*
2552 	 * As we modify our execlists state tracking we require exclusive
2553 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2554 	 * and we assume that is only inside the reset paths and so serialised.
2555 	 */
2556 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2557 		   !reset_in_progress(execlists));
2558 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2559 
2560 	/*
2561 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2562 	 * When reading from the csb_write mmio register, we have to be
2563 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2564 	 * the low 4bits. As it happens we know the next 4bits are always
2565 	 * zero and so we can simply masked off the low u8 of the register
2566 	 * and treat it identically to reading from the HWSP (without having
2567 	 * to use explicit shifting and masking, and probably bifurcating
2568 	 * the code to handle the legacy mmio read).
2569 	 */
2570 	head = execlists->csb_head;
2571 	tail = READ_ONCE(*execlists->csb_write);
2572 	if (unlikely(head == tail))
2573 		return;
2574 
2575 	/*
2576 	 * We will consume all events from HW, or at least pretend to.
2577 	 *
2578 	 * The sequence of events from the HW is deterministic, and derived
2579 	 * from our writes to the ELSP, with a smidgen of variability for
2580 	 * the arrival of the asynchronous requests wrt to the inflight
2581 	 * execution. If the HW sends an event that does not correspond with
2582 	 * the one we are expecting, we have to abandon all hope as we lose
2583 	 * all tracking of what the engine is actually executing. We will
2584 	 * only detect we are out of sequence with the HW when we get an
2585 	 * 'impossible' event because we have already drained our own
2586 	 * preemption/promotion queue. If this occurs, we know that we likely
2587 	 * lost track of execution earlier and must unwind and restart, the
2588 	 * simplest way is by stop processing the event queue and force the
2589 	 * engine to reset.
2590 	 */
2591 	execlists->csb_head = tail;
2592 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2593 
2594 	/*
2595 	 * Hopefully paired with a wmb() in HW!
2596 	 *
2597 	 * We must complete the read of the write pointer before any reads
2598 	 * from the CSB, so that we do not see stale values. Without an rmb
2599 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2600 	 * we perform the READ_ONCE(*csb_write).
2601 	 */
2602 	rmb();
2603 	do {
2604 		bool promote;
2605 
2606 		if (++head == num_entries)
2607 			head = 0;
2608 
2609 		/*
2610 		 * We are flying near dragons again.
2611 		 *
2612 		 * We hold a reference to the request in execlist_port[]
2613 		 * but no more than that. We are operating in softirq
2614 		 * context and so cannot hold any mutex or sleep. That
2615 		 * prevents us stopping the requests we are processing
2616 		 * in port[] from being retired simultaneously (the
2617 		 * breadcrumb will be complete before we see the
2618 		 * context-switch). As we only hold the reference to the
2619 		 * request, any pointer chasing underneath the request
2620 		 * is subject to a potential use-after-free. Thus we
2621 		 * store all of the bookkeeping within port[] as
2622 		 * required, and avoid using unguarded pointers beneath
2623 		 * request itself. The same applies to the atomic
2624 		 * status notifier.
2625 		 */
2626 
2627 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2628 			     head,
2629 			     upper_32_bits(buf[head]),
2630 			     lower_32_bits(buf[head]));
2631 
2632 		if (INTEL_GEN(engine->i915) >= 12)
2633 			promote = gen12_csb_parse(buf + head);
2634 		else
2635 			promote = gen8_csb_parse(buf + head);
2636 		if (promote) {
2637 			struct i915_request * const *old = execlists->active;
2638 
2639 			if (GEM_WARN_ON(!*execlists->pending)) {
2640 				execlists->error_interrupt |= ERROR_CSB;
2641 				break;
2642 			}
2643 
2644 			ring_set_paused(engine, 0);
2645 
2646 			/* Point active to the new ELSP; prevent overwriting */
2647 			WRITE_ONCE(execlists->active, execlists->pending);
2648 			smp_wmb(); /* notify execlists_active() */
2649 
2650 			/* cancel old inflight, prepare for switch */
2651 			trace_ports(execlists, "preempted", old);
2652 			while (*old)
2653 				execlists_schedule_out(*old++);
2654 
2655 			/* switch pending to inflight */
2656 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2657 			copy_ports(execlists->inflight,
2658 				   execlists->pending,
2659 				   execlists_num_ports(execlists));
2660 			smp_wmb(); /* complete the seqlock */
2661 			WRITE_ONCE(execlists->active, execlists->inflight);
2662 
2663 			/* XXX Magic delay for tgl */
2664 			ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2665 
2666 			WRITE_ONCE(execlists->pending[0], NULL);
2667 		} else {
2668 			if (GEM_WARN_ON(!*execlists->active)) {
2669 				execlists->error_interrupt |= ERROR_CSB;
2670 				break;
2671 			}
2672 
2673 			/* port0 completed, advanced to port1 */
2674 			trace_ports(execlists, "completed", execlists->active);
2675 
2676 			/*
2677 			 * We rely on the hardware being strongly
2678 			 * ordered, that the breadcrumb write is
2679 			 * coherent (visible from the CPU) before the
2680 			 * user interrupt is processed. One might assume
2681 			 * that the breadcrumb write being before the
2682 			 * user interrupt and the CS event for the context
2683 			 * switch would therefore be before the CS event
2684 			 * itself...
2685 			 */
2686 			if (GEM_SHOW_DEBUG() &&
2687 			    !i915_request_completed(*execlists->active)) {
2688 				struct i915_request *rq = *execlists->active;
2689 				const u32 *regs __maybe_unused =
2690 					rq->context->lrc_reg_state;
2691 
2692 				ENGINE_TRACE(engine,
2693 					     "context completed before request!\n");
2694 				ENGINE_TRACE(engine,
2695 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2696 					     ENGINE_READ(engine, RING_START),
2697 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2698 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2699 					     ENGINE_READ(engine, RING_CTL),
2700 					     ENGINE_READ(engine, RING_MI_MODE));
2701 				ENGINE_TRACE(engine,
2702 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2703 					     i915_ggtt_offset(rq->ring->vma),
2704 					     rq->head, rq->tail,
2705 					     rq->fence.context,
2706 					     lower_32_bits(rq->fence.seqno),
2707 					     hwsp_seqno(rq));
2708 				ENGINE_TRACE(engine,
2709 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2710 					     regs[CTX_RING_START],
2711 					     regs[CTX_RING_HEAD],
2712 					     regs[CTX_RING_TAIL]);
2713 			}
2714 
2715 			execlists_schedule_out(*execlists->active++);
2716 
2717 			GEM_BUG_ON(execlists->active - execlists->inflight >
2718 				   execlists_num_ports(execlists));
2719 		}
2720 	} while (head != tail);
2721 
2722 	set_timeslice(engine);
2723 
2724 	/*
2725 	 * Gen11 has proven to fail wrt global observation point between
2726 	 * entry and tail update, failing on the ordering and thus
2727 	 * we see an old entry in the context status buffer.
2728 	 *
2729 	 * Forcibly evict out entries for the next gpu csb update,
2730 	 * to increase the odds that we get a fresh entries with non
2731 	 * working hardware. The cost for doing so comes out mostly with
2732 	 * the wash as hardware, working or not, will need to do the
2733 	 * invalidation before.
2734 	 */
2735 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2736 }
2737 
2738 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2739 {
2740 	lockdep_assert_held(&engine->active.lock);
2741 	if (!READ_ONCE(engine->execlists.pending[0])) {
2742 		rcu_read_lock(); /* protect peeking at execlists->active */
2743 		execlists_dequeue(engine);
2744 		rcu_read_unlock();
2745 	}
2746 }
2747 
2748 static void __execlists_hold(struct i915_request *rq)
2749 {
2750 	LIST_HEAD(list);
2751 
2752 	do {
2753 		struct i915_dependency *p;
2754 
2755 		if (i915_request_is_active(rq))
2756 			__i915_request_unsubmit(rq);
2757 
2758 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2759 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2760 		i915_request_set_hold(rq);
2761 		RQ_TRACE(rq, "on hold\n");
2762 
2763 		for_each_waiter(p, rq) {
2764 			struct i915_request *w =
2765 				container_of(p->waiter, typeof(*w), sched);
2766 
2767 			/* Leave semaphores spinning on the other engines */
2768 			if (w->engine != rq->engine)
2769 				continue;
2770 
2771 			if (!i915_request_is_ready(w))
2772 				continue;
2773 
2774 			if (i915_request_completed(w))
2775 				continue;
2776 
2777 			if (i915_request_on_hold(w))
2778 				continue;
2779 
2780 			list_move_tail(&w->sched.link, &list);
2781 		}
2782 
2783 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2784 	} while (rq);
2785 }
2786 
2787 static bool execlists_hold(struct intel_engine_cs *engine,
2788 			   struct i915_request *rq)
2789 {
2790 	spin_lock_irq(&engine->active.lock);
2791 
2792 	if (i915_request_completed(rq)) { /* too late! */
2793 		rq = NULL;
2794 		goto unlock;
2795 	}
2796 
2797 	if (rq->engine != engine) { /* preempted virtual engine */
2798 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2799 
2800 		/*
2801 		 * intel_context_inflight() is only protected by virtue
2802 		 * of process_csb() being called only by the tasklet (or
2803 		 * directly from inside reset while the tasklet is suspended).
2804 		 * Assert that neither of those are allowed to run while we
2805 		 * poke at the request queues.
2806 		 */
2807 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2808 
2809 		/*
2810 		 * An unsubmitted request along a virtual engine will
2811 		 * remain on the active (this) engine until we are able
2812 		 * to process the context switch away (and so mark the
2813 		 * context as no longer in flight). That cannot have happened
2814 		 * yet, otherwise we would not be hanging!
2815 		 */
2816 		spin_lock(&ve->base.active.lock);
2817 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2818 		GEM_BUG_ON(ve->request != rq);
2819 		ve->request = NULL;
2820 		spin_unlock(&ve->base.active.lock);
2821 		i915_request_put(rq);
2822 
2823 		rq->engine = engine;
2824 	}
2825 
2826 	/*
2827 	 * Transfer this request onto the hold queue to prevent it
2828 	 * being resumbitted to HW (and potentially completed) before we have
2829 	 * released it. Since we may have already submitted following
2830 	 * requests, we need to remove those as well.
2831 	 */
2832 	GEM_BUG_ON(i915_request_on_hold(rq));
2833 	GEM_BUG_ON(rq->engine != engine);
2834 	__execlists_hold(rq);
2835 	GEM_BUG_ON(list_empty(&engine->active.hold));
2836 
2837 unlock:
2838 	spin_unlock_irq(&engine->active.lock);
2839 	return rq;
2840 }
2841 
2842 static bool hold_request(const struct i915_request *rq)
2843 {
2844 	struct i915_dependency *p;
2845 	bool result = false;
2846 
2847 	/*
2848 	 * If one of our ancestors is on hold, we must also be on hold,
2849 	 * otherwise we will bypass it and execute before it.
2850 	 */
2851 	rcu_read_lock();
2852 	for_each_signaler(p, rq) {
2853 		const struct i915_request *s =
2854 			container_of(p->signaler, typeof(*s), sched);
2855 
2856 		if (s->engine != rq->engine)
2857 			continue;
2858 
2859 		result = i915_request_on_hold(s);
2860 		if (result)
2861 			break;
2862 	}
2863 	rcu_read_unlock();
2864 
2865 	return result;
2866 }
2867 
2868 static void __execlists_unhold(struct i915_request *rq)
2869 {
2870 	LIST_HEAD(list);
2871 
2872 	do {
2873 		struct i915_dependency *p;
2874 
2875 		RQ_TRACE(rq, "hold release\n");
2876 
2877 		GEM_BUG_ON(!i915_request_on_hold(rq));
2878 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2879 
2880 		i915_request_clear_hold(rq);
2881 		list_move_tail(&rq->sched.link,
2882 			       i915_sched_lookup_priolist(rq->engine,
2883 							  rq_prio(rq)));
2884 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2885 
2886 		/* Also release any children on this engine that are ready */
2887 		for_each_waiter(p, rq) {
2888 			struct i915_request *w =
2889 				container_of(p->waiter, typeof(*w), sched);
2890 
2891 			/* Propagate any change in error status */
2892 			if (rq->fence.error)
2893 				i915_request_set_error_once(w, rq->fence.error);
2894 
2895 			if (w->engine != rq->engine)
2896 				continue;
2897 
2898 			if (!i915_request_on_hold(w))
2899 				continue;
2900 
2901 			/* Check that no other parents are also on hold */
2902 			if (hold_request(w))
2903 				continue;
2904 
2905 			list_move_tail(&w->sched.link, &list);
2906 		}
2907 
2908 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2909 	} while (rq);
2910 }
2911 
2912 static void execlists_unhold(struct intel_engine_cs *engine,
2913 			     struct i915_request *rq)
2914 {
2915 	spin_lock_irq(&engine->active.lock);
2916 
2917 	/*
2918 	 * Move this request back to the priority queue, and all of its
2919 	 * children and grandchildren that were suspended along with it.
2920 	 */
2921 	__execlists_unhold(rq);
2922 
2923 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2924 		engine->execlists.queue_priority_hint = rq_prio(rq);
2925 		tasklet_hi_schedule(&engine->execlists.tasklet);
2926 	}
2927 
2928 	spin_unlock_irq(&engine->active.lock);
2929 }
2930 
2931 struct execlists_capture {
2932 	struct work_struct work;
2933 	struct i915_request *rq;
2934 	struct i915_gpu_coredump *error;
2935 };
2936 
2937 static void execlists_capture_work(struct work_struct *work)
2938 {
2939 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2940 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2941 	struct intel_engine_cs *engine = cap->rq->engine;
2942 	struct intel_gt_coredump *gt = cap->error->gt;
2943 	struct intel_engine_capture_vma *vma;
2944 
2945 	/* Compress all the objects attached to the request, slow! */
2946 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2947 	if (vma) {
2948 		struct i915_vma_compress *compress =
2949 			i915_vma_capture_prepare(gt);
2950 
2951 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2952 		i915_vma_capture_finish(gt, compress);
2953 	}
2954 
2955 	gt->simulated = gt->engine->simulated;
2956 	cap->error->simulated = gt->simulated;
2957 
2958 	/* Publish the error state, and announce it to the world */
2959 	i915_error_state_store(cap->error);
2960 	i915_gpu_coredump_put(cap->error);
2961 
2962 	/* Return this request and all that depend upon it for signaling */
2963 	execlists_unhold(engine, cap->rq);
2964 	i915_request_put(cap->rq);
2965 
2966 	kfree(cap);
2967 }
2968 
2969 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2970 {
2971 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2972 	struct execlists_capture *cap;
2973 
2974 	cap = kmalloc(sizeof(*cap), gfp);
2975 	if (!cap)
2976 		return NULL;
2977 
2978 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2979 	if (!cap->error)
2980 		goto err_cap;
2981 
2982 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2983 	if (!cap->error->gt)
2984 		goto err_gpu;
2985 
2986 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2987 	if (!cap->error->gt->engine)
2988 		goto err_gt;
2989 
2990 	return cap;
2991 
2992 err_gt:
2993 	kfree(cap->error->gt);
2994 err_gpu:
2995 	kfree(cap->error);
2996 err_cap:
2997 	kfree(cap);
2998 	return NULL;
2999 }
3000 
3001 static struct i915_request *
3002 active_context(struct intel_engine_cs *engine, u32 ccid)
3003 {
3004 	const struct intel_engine_execlists * const el = &engine->execlists;
3005 	struct i915_request * const *port, *rq;
3006 
3007 	/*
3008 	 * Use the most recent result from process_csb(), but just in case
3009 	 * we trigger an error (via interrupt) before the first CS event has
3010 	 * been written, peek at the next submission.
3011 	 */
3012 
3013 	for (port = el->active; (rq = *port); port++) {
3014 		if (rq->context->lrc.ccid == ccid) {
3015 			ENGINE_TRACE(engine,
3016 				     "ccid found at active:%zd\n",
3017 				     port - el->active);
3018 			return rq;
3019 		}
3020 	}
3021 
3022 	for (port = el->pending; (rq = *port); port++) {
3023 		if (rq->context->lrc.ccid == ccid) {
3024 			ENGINE_TRACE(engine,
3025 				     "ccid found at pending:%zd\n",
3026 				     port - el->pending);
3027 			return rq;
3028 		}
3029 	}
3030 
3031 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3032 	return NULL;
3033 }
3034 
3035 static u32 active_ccid(struct intel_engine_cs *engine)
3036 {
3037 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3038 }
3039 
3040 static void execlists_capture(struct intel_engine_cs *engine)
3041 {
3042 	struct execlists_capture *cap;
3043 
3044 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3045 		return;
3046 
3047 	/*
3048 	 * We need to _quickly_ capture the engine state before we reset.
3049 	 * We are inside an atomic section (softirq) here and we are delaying
3050 	 * the forced preemption event.
3051 	 */
3052 	cap = capture_regs(engine);
3053 	if (!cap)
3054 		return;
3055 
3056 	spin_lock_irq(&engine->active.lock);
3057 	cap->rq = active_context(engine, active_ccid(engine));
3058 	if (cap->rq) {
3059 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3060 		cap->rq = i915_request_get_rcu(cap->rq);
3061 	}
3062 	spin_unlock_irq(&engine->active.lock);
3063 	if (!cap->rq)
3064 		goto err_free;
3065 
3066 	/*
3067 	 * Remove the request from the execlists queue, and take ownership
3068 	 * of the request. We pass it to our worker who will _slowly_ compress
3069 	 * all the pages the _user_ requested for debugging their batch, after
3070 	 * which we return it to the queue for signaling.
3071 	 *
3072 	 * By removing them from the execlists queue, we also remove the
3073 	 * requests from being processed by __unwind_incomplete_requests()
3074 	 * during the intel_engine_reset(), and so they will *not* be replayed
3075 	 * afterwards.
3076 	 *
3077 	 * Note that because we have not yet reset the engine at this point,
3078 	 * it is possible for the request that we have identified as being
3079 	 * guilty, did in fact complete and we will then hit an arbitration
3080 	 * point allowing the outstanding preemption to succeed. The likelihood
3081 	 * of that is very low (as capturing of the engine registers should be
3082 	 * fast enough to run inside an irq-off atomic section!), so we will
3083 	 * simply hold that request accountable for being non-preemptible
3084 	 * long enough to force the reset.
3085 	 */
3086 	if (!execlists_hold(engine, cap->rq))
3087 		goto err_rq;
3088 
3089 	INIT_WORK(&cap->work, execlists_capture_work);
3090 	schedule_work(&cap->work);
3091 	return;
3092 
3093 err_rq:
3094 	i915_request_put(cap->rq);
3095 err_free:
3096 	i915_gpu_coredump_put(cap->error);
3097 	kfree(cap);
3098 }
3099 
3100 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3101 {
3102 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3103 	unsigned long *lock = &engine->gt->reset.flags;
3104 
3105 	if (!intel_has_reset_engine(engine->gt))
3106 		return;
3107 
3108 	if (test_and_set_bit(bit, lock))
3109 		return;
3110 
3111 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3112 
3113 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3114 	tasklet_disable_nosync(&engine->execlists.tasklet);
3115 
3116 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3117 	execlists_capture(engine);
3118 	intel_engine_reset(engine, msg);
3119 
3120 	tasklet_enable(&engine->execlists.tasklet);
3121 	clear_and_wake_up_bit(bit, lock);
3122 }
3123 
3124 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3125 {
3126 	const struct timer_list *t = &engine->execlists.preempt;
3127 
3128 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3129 		return false;
3130 
3131 	if (!timer_expired(t))
3132 		return false;
3133 
3134 	return READ_ONCE(engine->execlists.pending[0]);
3135 }
3136 
3137 /*
3138  * Check the unread Context Status Buffers and manage the submission of new
3139  * contexts to the ELSP accordingly.
3140  */
3141 static void execlists_submission_tasklet(unsigned long data)
3142 {
3143 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3144 	bool timeout = preempt_timeout(engine);
3145 
3146 	process_csb(engine);
3147 
3148 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3149 		const char *msg;
3150 
3151 		/* Generate the error message in priority wrt to the user! */
3152 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3153 			msg = "CS error"; /* thrown by a user payload */
3154 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3155 			msg = "invalid CSB event";
3156 		else
3157 			msg = "internal error";
3158 
3159 		engine->execlists.error_interrupt = 0;
3160 		execlists_reset(engine, msg);
3161 	}
3162 
3163 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3164 		unsigned long flags;
3165 
3166 		spin_lock_irqsave(&engine->active.lock, flags);
3167 		__execlists_submission_tasklet(engine);
3168 		spin_unlock_irqrestore(&engine->active.lock, flags);
3169 
3170 		/* Recheck after serialising with direct-submission */
3171 		if (unlikely(timeout && preempt_timeout(engine)))
3172 			execlists_reset(engine, "preemption time out");
3173 	}
3174 }
3175 
3176 static void __execlists_kick(struct intel_engine_execlists *execlists)
3177 {
3178 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3179 	tasklet_hi_schedule(&execlists->tasklet);
3180 }
3181 
3182 #define execlists_kick(t, member) \
3183 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3184 
3185 static void execlists_timeslice(struct timer_list *timer)
3186 {
3187 	execlists_kick(timer, timer);
3188 }
3189 
3190 static void execlists_preempt(struct timer_list *timer)
3191 {
3192 	execlists_kick(timer, preempt);
3193 }
3194 
3195 static void queue_request(struct intel_engine_cs *engine,
3196 			  struct i915_request *rq)
3197 {
3198 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3199 	list_add_tail(&rq->sched.link,
3200 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3201 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3202 }
3203 
3204 static void __submit_queue_imm(struct intel_engine_cs *engine)
3205 {
3206 	struct intel_engine_execlists * const execlists = &engine->execlists;
3207 
3208 	if (reset_in_progress(execlists))
3209 		return; /* defer until we restart the engine following reset */
3210 
3211 	__execlists_submission_tasklet(engine);
3212 }
3213 
3214 static void submit_queue(struct intel_engine_cs *engine,
3215 			 const struct i915_request *rq)
3216 {
3217 	struct intel_engine_execlists *execlists = &engine->execlists;
3218 
3219 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3220 		return;
3221 
3222 	execlists->queue_priority_hint = rq_prio(rq);
3223 	__submit_queue_imm(engine);
3224 }
3225 
3226 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3227 			     const struct i915_request *rq)
3228 {
3229 	GEM_BUG_ON(i915_request_on_hold(rq));
3230 	return !list_empty(&engine->active.hold) && hold_request(rq);
3231 }
3232 
3233 static void flush_csb(struct intel_engine_cs *engine)
3234 {
3235 	struct intel_engine_execlists *el = &engine->execlists;
3236 
3237 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3238 		if (!reset_in_progress(el))
3239 			process_csb(engine);
3240 		tasklet_unlock(&el->tasklet);
3241 	}
3242 }
3243 
3244 static void execlists_submit_request(struct i915_request *request)
3245 {
3246 	struct intel_engine_cs *engine = request->engine;
3247 	unsigned long flags;
3248 
3249 	/* Hopefully we clear execlists->pending[] to let us through */
3250 	flush_csb(engine);
3251 
3252 	/* Will be called from irq-context when using foreign fences. */
3253 	spin_lock_irqsave(&engine->active.lock, flags);
3254 
3255 	if (unlikely(ancestor_on_hold(engine, request))) {
3256 		RQ_TRACE(request, "ancestor on hold\n");
3257 		list_add_tail(&request->sched.link, &engine->active.hold);
3258 		i915_request_set_hold(request);
3259 	} else {
3260 		queue_request(engine, request);
3261 
3262 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3263 		GEM_BUG_ON(list_empty(&request->sched.link));
3264 
3265 		submit_queue(engine, request);
3266 	}
3267 
3268 	spin_unlock_irqrestore(&engine->active.lock, flags);
3269 }
3270 
3271 static void __execlists_context_fini(struct intel_context *ce)
3272 {
3273 	intel_ring_put(ce->ring);
3274 	i915_vma_put(ce->state);
3275 }
3276 
3277 static void execlists_context_destroy(struct kref *kref)
3278 {
3279 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3280 
3281 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3282 	GEM_BUG_ON(intel_context_is_pinned(ce));
3283 
3284 	if (ce->state)
3285 		__execlists_context_fini(ce);
3286 
3287 	intel_context_fini(ce);
3288 	intel_context_free(ce);
3289 }
3290 
3291 static void
3292 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3293 {
3294 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3295 		return;
3296 
3297 	vaddr += engine->context_size;
3298 
3299 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3300 }
3301 
3302 static void
3303 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3304 {
3305 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3306 		return;
3307 
3308 	vaddr += engine->context_size;
3309 
3310 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3311 		drm_err_once(&engine->i915->drm,
3312 			     "%s context redzone overwritten!\n",
3313 			     engine->name);
3314 }
3315 
3316 static void execlists_context_unpin(struct intel_context *ce)
3317 {
3318 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3319 		      ce->engine);
3320 }
3321 
3322 static void execlists_context_post_unpin(struct intel_context *ce)
3323 {
3324 	i915_gem_object_unpin_map(ce->state->obj);
3325 }
3326 
3327 static u32 *
3328 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3329 {
3330 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3331 		MI_SRM_LRM_GLOBAL_GTT |
3332 		MI_LRI_LRM_CS_MMIO;
3333 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3334 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3335 		CTX_TIMESTAMP * sizeof(u32);
3336 	*cs++ = 0;
3337 
3338 	*cs++ = MI_LOAD_REGISTER_REG |
3339 		MI_LRR_SOURCE_CS_MMIO |
3340 		MI_LRI_LRM_CS_MMIO;
3341 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3342 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3343 
3344 	*cs++ = MI_LOAD_REGISTER_REG |
3345 		MI_LRR_SOURCE_CS_MMIO |
3346 		MI_LRI_LRM_CS_MMIO;
3347 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3348 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3349 
3350 	return cs;
3351 }
3352 
3353 static u32 *
3354 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3355 {
3356 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3357 
3358 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3359 		MI_SRM_LRM_GLOBAL_GTT |
3360 		MI_LRI_LRM_CS_MMIO;
3361 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3362 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3363 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3364 	*cs++ = 0;
3365 
3366 	return cs;
3367 }
3368 
3369 static u32 *
3370 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3371 {
3372 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3373 
3374 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3375 		MI_SRM_LRM_GLOBAL_GTT |
3376 		MI_LRI_LRM_CS_MMIO;
3377 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3378 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3379 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3380 	*cs++ = 0;
3381 
3382 	*cs++ = MI_LOAD_REGISTER_REG |
3383 		MI_LRR_SOURCE_CS_MMIO |
3384 		MI_LRI_LRM_CS_MMIO;
3385 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3386 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3387 
3388 	return cs;
3389 }
3390 
3391 static u32 *
3392 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3393 {
3394 	cs = gen12_emit_timestamp_wa(ce, cs);
3395 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3396 	cs = gen12_emit_restore_scratch(ce, cs);
3397 
3398 	return cs;
3399 }
3400 
3401 static u32 *
3402 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3403 {
3404 	cs = gen12_emit_timestamp_wa(ce, cs);
3405 	cs = gen12_emit_restore_scratch(ce, cs);
3406 
3407 	return cs;
3408 }
3409 
3410 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3411 {
3412 	return PAGE_SIZE * ce->wa_bb_page;
3413 }
3414 
3415 static u32 *context_indirect_bb(const struct intel_context *ce)
3416 {
3417 	void *ptr;
3418 
3419 	GEM_BUG_ON(!ce->wa_bb_page);
3420 
3421 	ptr = ce->lrc_reg_state;
3422 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3423 	ptr += context_wa_bb_offset(ce);
3424 
3425 	return ptr;
3426 }
3427 
3428 static void
3429 setup_indirect_ctx_bb(const struct intel_context *ce,
3430 		      const struct intel_engine_cs *engine,
3431 		      u32 *(*emit)(const struct intel_context *, u32 *))
3432 {
3433 	u32 * const start = context_indirect_bb(ce);
3434 	u32 *cs;
3435 
3436 	cs = emit(ce, start);
3437 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3438 	while ((unsigned long)cs % CACHELINE_BYTES)
3439 		*cs++ = MI_NOOP;
3440 
3441 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3442 				    i915_ggtt_offset(ce->state) +
3443 				    context_wa_bb_offset(ce),
3444 				    (cs - start) * sizeof(*cs));
3445 }
3446 
3447 static void
3448 __execlists_update_reg_state(const struct intel_context *ce,
3449 			     const struct intel_engine_cs *engine,
3450 			     u32 head)
3451 {
3452 	struct intel_ring *ring = ce->ring;
3453 	u32 *regs = ce->lrc_reg_state;
3454 
3455 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3456 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3457 
3458 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3459 	regs[CTX_RING_HEAD] = head;
3460 	regs[CTX_RING_TAIL] = ring->tail;
3461 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3462 
3463 	/* RPCS */
3464 	if (engine->class == RENDER_CLASS) {
3465 		regs[CTX_R_PWR_CLK_STATE] =
3466 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3467 
3468 		i915_oa_init_reg_state(ce, engine);
3469 	}
3470 
3471 	if (ce->wa_bb_page) {
3472 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3473 
3474 		fn = gen12_emit_indirect_ctx_xcs;
3475 		if (ce->engine->class == RENDER_CLASS)
3476 			fn = gen12_emit_indirect_ctx_rcs;
3477 
3478 		/* Mutually exclusive wrt to global indirect bb */
3479 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3480 		setup_indirect_ctx_bb(ce, engine, fn);
3481 	}
3482 }
3483 
3484 static int
3485 execlists_context_pre_pin(struct intel_context *ce,
3486 			  struct i915_gem_ww_ctx *ww, void **vaddr)
3487 {
3488 	GEM_BUG_ON(!ce->state);
3489 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3490 
3491 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
3492 					i915_coherent_map_type(ce->engine->i915) |
3493 					I915_MAP_OVERRIDE);
3494 
3495 	return PTR_ERR_OR_ZERO(*vaddr);
3496 }
3497 
3498 static int
3499 __execlists_context_pin(struct intel_context *ce,
3500 			struct intel_engine_cs *engine,
3501 			void *vaddr)
3502 {
3503 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3504 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3505 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3506 
3507 	return 0;
3508 }
3509 
3510 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3511 {
3512 	return __execlists_context_pin(ce, ce->engine, vaddr);
3513 }
3514 
3515 static int execlists_context_alloc(struct intel_context *ce)
3516 {
3517 	return __execlists_context_alloc(ce, ce->engine);
3518 }
3519 
3520 static void execlists_context_reset(struct intel_context *ce)
3521 {
3522 	CE_TRACE(ce, "reset\n");
3523 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3524 
3525 	intel_ring_reset(ce->ring, ce->ring->emit);
3526 
3527 	/* Scrub away the garbage */
3528 	execlists_init_reg_state(ce->lrc_reg_state,
3529 				 ce, ce->engine, ce->ring, true);
3530 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3531 
3532 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3533 }
3534 
3535 static const struct intel_context_ops execlists_context_ops = {
3536 	.alloc = execlists_context_alloc,
3537 
3538 	.pre_pin = execlists_context_pre_pin,
3539 	.pin = execlists_context_pin,
3540 	.unpin = execlists_context_unpin,
3541 	.post_unpin = execlists_context_post_unpin,
3542 
3543 	.enter = intel_context_enter_engine,
3544 	.exit = intel_context_exit_engine,
3545 
3546 	.reset = execlists_context_reset,
3547 	.destroy = execlists_context_destroy,
3548 };
3549 
3550 static u32 hwsp_offset(const struct i915_request *rq)
3551 {
3552 	const struct intel_timeline_cacheline *cl;
3553 
3554 	/* Before the request is executed, the timeline/cachline is fixed */
3555 
3556 	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3557 	if (cl)
3558 		return cl->ggtt_offset;
3559 
3560 	return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3561 }
3562 
3563 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3564 {
3565 	u32 *cs;
3566 
3567 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3568 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3569 		return 0;
3570 
3571 	cs = intel_ring_begin(rq, 6);
3572 	if (IS_ERR(cs))
3573 		return PTR_ERR(cs);
3574 
3575 	/*
3576 	 * Check if we have been preempted before we even get started.
3577 	 *
3578 	 * After this point i915_request_started() reports true, even if
3579 	 * we get preempted and so are no longer running.
3580 	 */
3581 	*cs++ = MI_ARB_CHECK;
3582 	*cs++ = MI_NOOP;
3583 
3584 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3585 	*cs++ = hwsp_offset(rq);
3586 	*cs++ = 0;
3587 	*cs++ = rq->fence.seqno - 1;
3588 
3589 	intel_ring_advance(rq, cs);
3590 
3591 	/* Record the updated position of the request's payload */
3592 	rq->infix = intel_ring_offset(rq, cs);
3593 
3594 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3595 
3596 	return 0;
3597 }
3598 
3599 static int emit_pdps(struct i915_request *rq)
3600 {
3601 	const struct intel_engine_cs * const engine = rq->engine;
3602 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3603 	int err, i;
3604 	u32 *cs;
3605 
3606 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3607 
3608 	/*
3609 	 * Beware ye of the dragons, this sequence is magic!
3610 	 *
3611 	 * Small changes to this sequence can cause anything from
3612 	 * GPU hangs to forcewake errors and machine lockups!
3613 	 */
3614 
3615 	/* Flush any residual operations from the context load */
3616 	err = engine->emit_flush(rq, EMIT_FLUSH);
3617 	if (err)
3618 		return err;
3619 
3620 	/* Magic required to prevent forcewake errors! */
3621 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3622 	if (err)
3623 		return err;
3624 
3625 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3626 	if (IS_ERR(cs))
3627 		return PTR_ERR(cs);
3628 
3629 	/* Ensure the LRI have landed before we invalidate & continue */
3630 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3631 	for (i = GEN8_3LVL_PDPES; i--; ) {
3632 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3633 		u32 base = engine->mmio_base;
3634 
3635 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3636 		*cs++ = upper_32_bits(pd_daddr);
3637 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3638 		*cs++ = lower_32_bits(pd_daddr);
3639 	}
3640 	*cs++ = MI_NOOP;
3641 
3642 	intel_ring_advance(rq, cs);
3643 
3644 	return 0;
3645 }
3646 
3647 static int execlists_request_alloc(struct i915_request *request)
3648 {
3649 	int ret;
3650 
3651 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3652 
3653 	/*
3654 	 * Flush enough space to reduce the likelihood of waiting after
3655 	 * we start building the request - in which case we will just
3656 	 * have to repeat work.
3657 	 */
3658 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3659 
3660 	/*
3661 	 * Note that after this point, we have committed to using
3662 	 * this request as it is being used to both track the
3663 	 * state of engine initialisation and liveness of the
3664 	 * golden renderstate above. Think twice before you try
3665 	 * to cancel/unwind this request now.
3666 	 */
3667 
3668 	if (!i915_vm_is_4lvl(request->context->vm)) {
3669 		ret = emit_pdps(request);
3670 		if (ret)
3671 			return ret;
3672 	}
3673 
3674 	/* Unconditionally invalidate GPU caches and TLBs. */
3675 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3676 	if (ret)
3677 		return ret;
3678 
3679 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3680 	return 0;
3681 }
3682 
3683 /*
3684  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3685  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3686  * but there is a slight complication as this is applied in WA batch where the
3687  * values are only initialized once so we cannot take register value at the
3688  * beginning and reuse it further; hence we save its value to memory, upload a
3689  * constant value with bit21 set and then we restore it back with the saved value.
3690  * To simplify the WA, a constant value is formed by using the default value
3691  * of this register. This shouldn't be a problem because we are only modifying
3692  * it for a short period and this batch in non-premptible. We can ofcourse
3693  * use additional instructions that read the actual value of the register
3694  * at that time and set our bit of interest but it makes the WA complicated.
3695  *
3696  * This WA is also required for Gen9 so extracting as a function avoids
3697  * code duplication.
3698  */
3699 static u32 *
3700 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3701 {
3702 	/* NB no one else is allowed to scribble over scratch + 256! */
3703 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3704 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3705 	*batch++ = intel_gt_scratch_offset(engine->gt,
3706 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3707 	*batch++ = 0;
3708 
3709 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3710 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3711 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3712 
3713 	batch = gen8_emit_pipe_control(batch,
3714 				       PIPE_CONTROL_CS_STALL |
3715 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3716 				       0);
3717 
3718 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3719 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3720 	*batch++ = intel_gt_scratch_offset(engine->gt,
3721 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3722 	*batch++ = 0;
3723 
3724 	return batch;
3725 }
3726 
3727 /*
3728  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3729  * initialized at the beginning and shared across all contexts but this field
3730  * helps us to have multiple batches at different offsets and select them based
3731  * on a criteria. At the moment this batch always start at the beginning of the page
3732  * and at this point we don't have multiple wa_ctx batch buffers.
3733  *
3734  * The number of WA applied are not known at the beginning; we use this field
3735  * to return the no of DWORDS written.
3736  *
3737  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3738  * so it adds NOOPs as padding to make it cacheline aligned.
3739  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3740  * makes a complete batch buffer.
3741  */
3742 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3743 {
3744 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3745 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3746 
3747 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3748 	if (IS_BROADWELL(engine->i915))
3749 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3750 
3751 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3752 	/* Actual scratch location is at 128 bytes offset */
3753 	batch = gen8_emit_pipe_control(batch,
3754 				       PIPE_CONTROL_FLUSH_L3 |
3755 				       PIPE_CONTROL_STORE_DATA_INDEX |
3756 				       PIPE_CONTROL_CS_STALL |
3757 				       PIPE_CONTROL_QW_WRITE,
3758 				       LRC_PPHWSP_SCRATCH_ADDR);
3759 
3760 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3761 
3762 	/* Pad to end of cacheline */
3763 	while ((unsigned long)batch % CACHELINE_BYTES)
3764 		*batch++ = MI_NOOP;
3765 
3766 	/*
3767 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3768 	 * execution depends on the length specified in terms of cache lines
3769 	 * in the register CTX_RCS_INDIRECT_CTX
3770 	 */
3771 
3772 	return batch;
3773 }
3774 
3775 struct lri {
3776 	i915_reg_t reg;
3777 	u32 value;
3778 };
3779 
3780 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3781 {
3782 	GEM_BUG_ON(!count || count > 63);
3783 
3784 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3785 	do {
3786 		*batch++ = i915_mmio_reg_offset(lri->reg);
3787 		*batch++ = lri->value;
3788 	} while (lri++, --count);
3789 	*batch++ = MI_NOOP;
3790 
3791 	return batch;
3792 }
3793 
3794 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3795 {
3796 	static const struct lri lri[] = {
3797 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3798 		{
3799 			COMMON_SLICE_CHICKEN2,
3800 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3801 				       0),
3802 		},
3803 
3804 		/* BSpec: 11391 */
3805 		{
3806 			FF_SLICE_CHICKEN,
3807 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3808 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3809 		},
3810 
3811 		/* BSpec: 11299 */
3812 		{
3813 			_3D_CHICKEN3,
3814 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3815 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3816 		}
3817 	};
3818 
3819 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3820 
3821 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3822 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3823 
3824 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3825 	batch = gen8_emit_pipe_control(batch,
3826 				       PIPE_CONTROL_FLUSH_L3 |
3827 				       PIPE_CONTROL_STORE_DATA_INDEX |
3828 				       PIPE_CONTROL_CS_STALL |
3829 				       PIPE_CONTROL_QW_WRITE,
3830 				       LRC_PPHWSP_SCRATCH_ADDR);
3831 
3832 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3833 
3834 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3835 	if (HAS_POOLED_EU(engine->i915)) {
3836 		/*
3837 		 * EU pool configuration is setup along with golden context
3838 		 * during context initialization. This value depends on
3839 		 * device type (2x6 or 3x6) and needs to be updated based
3840 		 * on which subslice is disabled especially for 2x6
3841 		 * devices, however it is safe to load default
3842 		 * configuration of 3x6 device instead of masking off
3843 		 * corresponding bits because HW ignores bits of a disabled
3844 		 * subslice and drops down to appropriate config. Please
3845 		 * see render_state_setup() in i915_gem_render_state.c for
3846 		 * possible configurations, to avoid duplication they are
3847 		 * not shown here again.
3848 		 */
3849 		*batch++ = GEN9_MEDIA_POOL_STATE;
3850 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3851 		*batch++ = 0x00777000;
3852 		*batch++ = 0;
3853 		*batch++ = 0;
3854 		*batch++ = 0;
3855 	}
3856 
3857 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3858 
3859 	/* Pad to end of cacheline */
3860 	while ((unsigned long)batch % CACHELINE_BYTES)
3861 		*batch++ = MI_NOOP;
3862 
3863 	return batch;
3864 }
3865 
3866 static u32 *
3867 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3868 {
3869 	int i;
3870 
3871 	/*
3872 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3873 	 *
3874 	 * Ensure the engine is idle prior to programming a
3875 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3876 	 */
3877 	batch = gen8_emit_pipe_control(batch,
3878 				       PIPE_CONTROL_CS_STALL,
3879 				       0);
3880 	/*
3881 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3882 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3883 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3884 	 * confusing. Since gen8_emit_pipe_control() already advances the
3885 	 * batch by 6 dwords, we advance the other 10 here, completing a
3886 	 * cacheline. It's not clear if the workaround requires this padding
3887 	 * before other commands, or if it's just the regular padding we would
3888 	 * already have for the workaround bb, so leave it here for now.
3889 	 */
3890 	for (i = 0; i < 10; i++)
3891 		*batch++ = MI_NOOP;
3892 
3893 	/* Pad to end of cacheline */
3894 	while ((unsigned long)batch % CACHELINE_BYTES)
3895 		*batch++ = MI_NOOP;
3896 
3897 	return batch;
3898 }
3899 
3900 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3901 
3902 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3903 {
3904 	struct drm_i915_gem_object *obj;
3905 	struct i915_vma *vma;
3906 	int err;
3907 
3908 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3909 	if (IS_ERR(obj))
3910 		return PTR_ERR(obj);
3911 
3912 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3913 	if (IS_ERR(vma)) {
3914 		err = PTR_ERR(vma);
3915 		goto err;
3916 	}
3917 
3918 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3919 	if (err)
3920 		goto err;
3921 
3922 	engine->wa_ctx.vma = vma;
3923 	return 0;
3924 
3925 err:
3926 	i915_gem_object_put(obj);
3927 	return err;
3928 }
3929 
3930 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3931 {
3932 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3933 }
3934 
3935 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3936 
3937 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3938 {
3939 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3940 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3941 					    &wa_ctx->per_ctx };
3942 	wa_bb_func_t wa_bb_fn[2];
3943 	void *batch, *batch_ptr;
3944 	unsigned int i;
3945 	int ret;
3946 
3947 	if (engine->class != RENDER_CLASS)
3948 		return 0;
3949 
3950 	switch (INTEL_GEN(engine->i915)) {
3951 	case 12:
3952 	case 11:
3953 		return 0;
3954 	case 10:
3955 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3956 		wa_bb_fn[1] = NULL;
3957 		break;
3958 	case 9:
3959 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3960 		wa_bb_fn[1] = NULL;
3961 		break;
3962 	case 8:
3963 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3964 		wa_bb_fn[1] = NULL;
3965 		break;
3966 	default:
3967 		MISSING_CASE(INTEL_GEN(engine->i915));
3968 		return 0;
3969 	}
3970 
3971 	ret = lrc_setup_wa_ctx(engine);
3972 	if (ret) {
3973 		drm_dbg(&engine->i915->drm,
3974 			"Failed to setup context WA page: %d\n", ret);
3975 		return ret;
3976 	}
3977 
3978 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3979 
3980 	/*
3981 	 * Emit the two workaround batch buffers, recording the offset from the
3982 	 * start of the workaround batch buffer object for each and their
3983 	 * respective sizes.
3984 	 */
3985 	batch_ptr = batch;
3986 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3987 		wa_bb[i]->offset = batch_ptr - batch;
3988 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3989 						  CACHELINE_BYTES))) {
3990 			ret = -EINVAL;
3991 			break;
3992 		}
3993 		if (wa_bb_fn[i])
3994 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3995 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3996 	}
3997 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3998 
3999 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4000 	__i915_gem_object_release_map(wa_ctx->vma->obj);
4001 	if (ret)
4002 		lrc_destroy_wa_ctx(engine);
4003 
4004 	return ret;
4005 }
4006 
4007 static void reset_csb_pointers(struct intel_engine_cs *engine)
4008 {
4009 	struct intel_engine_execlists * const execlists = &engine->execlists;
4010 	const unsigned int reset_value = execlists->csb_size - 1;
4011 
4012 	ring_set_paused(engine, 0);
4013 
4014 	/*
4015 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4016 	 * Bludgeon them with a mmio update to be sure.
4017 	 */
4018 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4019 		     0xffff << 16 | reset_value << 8 | reset_value);
4020 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4021 
4022 	/*
4023 	 * After a reset, the HW starts writing into CSB entry [0]. We
4024 	 * therefore have to set our HEAD pointer back one entry so that
4025 	 * the *first* entry we check is entry 0. To complicate this further,
4026 	 * as we don't wait for the first interrupt after reset, we have to
4027 	 * fake the HW write to point back to the last entry so that our
4028 	 * inline comparison of our cached head position against the last HW
4029 	 * write works even before the first interrupt.
4030 	 */
4031 	execlists->csb_head = reset_value;
4032 	WRITE_ONCE(*execlists->csb_write, reset_value);
4033 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4034 
4035 	/* Check that the GPU does indeed update the CSB entries! */
4036 	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4037 	invalidate_csb_entries(&execlists->csb_status[0],
4038 			       &execlists->csb_status[reset_value]);
4039 
4040 	/* Once more for luck and our trusty paranoia */
4041 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4042 		     0xffff << 16 | reset_value << 8 | reset_value);
4043 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4044 
4045 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4046 }
4047 
4048 static void execlists_sanitize(struct intel_engine_cs *engine)
4049 {
4050 	/*
4051 	 * Poison residual state on resume, in case the suspend didn't!
4052 	 *
4053 	 * We have to assume that across suspend/resume (or other loss
4054 	 * of control) that the contents of our pinned buffers has been
4055 	 * lost, replaced by garbage. Since this doesn't always happen,
4056 	 * let's poison such state so that we more quickly spot when
4057 	 * we falsely assume it has been preserved.
4058 	 */
4059 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4060 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4061 
4062 	reset_csb_pointers(engine);
4063 
4064 	/*
4065 	 * The kernel_context HWSP is stored in the status_page. As above,
4066 	 * that may be lost on resume/initialisation, and so we need to
4067 	 * reset the value in the HWSP.
4068 	 */
4069 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4070 
4071 	/* And scrub the dirty cachelines for the HWSP */
4072 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4073 }
4074 
4075 static void enable_error_interrupt(struct intel_engine_cs *engine)
4076 {
4077 	u32 status;
4078 
4079 	engine->execlists.error_interrupt = 0;
4080 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4081 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4082 
4083 	status = ENGINE_READ(engine, RING_ESR);
4084 	if (unlikely(status)) {
4085 		drm_err(&engine->i915->drm,
4086 			"engine '%s' resumed still in error: %08x\n",
4087 			engine->name, status);
4088 		__intel_gt_reset(engine->gt, engine->mask);
4089 	}
4090 
4091 	/*
4092 	 * On current gen8+, we have 2 signals to play with
4093 	 *
4094 	 * - I915_ERROR_INSTUCTION (bit 0)
4095 	 *
4096 	 *    Generate an error if the command parser encounters an invalid
4097 	 *    instruction
4098 	 *
4099 	 *    This is a fatal error.
4100 	 *
4101 	 * - CP_PRIV (bit 2)
4102 	 *
4103 	 *    Generate an error on privilege violation (where the CP replaces
4104 	 *    the instruction with a no-op). This also fires for writes into
4105 	 *    read-only scratch pages.
4106 	 *
4107 	 *    This is a non-fatal error, parsing continues.
4108 	 *
4109 	 * * there are a few others defined for odd HW that we do not use
4110 	 *
4111 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4112 	 * error (as the HW is validating and suppressing the mistakes), we
4113 	 * only unmask the instruction error bit.
4114 	 */
4115 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4116 }
4117 
4118 static void enable_execlists(struct intel_engine_cs *engine)
4119 {
4120 	u32 mode;
4121 
4122 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4123 
4124 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4125 
4126 	if (INTEL_GEN(engine->i915) >= 11)
4127 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4128 	else
4129 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4130 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4131 
4132 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4133 
4134 	ENGINE_WRITE_FW(engine,
4135 			RING_HWS_PGA,
4136 			i915_ggtt_offset(engine->status_page.vma));
4137 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4138 
4139 	enable_error_interrupt(engine);
4140 
4141 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4142 }
4143 
4144 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4145 {
4146 	bool unexpected = false;
4147 
4148 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4149 		drm_dbg(&engine->i915->drm,
4150 			"STOP_RING still set in RING_MI_MODE\n");
4151 		unexpected = true;
4152 	}
4153 
4154 	return unexpected;
4155 }
4156 
4157 static int execlists_resume(struct intel_engine_cs *engine)
4158 {
4159 	intel_mocs_init_engine(engine);
4160 
4161 	intel_breadcrumbs_reset(engine->breadcrumbs);
4162 
4163 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4164 		struct drm_printer p = drm_debug_printer(__func__);
4165 
4166 		intel_engine_dump(engine, &p, NULL);
4167 	}
4168 
4169 	enable_execlists(engine);
4170 
4171 	return 0;
4172 }
4173 
4174 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4175 {
4176 	struct intel_engine_execlists * const execlists = &engine->execlists;
4177 	unsigned long flags;
4178 
4179 	ENGINE_TRACE(engine, "depth<-%d\n",
4180 		     atomic_read(&execlists->tasklet.count));
4181 
4182 	/*
4183 	 * Prevent request submission to the hardware until we have
4184 	 * completed the reset in i915_gem_reset_finish(). If a request
4185 	 * is completed by one engine, it may then queue a request
4186 	 * to a second via its execlists->tasklet *just* as we are
4187 	 * calling engine->resume() and also writing the ELSP.
4188 	 * Turning off the execlists->tasklet until the reset is over
4189 	 * prevents the race.
4190 	 */
4191 	__tasklet_disable_sync_once(&execlists->tasklet);
4192 	GEM_BUG_ON(!reset_in_progress(execlists));
4193 
4194 	/* And flush any current direct submission. */
4195 	spin_lock_irqsave(&engine->active.lock, flags);
4196 	spin_unlock_irqrestore(&engine->active.lock, flags);
4197 
4198 	/*
4199 	 * We stop engines, otherwise we might get failed reset and a
4200 	 * dead gpu (on elk). Also as modern gpu as kbl can suffer
4201 	 * from system hang if batchbuffer is progressing when
4202 	 * the reset is issued, regardless of READY_TO_RESET ack.
4203 	 * Thus assume it is best to stop engines on all gens
4204 	 * where we have a gpu reset.
4205 	 *
4206 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4207 	 *
4208 	 * FIXME: Wa for more modern gens needs to be validated
4209 	 */
4210 	ring_set_paused(engine, 1);
4211 	intel_engine_stop_cs(engine);
4212 
4213 	engine->execlists.reset_ccid = active_ccid(engine);
4214 }
4215 
4216 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4217 {
4218 	int x;
4219 
4220 	x = lrc_ring_mi_mode(engine);
4221 	if (x != -1) {
4222 		regs[x + 1] &= ~STOP_RING;
4223 		regs[x + 1] |= STOP_RING << 16;
4224 	}
4225 }
4226 
4227 static void __execlists_reset_reg_state(const struct intel_context *ce,
4228 					const struct intel_engine_cs *engine)
4229 {
4230 	u32 *regs = ce->lrc_reg_state;
4231 
4232 	__reset_stop_ring(regs, engine);
4233 }
4234 
4235 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4236 {
4237 	struct intel_engine_execlists * const execlists = &engine->execlists;
4238 	struct intel_context *ce;
4239 	struct i915_request *rq;
4240 	u32 head;
4241 
4242 	mb(); /* paranoia: read the CSB pointers from after the reset */
4243 	clflush(execlists->csb_write);
4244 	mb();
4245 
4246 	process_csb(engine); /* drain preemption events */
4247 
4248 	/* Following the reset, we need to reload the CSB read/write pointers */
4249 	reset_csb_pointers(engine);
4250 
4251 	/*
4252 	 * Save the currently executing context, even if we completed
4253 	 * its request, it was still running at the time of the
4254 	 * reset and will have been clobbered.
4255 	 */
4256 	rq = active_context(engine, engine->execlists.reset_ccid);
4257 	if (!rq)
4258 		goto unwind;
4259 
4260 	ce = rq->context;
4261 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4262 
4263 	if (i915_request_completed(rq)) {
4264 		/* Idle context; tidy up the ring so we can restart afresh */
4265 		head = intel_ring_wrap(ce->ring, rq->tail);
4266 		goto out_replay;
4267 	}
4268 
4269 	/* We still have requests in-flight; the engine should be active */
4270 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4271 
4272 	/* Context has requests still in-flight; it should not be idle! */
4273 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4274 
4275 	rq = active_request(ce->timeline, rq);
4276 	head = intel_ring_wrap(ce->ring, rq->head);
4277 	GEM_BUG_ON(head == ce->ring->tail);
4278 
4279 	/*
4280 	 * If this request hasn't started yet, e.g. it is waiting on a
4281 	 * semaphore, we need to avoid skipping the request or else we
4282 	 * break the signaling chain. However, if the context is corrupt
4283 	 * the request will not restart and we will be stuck with a wedged
4284 	 * device. It is quite often the case that if we issue a reset
4285 	 * while the GPU is loading the context image, that the context
4286 	 * image becomes corrupt.
4287 	 *
4288 	 * Otherwise, if we have not started yet, the request should replay
4289 	 * perfectly and we do not need to flag the result as being erroneous.
4290 	 */
4291 	if (!i915_request_started(rq))
4292 		goto out_replay;
4293 
4294 	/*
4295 	 * If the request was innocent, we leave the request in the ELSP
4296 	 * and will try to replay it on restarting. The context image may
4297 	 * have been corrupted by the reset, in which case we may have
4298 	 * to service a new GPU hang, but more likely we can continue on
4299 	 * without impact.
4300 	 *
4301 	 * If the request was guilty, we presume the context is corrupt
4302 	 * and have to at least restore the RING register in the context
4303 	 * image back to the expected values to skip over the guilty request.
4304 	 */
4305 	__i915_request_reset(rq, stalled);
4306 
4307 	/*
4308 	 * We want a simple context + ring to execute the breadcrumb update.
4309 	 * We cannot rely on the context being intact across the GPU hang,
4310 	 * so clear it and rebuild just what we need for the breadcrumb.
4311 	 * All pending requests for this context will be zapped, and any
4312 	 * future request will be after userspace has had the opportunity
4313 	 * to recreate its own state.
4314 	 */
4315 out_replay:
4316 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4317 		     head, ce->ring->tail);
4318 	__execlists_reset_reg_state(ce, engine);
4319 	__execlists_update_reg_state(ce, engine, head);
4320 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4321 
4322 unwind:
4323 	/* Push back any incomplete requests for replay after the reset. */
4324 	cancel_port_requests(execlists);
4325 	__unwind_incomplete_requests(engine);
4326 }
4327 
4328 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4329 {
4330 	unsigned long flags;
4331 
4332 	ENGINE_TRACE(engine, "\n");
4333 
4334 	spin_lock_irqsave(&engine->active.lock, flags);
4335 
4336 	__execlists_reset(engine, stalled);
4337 
4338 	spin_unlock_irqrestore(&engine->active.lock, flags);
4339 }
4340 
4341 static void nop_submission_tasklet(unsigned long data)
4342 {
4343 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4344 
4345 	/* The driver is wedged; don't process any more events. */
4346 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4347 }
4348 
4349 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4350 {
4351 	struct intel_engine_execlists * const execlists = &engine->execlists;
4352 	struct i915_request *rq, *rn;
4353 	struct rb_node *rb;
4354 	unsigned long flags;
4355 
4356 	ENGINE_TRACE(engine, "\n");
4357 
4358 	/*
4359 	 * Before we call engine->cancel_requests(), we should have exclusive
4360 	 * access to the submission state. This is arranged for us by the
4361 	 * caller disabling the interrupt generation, the tasklet and other
4362 	 * threads that may then access the same state, giving us a free hand
4363 	 * to reset state. However, we still need to let lockdep be aware that
4364 	 * we know this state may be accessed in hardirq context, so we
4365 	 * disable the irq around this manipulation and we want to keep
4366 	 * the spinlock focused on its duties and not accidentally conflate
4367 	 * coverage to the submission's irq state. (Similarly, although we
4368 	 * shouldn't need to disable irq around the manipulation of the
4369 	 * submission's irq state, we also wish to remind ourselves that
4370 	 * it is irq state.)
4371 	 */
4372 	spin_lock_irqsave(&engine->active.lock, flags);
4373 
4374 	__execlists_reset(engine, true);
4375 
4376 	/* Mark all executing requests as skipped. */
4377 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4378 		mark_eio(rq);
4379 
4380 	/* Flush the queued requests to the timeline list (for retiring). */
4381 	while ((rb = rb_first_cached(&execlists->queue))) {
4382 		struct i915_priolist *p = to_priolist(rb);
4383 		int i;
4384 
4385 		priolist_for_each_request_consume(rq, rn, p, i) {
4386 			mark_eio(rq);
4387 			__i915_request_submit(rq);
4388 		}
4389 
4390 		rb_erase_cached(&p->node, &execlists->queue);
4391 		i915_priolist_free(p);
4392 	}
4393 
4394 	/* On-hold requests will be flushed to timeline upon their release */
4395 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4396 		mark_eio(rq);
4397 
4398 	/* Cancel all attached virtual engines */
4399 	while ((rb = rb_first_cached(&execlists->virtual))) {
4400 		struct virtual_engine *ve =
4401 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4402 
4403 		rb_erase_cached(rb, &execlists->virtual);
4404 		RB_CLEAR_NODE(rb);
4405 
4406 		spin_lock(&ve->base.active.lock);
4407 		rq = fetch_and_zero(&ve->request);
4408 		if (rq) {
4409 			mark_eio(rq);
4410 
4411 			rq->engine = engine;
4412 			__i915_request_submit(rq);
4413 			i915_request_put(rq);
4414 
4415 			ve->base.execlists.queue_priority_hint = INT_MIN;
4416 		}
4417 		spin_unlock(&ve->base.active.lock);
4418 	}
4419 
4420 	/* Remaining _unready_ requests will be nop'ed when submitted */
4421 
4422 	execlists->queue_priority_hint = INT_MIN;
4423 	execlists->queue = RB_ROOT_CACHED;
4424 
4425 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4426 	execlists->tasklet.func = nop_submission_tasklet;
4427 
4428 	spin_unlock_irqrestore(&engine->active.lock, flags);
4429 }
4430 
4431 static void execlists_reset_finish(struct intel_engine_cs *engine)
4432 {
4433 	struct intel_engine_execlists * const execlists = &engine->execlists;
4434 
4435 	/*
4436 	 * After a GPU reset, we may have requests to replay. Do so now while
4437 	 * we still have the forcewake to be sure that the GPU is not allowed
4438 	 * to sleep before we restart and reload a context.
4439 	 */
4440 	GEM_BUG_ON(!reset_in_progress(execlists));
4441 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4442 		execlists->tasklet.func(execlists->tasklet.data);
4443 
4444 	if (__tasklet_enable(&execlists->tasklet))
4445 		/* And kick in case we missed a new request submission. */
4446 		tasklet_hi_schedule(&execlists->tasklet);
4447 	ENGINE_TRACE(engine, "depth->%d\n",
4448 		     atomic_read(&execlists->tasklet.count));
4449 }
4450 
4451 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4452 				    u64 offset, u32 len,
4453 				    const unsigned int flags)
4454 {
4455 	u32 *cs;
4456 
4457 	cs = intel_ring_begin(rq, 4);
4458 	if (IS_ERR(cs))
4459 		return PTR_ERR(cs);
4460 
4461 	/*
4462 	 * WaDisableCtxRestoreArbitration:bdw,chv
4463 	 *
4464 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4465 	 * particular all the gen that do not need the w/a at all!), if we
4466 	 * took care to make sure that on every switch into this context
4467 	 * (both ordinary and for preemption) that arbitrartion was enabled
4468 	 * we would be fine.  However, for gen8 there is another w/a that
4469 	 * requires us to not preempt inside GPGPU execution, so we keep
4470 	 * arbitration disabled for gen8 batches. Arbitration will be
4471 	 * re-enabled before we close the request
4472 	 * (engine->emit_fini_breadcrumb).
4473 	 */
4474 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4475 
4476 	/* FIXME(BDW+): Address space and security selectors. */
4477 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4478 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4479 	*cs++ = lower_32_bits(offset);
4480 	*cs++ = upper_32_bits(offset);
4481 
4482 	intel_ring_advance(rq, cs);
4483 
4484 	return 0;
4485 }
4486 
4487 static int gen8_emit_bb_start(struct i915_request *rq,
4488 			      u64 offset, u32 len,
4489 			      const unsigned int flags)
4490 {
4491 	u32 *cs;
4492 
4493 	cs = intel_ring_begin(rq, 6);
4494 	if (IS_ERR(cs))
4495 		return PTR_ERR(cs);
4496 
4497 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4498 
4499 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4500 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4501 	*cs++ = lower_32_bits(offset);
4502 	*cs++ = upper_32_bits(offset);
4503 
4504 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4505 	*cs++ = MI_NOOP;
4506 
4507 	intel_ring_advance(rq, cs);
4508 
4509 	return 0;
4510 }
4511 
4512 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4513 {
4514 	ENGINE_WRITE(engine, RING_IMR,
4515 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4516 	ENGINE_POSTING_READ(engine, RING_IMR);
4517 }
4518 
4519 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4520 {
4521 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4522 }
4523 
4524 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4525 {
4526 	u32 cmd, *cs;
4527 
4528 	cs = intel_ring_begin(request, 4);
4529 	if (IS_ERR(cs))
4530 		return PTR_ERR(cs);
4531 
4532 	cmd = MI_FLUSH_DW + 1;
4533 
4534 	/* We always require a command barrier so that subsequent
4535 	 * commands, such as breadcrumb interrupts, are strictly ordered
4536 	 * wrt the contents of the write cache being flushed to memory
4537 	 * (and thus being coherent from the CPU).
4538 	 */
4539 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4540 
4541 	if (mode & EMIT_INVALIDATE) {
4542 		cmd |= MI_INVALIDATE_TLB;
4543 		if (request->engine->class == VIDEO_DECODE_CLASS)
4544 			cmd |= MI_INVALIDATE_BSD;
4545 	}
4546 
4547 	*cs++ = cmd;
4548 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4549 	*cs++ = 0; /* upper addr */
4550 	*cs++ = 0; /* value */
4551 	intel_ring_advance(request, cs);
4552 
4553 	return 0;
4554 }
4555 
4556 static int gen8_emit_flush_render(struct i915_request *request,
4557 				  u32 mode)
4558 {
4559 	bool vf_flush_wa = false, dc_flush_wa = false;
4560 	u32 *cs, flags = 0;
4561 	int len;
4562 
4563 	flags |= PIPE_CONTROL_CS_STALL;
4564 
4565 	if (mode & EMIT_FLUSH) {
4566 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4567 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4568 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4569 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4570 	}
4571 
4572 	if (mode & EMIT_INVALIDATE) {
4573 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4574 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4575 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4576 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4577 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4578 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4579 		flags |= PIPE_CONTROL_QW_WRITE;
4580 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4581 
4582 		/*
4583 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4584 		 * pipe control.
4585 		 */
4586 		if (IS_GEN(request->engine->i915, 9))
4587 			vf_flush_wa = true;
4588 
4589 		/* WaForGAMHang:kbl */
4590 		if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4591 			dc_flush_wa = true;
4592 	}
4593 
4594 	len = 6;
4595 
4596 	if (vf_flush_wa)
4597 		len += 6;
4598 
4599 	if (dc_flush_wa)
4600 		len += 12;
4601 
4602 	cs = intel_ring_begin(request, len);
4603 	if (IS_ERR(cs))
4604 		return PTR_ERR(cs);
4605 
4606 	if (vf_flush_wa)
4607 		cs = gen8_emit_pipe_control(cs, 0, 0);
4608 
4609 	if (dc_flush_wa)
4610 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4611 					    0);
4612 
4613 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4614 
4615 	if (dc_flush_wa)
4616 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4617 
4618 	intel_ring_advance(request, cs);
4619 
4620 	return 0;
4621 }
4622 
4623 static int gen11_emit_flush_render(struct i915_request *request,
4624 				   u32 mode)
4625 {
4626 	if (mode & EMIT_FLUSH) {
4627 		u32 *cs;
4628 		u32 flags = 0;
4629 
4630 		flags |= PIPE_CONTROL_CS_STALL;
4631 
4632 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4633 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4634 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4635 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4636 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4637 		flags |= PIPE_CONTROL_QW_WRITE;
4638 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4639 
4640 		cs = intel_ring_begin(request, 6);
4641 		if (IS_ERR(cs))
4642 			return PTR_ERR(cs);
4643 
4644 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4645 		intel_ring_advance(request, cs);
4646 	}
4647 
4648 	if (mode & EMIT_INVALIDATE) {
4649 		u32 *cs;
4650 		u32 flags = 0;
4651 
4652 		flags |= PIPE_CONTROL_CS_STALL;
4653 
4654 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4655 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4656 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4657 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4658 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4659 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4660 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4661 		flags |= PIPE_CONTROL_QW_WRITE;
4662 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4663 
4664 		cs = intel_ring_begin(request, 6);
4665 		if (IS_ERR(cs))
4666 			return PTR_ERR(cs);
4667 
4668 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4669 		intel_ring_advance(request, cs);
4670 	}
4671 
4672 	return 0;
4673 }
4674 
4675 static u32 preparser_disable(bool state)
4676 {
4677 	return MI_ARB_CHECK | 1 << 8 | state;
4678 }
4679 
4680 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4681 {
4682 	static const i915_reg_t vd[] = {
4683 		GEN12_VD0_AUX_NV,
4684 		GEN12_VD1_AUX_NV,
4685 		GEN12_VD2_AUX_NV,
4686 		GEN12_VD3_AUX_NV,
4687 	};
4688 
4689 	static const i915_reg_t ve[] = {
4690 		GEN12_VE0_AUX_NV,
4691 		GEN12_VE1_AUX_NV,
4692 	};
4693 
4694 	if (engine->class == VIDEO_DECODE_CLASS)
4695 		return vd[engine->instance];
4696 
4697 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4698 		return ve[engine->instance];
4699 
4700 	GEM_BUG_ON("unknown aux_inv_reg\n");
4701 
4702 	return INVALID_MMIO_REG;
4703 }
4704 
4705 static u32 *
4706 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4707 {
4708 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4709 	*cs++ = i915_mmio_reg_offset(inv_reg);
4710 	*cs++ = AUX_INV;
4711 	*cs++ = MI_NOOP;
4712 
4713 	return cs;
4714 }
4715 
4716 static int gen12_emit_flush_render(struct i915_request *request,
4717 				   u32 mode)
4718 {
4719 	if (mode & EMIT_FLUSH) {
4720 		u32 flags = 0;
4721 		u32 *cs;
4722 
4723 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4724 		flags |= PIPE_CONTROL_FLUSH_L3;
4725 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4726 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4727 		/* Wa_1409600907:tgl */
4728 		flags |= PIPE_CONTROL_DEPTH_STALL;
4729 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4730 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4731 
4732 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4733 		flags |= PIPE_CONTROL_QW_WRITE;
4734 
4735 		flags |= PIPE_CONTROL_CS_STALL;
4736 
4737 		cs = intel_ring_begin(request, 6);
4738 		if (IS_ERR(cs))
4739 			return PTR_ERR(cs);
4740 
4741 		cs = gen12_emit_pipe_control(cs,
4742 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4743 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4744 		intel_ring_advance(request, cs);
4745 	}
4746 
4747 	if (mode & EMIT_INVALIDATE) {
4748 		u32 flags = 0;
4749 		u32 *cs;
4750 
4751 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4752 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4753 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4754 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4755 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4756 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4757 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4758 
4759 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4760 		flags |= PIPE_CONTROL_QW_WRITE;
4761 
4762 		flags |= PIPE_CONTROL_CS_STALL;
4763 
4764 		cs = intel_ring_begin(request, 8 + 4);
4765 		if (IS_ERR(cs))
4766 			return PTR_ERR(cs);
4767 
4768 		/*
4769 		 * Prevent the pre-parser from skipping past the TLB
4770 		 * invalidate and loading a stale page for the batch
4771 		 * buffer / request payload.
4772 		 */
4773 		*cs++ = preparser_disable(true);
4774 
4775 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4776 
4777 		/* hsdes: 1809175790 */
4778 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4779 
4780 		*cs++ = preparser_disable(false);
4781 		intel_ring_advance(request, cs);
4782 	}
4783 
4784 	return 0;
4785 }
4786 
4787 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4788 {
4789 	intel_engine_mask_t aux_inv = 0;
4790 	u32 cmd, *cs;
4791 
4792 	cmd = 4;
4793 	if (mode & EMIT_INVALIDATE)
4794 		cmd += 2;
4795 	if (mode & EMIT_INVALIDATE)
4796 		aux_inv = request->engine->mask & ~BIT(BCS0);
4797 	if (aux_inv)
4798 		cmd += 2 * hweight8(aux_inv) + 2;
4799 
4800 	cs = intel_ring_begin(request, cmd);
4801 	if (IS_ERR(cs))
4802 		return PTR_ERR(cs);
4803 
4804 	if (mode & EMIT_INVALIDATE)
4805 		*cs++ = preparser_disable(true);
4806 
4807 	cmd = MI_FLUSH_DW + 1;
4808 
4809 	/* We always require a command barrier so that subsequent
4810 	 * commands, such as breadcrumb interrupts, are strictly ordered
4811 	 * wrt the contents of the write cache being flushed to memory
4812 	 * (and thus being coherent from the CPU).
4813 	 */
4814 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4815 
4816 	if (mode & EMIT_INVALIDATE) {
4817 		cmd |= MI_INVALIDATE_TLB;
4818 		if (request->engine->class == VIDEO_DECODE_CLASS)
4819 			cmd |= MI_INVALIDATE_BSD;
4820 	}
4821 
4822 	*cs++ = cmd;
4823 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4824 	*cs++ = 0; /* upper addr */
4825 	*cs++ = 0; /* value */
4826 
4827 	if (aux_inv) { /* hsdes: 1809175790 */
4828 		struct intel_engine_cs *engine;
4829 		unsigned int tmp;
4830 
4831 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4832 		for_each_engine_masked(engine, request->engine->gt,
4833 				       aux_inv, tmp) {
4834 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4835 			*cs++ = AUX_INV;
4836 		}
4837 		*cs++ = MI_NOOP;
4838 	}
4839 
4840 	if (mode & EMIT_INVALIDATE)
4841 		*cs++ = preparser_disable(false);
4842 
4843 	intel_ring_advance(request, cs);
4844 
4845 	return 0;
4846 }
4847 
4848 static void assert_request_valid(struct i915_request *rq)
4849 {
4850 	struct intel_ring *ring __maybe_unused = rq->ring;
4851 
4852 	/* Can we unwind this request without appearing to go forwards? */
4853 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4854 }
4855 
4856 /*
4857  * Reserve space for 2 NOOPs at the end of each request to be
4858  * used as a workaround for not being allowed to do lite
4859  * restore with HEAD==TAIL (WaIdleLiteRestore).
4860  */
4861 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4862 {
4863 	/* Ensure there's always at least one preemption point per-request. */
4864 	*cs++ = MI_ARB_CHECK;
4865 	*cs++ = MI_NOOP;
4866 	request->wa_tail = intel_ring_offset(request, cs);
4867 
4868 	/* Check that entire request is less than half the ring */
4869 	assert_request_valid(request);
4870 
4871 	return cs;
4872 }
4873 
4874 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4875 {
4876 	*cs++ = MI_SEMAPHORE_WAIT |
4877 		MI_SEMAPHORE_GLOBAL_GTT |
4878 		MI_SEMAPHORE_POLL |
4879 		MI_SEMAPHORE_SAD_EQ_SDD;
4880 	*cs++ = 0;
4881 	*cs++ = intel_hws_preempt_address(request->engine);
4882 	*cs++ = 0;
4883 
4884 	return cs;
4885 }
4886 
4887 static __always_inline u32*
4888 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4889 {
4890 	*cs++ = MI_USER_INTERRUPT;
4891 
4892 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4893 	if (intel_engine_has_semaphores(request->engine))
4894 		cs = emit_preempt_busywait(request, cs);
4895 
4896 	request->tail = intel_ring_offset(request, cs);
4897 	assert_ring_tail_valid(request->ring, request->tail);
4898 
4899 	return gen8_emit_wa_tail(request, cs);
4900 }
4901 
4902 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4903 {
4904 	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4905 }
4906 
4907 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4908 {
4909 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4910 }
4911 
4912 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4913 {
4914 	cs = gen8_emit_pipe_control(cs,
4915 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4916 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4917 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4918 				    0);
4919 
4920 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4921 	cs = gen8_emit_ggtt_write_rcs(cs,
4922 				      request->fence.seqno,
4923 				      hwsp_offset(request),
4924 				      PIPE_CONTROL_FLUSH_ENABLE |
4925 				      PIPE_CONTROL_CS_STALL);
4926 
4927 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4928 }
4929 
4930 static u32 *
4931 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4932 {
4933 	cs = gen8_emit_ggtt_write_rcs(cs,
4934 				      request->fence.seqno,
4935 				      hwsp_offset(request),
4936 				      PIPE_CONTROL_CS_STALL |
4937 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4938 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4939 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4940 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4941 				      PIPE_CONTROL_FLUSH_ENABLE);
4942 
4943 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4944 }
4945 
4946 /*
4947  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4948  * flush and will continue pre-fetching the instructions after it before the
4949  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4950  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4951  * of the next request before the memory has been flushed, we're guaranteed that
4952  * we won't access the batch itself too early.
4953  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4954  * so, if the current request is modifying an instruction in the next request on
4955  * the same intel_context, we might pre-fetch and then execute the pre-update
4956  * instruction. To avoid this, the users of self-modifying code should either
4957  * disable the parser around the code emitting the memory writes, via a new flag
4958  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4959  * the in-kernel use-cases we've opted to use a separate context, see
4960  * reloc_gpu() as an example.
4961  * All the above applies only to the instructions themselves. Non-inline data
4962  * used by the instructions is not pre-fetched.
4963  */
4964 
4965 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4966 {
4967 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4968 		MI_SEMAPHORE_GLOBAL_GTT |
4969 		MI_SEMAPHORE_POLL |
4970 		MI_SEMAPHORE_SAD_EQ_SDD;
4971 	*cs++ = 0;
4972 	*cs++ = intel_hws_preempt_address(request->engine);
4973 	*cs++ = 0;
4974 	*cs++ = 0;
4975 	*cs++ = MI_NOOP;
4976 
4977 	return cs;
4978 }
4979 
4980 static __always_inline u32*
4981 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4982 {
4983 	*cs++ = MI_USER_INTERRUPT;
4984 
4985 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4986 	if (intel_engine_has_semaphores(request->engine))
4987 		cs = gen12_emit_preempt_busywait(request, cs);
4988 
4989 	request->tail = intel_ring_offset(request, cs);
4990 	assert_ring_tail_valid(request->ring, request->tail);
4991 
4992 	return gen8_emit_wa_tail(request, cs);
4993 }
4994 
4995 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4996 {
4997 	/* XXX Stalling flush before seqno write; post-sync not */
4998 	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
4999 	return gen12_emit_fini_breadcrumb_tail(rq, cs);
5000 }
5001 
5002 static u32 *
5003 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5004 {
5005 	cs = gen12_emit_ggtt_write_rcs(cs,
5006 				       request->fence.seqno,
5007 				       hwsp_offset(request),
5008 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5009 				       PIPE_CONTROL_CS_STALL |
5010 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
5011 				       PIPE_CONTROL_FLUSH_L3 |
5012 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5013 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5014 				       /* Wa_1409600907:tgl */
5015 				       PIPE_CONTROL_DEPTH_STALL |
5016 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
5017 				       PIPE_CONTROL_FLUSH_ENABLE);
5018 
5019 	return gen12_emit_fini_breadcrumb_tail(request, cs);
5020 }
5021 
5022 static void execlists_park(struct intel_engine_cs *engine)
5023 {
5024 	cancel_timer(&engine->execlists.timer);
5025 	cancel_timer(&engine->execlists.preempt);
5026 }
5027 
5028 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5029 {
5030 	engine->submit_request = execlists_submit_request;
5031 	engine->schedule = i915_schedule;
5032 	engine->execlists.tasklet.func = execlists_submission_tasklet;
5033 
5034 	engine->reset.prepare = execlists_reset_prepare;
5035 	engine->reset.rewind = execlists_reset_rewind;
5036 	engine->reset.cancel = execlists_reset_cancel;
5037 	engine->reset.finish = execlists_reset_finish;
5038 
5039 	engine->park = execlists_park;
5040 	engine->unpark = NULL;
5041 
5042 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5043 	if (!intel_vgpu_active(engine->i915)) {
5044 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5045 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5046 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5047 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5048 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5049 		}
5050 	}
5051 
5052 	if (INTEL_GEN(engine->i915) >= 12)
5053 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5054 
5055 	if (intel_engine_has_preemption(engine))
5056 		engine->emit_bb_start = gen8_emit_bb_start;
5057 	else
5058 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5059 }
5060 
5061 static void execlists_shutdown(struct intel_engine_cs *engine)
5062 {
5063 	/* Synchronise with residual timers and any softirq they raise */
5064 	del_timer_sync(&engine->execlists.timer);
5065 	del_timer_sync(&engine->execlists.preempt);
5066 	tasklet_kill(&engine->execlists.tasklet);
5067 }
5068 
5069 static void execlists_release(struct intel_engine_cs *engine)
5070 {
5071 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5072 
5073 	execlists_shutdown(engine);
5074 
5075 	intel_engine_cleanup_common(engine);
5076 	lrc_destroy_wa_ctx(engine);
5077 }
5078 
5079 static void
5080 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5081 {
5082 	/* Default vfuncs which can be overriden by each engine. */
5083 
5084 	engine->resume = execlists_resume;
5085 
5086 	engine->cops = &execlists_context_ops;
5087 	engine->request_alloc = execlists_request_alloc;
5088 
5089 	engine->emit_flush = gen8_emit_flush;
5090 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5091 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5092 	if (INTEL_GEN(engine->i915) >= 12) {
5093 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5094 		engine->emit_flush = gen12_emit_flush;
5095 	}
5096 	engine->set_default_submission = intel_execlists_set_default_submission;
5097 
5098 	if (INTEL_GEN(engine->i915) < 11) {
5099 		engine->irq_enable = gen8_logical_ring_enable_irq;
5100 		engine->irq_disable = gen8_logical_ring_disable_irq;
5101 	} else {
5102 		/*
5103 		 * TODO: On Gen11 interrupt masks need to be clear
5104 		 * to allow C6 entry. Keep interrupts enabled at
5105 		 * and take the hit of generating extra interrupts
5106 		 * until a more refined solution exists.
5107 		 */
5108 	}
5109 }
5110 
5111 static inline void
5112 logical_ring_default_irqs(struct intel_engine_cs *engine)
5113 {
5114 	unsigned int shift = 0;
5115 
5116 	if (INTEL_GEN(engine->i915) < 11) {
5117 		const u8 irq_shifts[] = {
5118 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5119 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5120 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5121 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5122 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5123 		};
5124 
5125 		shift = irq_shifts[engine->id];
5126 	}
5127 
5128 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5129 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5130 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5131 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5132 }
5133 
5134 static void rcs_submission_override(struct intel_engine_cs *engine)
5135 {
5136 	switch (INTEL_GEN(engine->i915)) {
5137 	case 12:
5138 		engine->emit_flush = gen12_emit_flush_render;
5139 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5140 		break;
5141 	case 11:
5142 		engine->emit_flush = gen11_emit_flush_render;
5143 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5144 		break;
5145 	default:
5146 		engine->emit_flush = gen8_emit_flush_render;
5147 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5148 		break;
5149 	}
5150 }
5151 
5152 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5153 {
5154 	struct intel_engine_execlists * const execlists = &engine->execlists;
5155 	struct drm_i915_private *i915 = engine->i915;
5156 	struct intel_uncore *uncore = engine->uncore;
5157 	u32 base = engine->mmio_base;
5158 
5159 	tasklet_init(&engine->execlists.tasklet,
5160 		     execlists_submission_tasklet, (unsigned long)engine);
5161 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5162 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5163 
5164 	logical_ring_default_vfuncs(engine);
5165 	logical_ring_default_irqs(engine);
5166 
5167 	if (engine->class == RENDER_CLASS)
5168 		rcs_submission_override(engine);
5169 
5170 	if (intel_init_workaround_bb(engine))
5171 		/*
5172 		 * We continue even if we fail to initialize WA batch
5173 		 * because we only expect rare glitches but nothing
5174 		 * critical to prevent us from using GPU
5175 		 */
5176 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5177 
5178 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5179 		execlists->submit_reg = uncore->regs +
5180 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5181 		execlists->ctrl_reg = uncore->regs +
5182 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5183 	} else {
5184 		execlists->submit_reg = uncore->regs +
5185 			i915_mmio_reg_offset(RING_ELSP(base));
5186 	}
5187 
5188 	execlists->csb_status =
5189 		(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5190 
5191 	execlists->csb_write =
5192 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5193 
5194 	if (INTEL_GEN(i915) < 11)
5195 		execlists->csb_size = GEN8_CSB_ENTRIES;
5196 	else
5197 		execlists->csb_size = GEN11_CSB_ENTRIES;
5198 
5199 	if (INTEL_GEN(engine->i915) >= 11) {
5200 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5201 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5202 	}
5203 
5204 	/* Finally, take ownership and responsibility for cleanup! */
5205 	engine->sanitize = execlists_sanitize;
5206 	engine->release = execlists_release;
5207 
5208 	return 0;
5209 }
5210 
5211 static void init_common_reg_state(u32 * const regs,
5212 				  const struct intel_engine_cs *engine,
5213 				  const struct intel_ring *ring,
5214 				  bool inhibit)
5215 {
5216 	u32 ctl;
5217 
5218 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5219 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5220 	if (inhibit)
5221 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5222 	if (INTEL_GEN(engine->i915) < 11)
5223 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5224 					   CTX_CTRL_RS_CTX_ENABLE);
5225 	regs[CTX_CONTEXT_CONTROL] = ctl;
5226 
5227 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5228 	regs[CTX_TIMESTAMP] = 0;
5229 }
5230 
5231 static void init_wa_bb_reg_state(u32 * const regs,
5232 				 const struct intel_engine_cs *engine)
5233 {
5234 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5235 
5236 	if (wa_ctx->per_ctx.size) {
5237 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5238 
5239 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5240 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5241 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5242 	}
5243 
5244 	if (wa_ctx->indirect_ctx.size) {
5245 		lrc_ring_setup_indirect_ctx(regs, engine,
5246 					    i915_ggtt_offset(wa_ctx->vma) +
5247 					    wa_ctx->indirect_ctx.offset,
5248 					    wa_ctx->indirect_ctx.size);
5249 	}
5250 }
5251 
5252 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5253 {
5254 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5255 		/* 64b PPGTT (48bit canonical)
5256 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
5257 		 * other PDP Descriptors are ignored.
5258 		 */
5259 		ASSIGN_CTX_PML4(ppgtt, regs);
5260 	} else {
5261 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5262 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5263 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5264 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5265 	}
5266 }
5267 
5268 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5269 {
5270 	if (i915_is_ggtt(vm))
5271 		return i915_vm_to_ggtt(vm)->alias;
5272 	else
5273 		return i915_vm_to_ppgtt(vm);
5274 }
5275 
5276 static void execlists_init_reg_state(u32 *regs,
5277 				     const struct intel_context *ce,
5278 				     const struct intel_engine_cs *engine,
5279 				     const struct intel_ring *ring,
5280 				     bool inhibit)
5281 {
5282 	/*
5283 	 * A context is actually a big batch buffer with several
5284 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5285 	 * values we are setting here are only for the first context restore:
5286 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5287 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5288 	 * we are not initializing here).
5289 	 *
5290 	 * Must keep consistent with virtual_update_register_offsets().
5291 	 */
5292 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5293 
5294 	init_common_reg_state(regs, engine, ring, inhibit);
5295 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5296 
5297 	init_wa_bb_reg_state(regs, engine);
5298 
5299 	__reset_stop_ring(regs, engine);
5300 }
5301 
5302 static int
5303 populate_lr_context(struct intel_context *ce,
5304 		    struct drm_i915_gem_object *ctx_obj,
5305 		    struct intel_engine_cs *engine,
5306 		    struct intel_ring *ring)
5307 {
5308 	bool inhibit = true;
5309 	void *vaddr;
5310 
5311 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5312 	if (IS_ERR(vaddr)) {
5313 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5314 		return PTR_ERR(vaddr);
5315 	}
5316 
5317 	set_redzone(vaddr, engine);
5318 
5319 	if (engine->default_state) {
5320 		shmem_read(engine->default_state, 0,
5321 			   vaddr, engine->context_size);
5322 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5323 		inhibit = false;
5324 	}
5325 
5326 	/* Clear the ppHWSP (inc. per-context counters) */
5327 	memset(vaddr, 0, PAGE_SIZE);
5328 
5329 	/*
5330 	 * The second page of the context object contains some registers which
5331 	 * must be set up prior to the first execution.
5332 	 */
5333 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5334 				 ce, engine, ring, inhibit);
5335 
5336 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5337 	i915_gem_object_unpin_map(ctx_obj);
5338 	return 0;
5339 }
5340 
5341 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5342 {
5343 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5344 
5345 	return intel_timeline_create_from_engine(ce->engine,
5346 						 page_unmask_bits(tl));
5347 }
5348 
5349 static int __execlists_context_alloc(struct intel_context *ce,
5350 				     struct intel_engine_cs *engine)
5351 {
5352 	struct drm_i915_gem_object *ctx_obj;
5353 	struct intel_ring *ring;
5354 	struct i915_vma *vma;
5355 	u32 context_size;
5356 	int ret;
5357 
5358 	GEM_BUG_ON(ce->state);
5359 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5360 
5361 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5362 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5363 
5364 	if (INTEL_GEN(engine->i915) == 12) {
5365 		ce->wa_bb_page = context_size / PAGE_SIZE;
5366 		context_size += PAGE_SIZE;
5367 	}
5368 
5369 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5370 	if (IS_ERR(ctx_obj))
5371 		return PTR_ERR(ctx_obj);
5372 
5373 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5374 	if (IS_ERR(vma)) {
5375 		ret = PTR_ERR(vma);
5376 		goto error_deref_obj;
5377 	}
5378 
5379 	if (!page_mask_bits(ce->timeline)) {
5380 		struct intel_timeline *tl;
5381 
5382 		/*
5383 		 * Use the static global HWSP for the kernel context, and
5384 		 * a dynamically allocated cacheline for everyone else.
5385 		 */
5386 		if (unlikely(ce->timeline))
5387 			tl = pinned_timeline(ce);
5388 		else
5389 			tl = intel_timeline_create(engine->gt);
5390 		if (IS_ERR(tl)) {
5391 			ret = PTR_ERR(tl);
5392 			goto error_deref_obj;
5393 		}
5394 
5395 		ce->timeline = tl;
5396 	}
5397 
5398 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5399 	if (IS_ERR(ring)) {
5400 		ret = PTR_ERR(ring);
5401 		goto error_deref_obj;
5402 	}
5403 
5404 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5405 	if (ret) {
5406 		drm_dbg(&engine->i915->drm,
5407 			"Failed to populate LRC: %d\n", ret);
5408 		goto error_ring_free;
5409 	}
5410 
5411 	ce->ring = ring;
5412 	ce->state = vma;
5413 
5414 	return 0;
5415 
5416 error_ring_free:
5417 	intel_ring_put(ring);
5418 error_deref_obj:
5419 	i915_gem_object_put(ctx_obj);
5420 	return ret;
5421 }
5422 
5423 static struct list_head *virtual_queue(struct virtual_engine *ve)
5424 {
5425 	return &ve->base.execlists.default_priolist.requests[0];
5426 }
5427 
5428 static void virtual_context_destroy(struct kref *kref)
5429 {
5430 	struct virtual_engine *ve =
5431 		container_of(kref, typeof(*ve), context.ref);
5432 	unsigned int n;
5433 
5434 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5435 	GEM_BUG_ON(ve->request);
5436 	GEM_BUG_ON(ve->context.inflight);
5437 
5438 	for (n = 0; n < ve->num_siblings; n++) {
5439 		struct intel_engine_cs *sibling = ve->siblings[n];
5440 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5441 		unsigned long flags;
5442 
5443 		if (RB_EMPTY_NODE(node))
5444 			continue;
5445 
5446 		spin_lock_irqsave(&sibling->active.lock, flags);
5447 
5448 		/* Detachment is lazily performed in the execlists tasklet */
5449 		if (!RB_EMPTY_NODE(node))
5450 			rb_erase_cached(node, &sibling->execlists.virtual);
5451 
5452 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5453 	}
5454 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5455 
5456 	if (ve->context.state)
5457 		__execlists_context_fini(&ve->context);
5458 	intel_context_fini(&ve->context);
5459 
5460 	intel_engine_free_request_pool(&ve->base);
5461 
5462 	kfree(ve->bonds);
5463 	kfree(ve);
5464 }
5465 
5466 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5467 {
5468 	int swp;
5469 
5470 	/*
5471 	 * Pick a random sibling on starting to help spread the load around.
5472 	 *
5473 	 * New contexts are typically created with exactly the same order
5474 	 * of siblings, and often started in batches. Due to the way we iterate
5475 	 * the array of sibling when submitting requests, sibling[0] is
5476 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5477 	 * randomised across the system, we also help spread the load by the
5478 	 * first engine we inspect being different each time.
5479 	 *
5480 	 * NB This does not force us to execute on this engine, it will just
5481 	 * typically be the first we inspect for submission.
5482 	 */
5483 	swp = prandom_u32_max(ve->num_siblings);
5484 	if (swp)
5485 		swap(ve->siblings[swp], ve->siblings[0]);
5486 }
5487 
5488 static int virtual_context_alloc(struct intel_context *ce)
5489 {
5490 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5491 
5492 	return __execlists_context_alloc(ce, ve->siblings[0]);
5493 }
5494 
5495 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5496 {
5497 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5498 
5499 	/* Note: we must use a real engine class for setting up reg state */
5500 	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5501 }
5502 
5503 static void virtual_context_enter(struct intel_context *ce)
5504 {
5505 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5506 	unsigned int n;
5507 
5508 	for (n = 0; n < ve->num_siblings; n++)
5509 		intel_engine_pm_get(ve->siblings[n]);
5510 
5511 	intel_timeline_enter(ce->timeline);
5512 }
5513 
5514 static void virtual_context_exit(struct intel_context *ce)
5515 {
5516 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5517 	unsigned int n;
5518 
5519 	intel_timeline_exit(ce->timeline);
5520 
5521 	for (n = 0; n < ve->num_siblings; n++)
5522 		intel_engine_pm_put(ve->siblings[n]);
5523 }
5524 
5525 static const struct intel_context_ops virtual_context_ops = {
5526 	.alloc = virtual_context_alloc,
5527 
5528 	.pre_pin = execlists_context_pre_pin,
5529 	.pin = virtual_context_pin,
5530 	.unpin = execlists_context_unpin,
5531 	.post_unpin = execlists_context_post_unpin,
5532 
5533 	.enter = virtual_context_enter,
5534 	.exit = virtual_context_exit,
5535 
5536 	.destroy = virtual_context_destroy,
5537 };
5538 
5539 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5540 {
5541 	struct i915_request *rq;
5542 	intel_engine_mask_t mask;
5543 
5544 	rq = READ_ONCE(ve->request);
5545 	if (!rq)
5546 		return 0;
5547 
5548 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5549 	mask = rq->execution_mask;
5550 	if (unlikely(!mask)) {
5551 		/* Invalid selection, submit to a random engine in error */
5552 		i915_request_set_error_once(rq, -ENODEV);
5553 		mask = ve->siblings[0]->mask;
5554 	}
5555 
5556 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5557 		     rq->fence.context, rq->fence.seqno,
5558 		     mask, ve->base.execlists.queue_priority_hint);
5559 
5560 	return mask;
5561 }
5562 
5563 static void virtual_submission_tasklet(unsigned long data)
5564 {
5565 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5566 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5567 	intel_engine_mask_t mask;
5568 	unsigned int n;
5569 
5570 	rcu_read_lock();
5571 	mask = virtual_submission_mask(ve);
5572 	rcu_read_unlock();
5573 	if (unlikely(!mask))
5574 		return;
5575 
5576 	local_irq_disable();
5577 	for (n = 0; n < ve->num_siblings; n++) {
5578 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5579 		struct ve_node * const node = &ve->nodes[sibling->id];
5580 		struct rb_node **parent, *rb;
5581 		bool first;
5582 
5583 		if (!READ_ONCE(ve->request))
5584 			break; /* already handled by a sibling's tasklet */
5585 
5586 		if (unlikely(!(mask & sibling->mask))) {
5587 			if (!RB_EMPTY_NODE(&node->rb)) {
5588 				spin_lock(&sibling->active.lock);
5589 				rb_erase_cached(&node->rb,
5590 						&sibling->execlists.virtual);
5591 				RB_CLEAR_NODE(&node->rb);
5592 				spin_unlock(&sibling->active.lock);
5593 			}
5594 			continue;
5595 		}
5596 
5597 		spin_lock(&sibling->active.lock);
5598 
5599 		if (!RB_EMPTY_NODE(&node->rb)) {
5600 			/*
5601 			 * Cheat and avoid rebalancing the tree if we can
5602 			 * reuse this node in situ.
5603 			 */
5604 			first = rb_first_cached(&sibling->execlists.virtual) ==
5605 				&node->rb;
5606 			if (prio == node->prio || (prio > node->prio && first))
5607 				goto submit_engine;
5608 
5609 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5610 		}
5611 
5612 		rb = NULL;
5613 		first = true;
5614 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5615 		while (*parent) {
5616 			struct ve_node *other;
5617 
5618 			rb = *parent;
5619 			other = rb_entry(rb, typeof(*other), rb);
5620 			if (prio > other->prio) {
5621 				parent = &rb->rb_left;
5622 			} else {
5623 				parent = &rb->rb_right;
5624 				first = false;
5625 			}
5626 		}
5627 
5628 		rb_link_node(&node->rb, rb, parent);
5629 		rb_insert_color_cached(&node->rb,
5630 				       &sibling->execlists.virtual,
5631 				       first);
5632 
5633 submit_engine:
5634 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5635 		node->prio = prio;
5636 		if (first && prio > sibling->execlists.queue_priority_hint)
5637 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5638 
5639 		spin_unlock(&sibling->active.lock);
5640 	}
5641 	local_irq_enable();
5642 }
5643 
5644 static void virtual_submit_request(struct i915_request *rq)
5645 {
5646 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5647 	struct i915_request *old;
5648 	unsigned long flags;
5649 
5650 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5651 		     rq->fence.context,
5652 		     rq->fence.seqno);
5653 
5654 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5655 
5656 	spin_lock_irqsave(&ve->base.active.lock, flags);
5657 
5658 	old = ve->request;
5659 	if (old) { /* background completion event from preempt-to-busy */
5660 		GEM_BUG_ON(!i915_request_completed(old));
5661 		__i915_request_submit(old);
5662 		i915_request_put(old);
5663 	}
5664 
5665 	if (i915_request_completed(rq)) {
5666 		__i915_request_submit(rq);
5667 
5668 		ve->base.execlists.queue_priority_hint = INT_MIN;
5669 		ve->request = NULL;
5670 	} else {
5671 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5672 		ve->request = i915_request_get(rq);
5673 
5674 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5675 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5676 
5677 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5678 	}
5679 
5680 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5681 }
5682 
5683 static struct ve_bond *
5684 virtual_find_bond(struct virtual_engine *ve,
5685 		  const struct intel_engine_cs *master)
5686 {
5687 	int i;
5688 
5689 	for (i = 0; i < ve->num_bonds; i++) {
5690 		if (ve->bonds[i].master == master)
5691 			return &ve->bonds[i];
5692 	}
5693 
5694 	return NULL;
5695 }
5696 
5697 static void
5698 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5699 {
5700 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5701 	intel_engine_mask_t allowed, exec;
5702 	struct ve_bond *bond;
5703 
5704 	allowed = ~to_request(signal)->engine->mask;
5705 
5706 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5707 	if (bond)
5708 		allowed &= bond->sibling_mask;
5709 
5710 	/* Restrict the bonded request to run on only the available engines */
5711 	exec = READ_ONCE(rq->execution_mask);
5712 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5713 		;
5714 
5715 	/* Prevent the master from being re-run on the bonded engines */
5716 	to_request(signal)->execution_mask &= ~allowed;
5717 }
5718 
5719 struct intel_context *
5720 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5721 			       unsigned int count)
5722 {
5723 	struct virtual_engine *ve;
5724 	unsigned int n;
5725 	int err;
5726 
5727 	if (count == 0)
5728 		return ERR_PTR(-EINVAL);
5729 
5730 	if (count == 1)
5731 		return intel_context_create(siblings[0]);
5732 
5733 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5734 	if (!ve)
5735 		return ERR_PTR(-ENOMEM);
5736 
5737 	ve->base.i915 = siblings[0]->i915;
5738 	ve->base.gt = siblings[0]->gt;
5739 	ve->base.uncore = siblings[0]->uncore;
5740 	ve->base.id = -1;
5741 
5742 	ve->base.class = OTHER_CLASS;
5743 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5744 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5745 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5746 
5747 	/*
5748 	 * The decision on whether to submit a request using semaphores
5749 	 * depends on the saturated state of the engine. We only compute
5750 	 * this during HW submission of the request, and we need for this
5751 	 * state to be globally applied to all requests being submitted
5752 	 * to this engine. Virtual engines encompass more than one physical
5753 	 * engine and so we cannot accurately tell in advance if one of those
5754 	 * engines is already saturated and so cannot afford to use a semaphore
5755 	 * and be pessimized in priority for doing so -- if we are the only
5756 	 * context using semaphores after all other clients have stopped, we
5757 	 * will be starved on the saturated system. Such a global switch for
5758 	 * semaphores is less than ideal, but alas is the current compromise.
5759 	 */
5760 	ve->base.saturated = ALL_ENGINES;
5761 
5762 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5763 
5764 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5765 	intel_engine_init_execlists(&ve->base);
5766 
5767 	ve->base.cops = &virtual_context_ops;
5768 	ve->base.request_alloc = execlists_request_alloc;
5769 
5770 	ve->base.schedule = i915_schedule;
5771 	ve->base.submit_request = virtual_submit_request;
5772 	ve->base.bond_execute = virtual_bond_execute;
5773 
5774 	INIT_LIST_HEAD(virtual_queue(ve));
5775 	ve->base.execlists.queue_priority_hint = INT_MIN;
5776 	tasklet_init(&ve->base.execlists.tasklet,
5777 		     virtual_submission_tasklet,
5778 		     (unsigned long)ve);
5779 
5780 	intel_context_init(&ve->context, &ve->base);
5781 
5782 	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5783 	if (!ve->base.breadcrumbs) {
5784 		err = -ENOMEM;
5785 		goto err_put;
5786 	}
5787 
5788 	for (n = 0; n < count; n++) {
5789 		struct intel_engine_cs *sibling = siblings[n];
5790 
5791 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5792 		if (sibling->mask & ve->base.mask) {
5793 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5794 				  sibling->name);
5795 			err = -EINVAL;
5796 			goto err_put;
5797 		}
5798 
5799 		/*
5800 		 * The virtual engine implementation is tightly coupled to
5801 		 * the execlists backend -- we push out request directly
5802 		 * into a tree inside each physical engine. We could support
5803 		 * layering if we handle cloning of the requests and
5804 		 * submitting a copy into each backend.
5805 		 */
5806 		if (sibling->execlists.tasklet.func !=
5807 		    execlists_submission_tasklet) {
5808 			err = -ENODEV;
5809 			goto err_put;
5810 		}
5811 
5812 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5813 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5814 
5815 		ve->siblings[ve->num_siblings++] = sibling;
5816 		ve->base.mask |= sibling->mask;
5817 
5818 		/*
5819 		 * All physical engines must be compatible for their emission
5820 		 * functions (as we build the instructions during request
5821 		 * construction and do not alter them before submission
5822 		 * on the physical engine). We use the engine class as a guide
5823 		 * here, although that could be refined.
5824 		 */
5825 		if (ve->base.class != OTHER_CLASS) {
5826 			if (ve->base.class != sibling->class) {
5827 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5828 					  sibling->class, ve->base.class);
5829 				err = -EINVAL;
5830 				goto err_put;
5831 			}
5832 			continue;
5833 		}
5834 
5835 		ve->base.class = sibling->class;
5836 		ve->base.uabi_class = sibling->uabi_class;
5837 		snprintf(ve->base.name, sizeof(ve->base.name),
5838 			 "v%dx%d", ve->base.class, count);
5839 		ve->base.context_size = sibling->context_size;
5840 
5841 		ve->base.emit_bb_start = sibling->emit_bb_start;
5842 		ve->base.emit_flush = sibling->emit_flush;
5843 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5844 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5845 		ve->base.emit_fini_breadcrumb_dw =
5846 			sibling->emit_fini_breadcrumb_dw;
5847 
5848 		ve->base.flags = sibling->flags;
5849 	}
5850 
5851 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5852 
5853 	virtual_engine_initial_hint(ve);
5854 	return &ve->context;
5855 
5856 err_put:
5857 	intel_context_put(&ve->context);
5858 	return ERR_PTR(err);
5859 }
5860 
5861 struct intel_context *
5862 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5863 {
5864 	struct virtual_engine *se = to_virtual_engine(src);
5865 	struct intel_context *dst;
5866 
5867 	dst = intel_execlists_create_virtual(se->siblings,
5868 					     se->num_siblings);
5869 	if (IS_ERR(dst))
5870 		return dst;
5871 
5872 	if (se->num_bonds) {
5873 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5874 
5875 		de->bonds = kmemdup(se->bonds,
5876 				    sizeof(*se->bonds) * se->num_bonds,
5877 				    GFP_KERNEL);
5878 		if (!de->bonds) {
5879 			intel_context_put(dst);
5880 			return ERR_PTR(-ENOMEM);
5881 		}
5882 
5883 		de->num_bonds = se->num_bonds;
5884 	}
5885 
5886 	return dst;
5887 }
5888 
5889 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5890 				     const struct intel_engine_cs *master,
5891 				     const struct intel_engine_cs *sibling)
5892 {
5893 	struct virtual_engine *ve = to_virtual_engine(engine);
5894 	struct ve_bond *bond;
5895 	int n;
5896 
5897 	/* Sanity check the sibling is part of the virtual engine */
5898 	for (n = 0; n < ve->num_siblings; n++)
5899 		if (sibling == ve->siblings[n])
5900 			break;
5901 	if (n == ve->num_siblings)
5902 		return -EINVAL;
5903 
5904 	bond = virtual_find_bond(ve, master);
5905 	if (bond) {
5906 		bond->sibling_mask |= sibling->mask;
5907 		return 0;
5908 	}
5909 
5910 	bond = krealloc(ve->bonds,
5911 			sizeof(*bond) * (ve->num_bonds + 1),
5912 			GFP_KERNEL);
5913 	if (!bond)
5914 		return -ENOMEM;
5915 
5916 	bond[ve->num_bonds].master = master;
5917 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5918 
5919 	ve->bonds = bond;
5920 	ve->num_bonds++;
5921 
5922 	return 0;
5923 }
5924 
5925 struct intel_engine_cs *
5926 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5927 				 unsigned int sibling)
5928 {
5929 	struct virtual_engine *ve = to_virtual_engine(engine);
5930 
5931 	if (sibling >= ve->num_siblings)
5932 		return NULL;
5933 
5934 	return ve->siblings[sibling];
5935 }
5936 
5937 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5938 				   struct drm_printer *m,
5939 				   void (*show_request)(struct drm_printer *m,
5940 							struct i915_request *rq,
5941 							const char *prefix),
5942 				   unsigned int max)
5943 {
5944 	const struct intel_engine_execlists *execlists = &engine->execlists;
5945 	struct i915_request *rq, *last;
5946 	unsigned long flags;
5947 	unsigned int count;
5948 	struct rb_node *rb;
5949 
5950 	spin_lock_irqsave(&engine->active.lock, flags);
5951 
5952 	last = NULL;
5953 	count = 0;
5954 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5955 		if (count++ < max - 1)
5956 			show_request(m, rq, "\t\tE ");
5957 		else
5958 			last = rq;
5959 	}
5960 	if (last) {
5961 		if (count > max) {
5962 			drm_printf(m,
5963 				   "\t\t...skipping %d executing requests...\n",
5964 				   count - max);
5965 		}
5966 		show_request(m, last, "\t\tE ");
5967 	}
5968 
5969 	if (execlists->switch_priority_hint != INT_MIN)
5970 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5971 			   READ_ONCE(execlists->switch_priority_hint));
5972 	if (execlists->queue_priority_hint != INT_MIN)
5973 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5974 			   READ_ONCE(execlists->queue_priority_hint));
5975 
5976 	last = NULL;
5977 	count = 0;
5978 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5979 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5980 		int i;
5981 
5982 		priolist_for_each_request(rq, p, i) {
5983 			if (count++ < max - 1)
5984 				show_request(m, rq, "\t\tQ ");
5985 			else
5986 				last = rq;
5987 		}
5988 	}
5989 	if (last) {
5990 		if (count > max) {
5991 			drm_printf(m,
5992 				   "\t\t...skipping %d queued requests...\n",
5993 				   count - max);
5994 		}
5995 		show_request(m, last, "\t\tQ ");
5996 	}
5997 
5998 	last = NULL;
5999 	count = 0;
6000 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6001 		struct virtual_engine *ve =
6002 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6003 		struct i915_request *rq = READ_ONCE(ve->request);
6004 
6005 		if (rq) {
6006 			if (count++ < max - 1)
6007 				show_request(m, rq, "\t\tV ");
6008 			else
6009 				last = rq;
6010 		}
6011 	}
6012 	if (last) {
6013 		if (count > max) {
6014 			drm_printf(m,
6015 				   "\t\t...skipping %d virtual requests...\n",
6016 				   count - max);
6017 		}
6018 		show_request(m, last, "\t\tV ");
6019 	}
6020 
6021 	spin_unlock_irqrestore(&engine->active.lock, flags);
6022 }
6023 
6024 void intel_lr_context_reset(struct intel_engine_cs *engine,
6025 			    struct intel_context *ce,
6026 			    u32 head,
6027 			    bool scrub)
6028 {
6029 	GEM_BUG_ON(!intel_context_is_pinned(ce));
6030 
6031 	/*
6032 	 * We want a simple context + ring to execute the breadcrumb update.
6033 	 * We cannot rely on the context being intact across the GPU hang,
6034 	 * so clear it and rebuild just what we need for the breadcrumb.
6035 	 * All pending requests for this context will be zapped, and any
6036 	 * future request will be after userspace has had the opportunity
6037 	 * to recreate its own state.
6038 	 */
6039 	if (scrub)
6040 		restore_default_state(ce, engine);
6041 
6042 	/* Rerun the request; its payload has been neutered (if guilty). */
6043 	__execlists_update_reg_state(ce, engine, head);
6044 }
6045 
6046 bool
6047 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6048 {
6049 	return engine->set_default_submission ==
6050 	       intel_execlists_set_default_submission;
6051 }
6052 
6053 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6054 #include "selftest_lrc.c"
6055 #endif
6056