xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 6ca7217d)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers are per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (we then use
121  * a NULL second context) or the first two have unique IDs; see the sketch below.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
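/*
 * Illustrative sketch only (not driver code): the ELSP pairing rule
 * described in the DOC block above, assuming hypothetical helpers
 * pop_head()/peek_head()/queue_is_empty() over a priority-ordered request
 * queue. The real dequeue path further below implements this rule plus
 * preemption, timeslicing and virtual-engine handling on top of it.
 *
 *	struct i915_request *elsp[2] = {};
 *
 *	elsp[0] = pop_head(queue);
 *	while (!queue_is_empty(queue) &&
 *	       peek_head(queue)->context == elsp[0]->context)
 *		elsp[0] = pop_head(queue); // later request supersedes earlier
 *	if (!queue_is_empty(queue))
 *		elsp[1] = pop_head(queue); // second, distinct context
 *	// otherwise elsp[1] stays NULL and a single element is submitted
 */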
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152 
153 #define RING_EXECLIST_QFULL		(1 << 0x2)
154 #define RING_EXECLIST1_VALID		(1 << 0x3)
155 #define RING_EXECLIST0_VALID		(1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
159 
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
166 
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169 
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171 
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID		0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
178 
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
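/*
 * The lrc_ring_*() helpers below return the dword index within the
 * context image (ce->lrc_reg_state) at which a register's
 * MI_LOAD_REGISTER_IMM entry starts: regs[x] holds the register offset
 * and regs[x + 1] the value to load, which is why callers such as
 * execlists_check_context() and lrc_ring_setup_indirect_ctx() write to
 * regs[x + 1]. A return value of -1 means the register is not present
 * in that engine's per-gen context layout.
 */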
243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
244 {
245 	if (INTEL_GEN(engine->i915) >= 12)
246 		return 0x60;
247 	else if (INTEL_GEN(engine->i915) >= 9)
248 		return 0x54;
249 	else if (engine->class == RENDER_CLASS)
250 		return 0x58;
251 	else
252 		return -1;
253 }
254 
255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
256 {
257 	if (INTEL_GEN(engine->i915) >= 12)
258 		return 0x74;
259 	else if (INTEL_GEN(engine->i915) >= 9)
260 		return 0x68;
261 	else if (engine->class == RENDER_CLASS)
262 		return 0xd8;
263 	else
264 		return -1;
265 }
266 
267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
268 {
269 	if (INTEL_GEN(engine->i915) >= 12)
270 		return 0x12;
271 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
272 		return 0x18;
273 	else
274 		return -1;
275 }
276 
277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
278 {
279 	int x;
280 
281 	x = lrc_ring_wa_bb_per_ctx(engine);
282 	if (x < 0)
283 		return x;
284 
285 	return x + 2;
286 }
287 
288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
289 {
290 	int x;
291 
292 	x = lrc_ring_indirect_ptr(engine);
293 	if (x < 0)
294 		return x;
295 
296 	return x + 2;
297 }
298 
299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
300 {
301 	if (engine->class != RENDER_CLASS)
302 		return -1;
303 
304 	if (INTEL_GEN(engine->i915) >= 12)
305 		return 0xb6;
306 	else if (INTEL_GEN(engine->i915) >= 11)
307 		return 0xaa;
308 	else
309 		return -1;
310 }
311 
312 static u32
313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
314 {
315 	switch (INTEL_GEN(engine->i915)) {
316 	default:
317 		MISSING_CASE(INTEL_GEN(engine->i915));
318 		fallthrough;
319 	case 12:
320 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
321 	case 11:
322 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
323 	case 10:
324 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
325 	case 9:
326 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
327 	case 8:
328 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
329 	}
330 }
331 
332 static void
333 lrc_ring_setup_indirect_ctx(u32 *regs,
334 			    const struct intel_engine_cs *engine,
335 			    u32 ctx_bb_ggtt_addr,
336 			    u32 size)
337 {
338 	GEM_BUG_ON(!size);
339 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
340 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
341 	regs[lrc_ring_indirect_ptr(engine) + 1] =
342 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
343 
344 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
345 	regs[lrc_ring_indirect_offset(engine) + 1] =
346 		lrc_ring_indirect_offset_default(engine) << 6;
347 }
348 
349 static u32 intel_context_get_runtime(const struct intel_context *ce)
350 {
351 	/*
352 	 * We can either use ppHWSP[16], which is recorded before the context
353 	 * switch (and so excludes the cost of context switches), or use the
354 	 * value from the context image itself, which is saved/restored earlier
355 	 * and so includes the cost of the save.
356 	 */
357 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
358 }
359 
360 static void mark_eio(struct i915_request *rq)
361 {
362 	if (i915_request_completed(rq))
363 		return;
364 
365 	GEM_BUG_ON(i915_request_signaled(rq));
366 
367 	i915_request_set_error_once(rq, -EIO);
368 	i915_request_mark_complete(rq);
369 }
370 
371 static struct i915_request *
372 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
373 {
374 	struct i915_request *active = rq;
375 
376 	rcu_read_lock();
377 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
378 		if (i915_request_completed(rq))
379 			break;
380 
381 		active = rq;
382 	}
383 	rcu_read_unlock();
384 
385 	return active;
386 }
387 
388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
389 {
390 	return (i915_ggtt_offset(engine->status_page.vma) +
391 		I915_GEM_HWS_PREEMPT_ADDR);
392 }
393 
394 static inline void
395 ring_set_paused(const struct intel_engine_cs *engine, int state)
396 {
397 	/*
398 	 * We inspect HWS_PREEMPT with a semaphore inside
399 	 * engine->emit_fini_breadcrumb. If the dword is true,
400 	 * the ring is paused as the semaphore will busywait
401 	 * until the dword is false.
402 	 */
403 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
404 	if (state)
405 		wmb();
406 }
407 
408 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
409 {
410 	return rb_entry(rb, struct i915_priolist, node);
411 }
412 
413 static inline int rq_prio(const struct i915_request *rq)
414 {
415 	return READ_ONCE(rq->sched.attr.priority);
416 }
417 
418 static int effective_prio(const struct i915_request *rq)
419 {
420 	int prio = rq_prio(rq);
421 
422 	/*
423 	 * If this request is special and must not be interrupted at any
424 	 * cost, so be it. Note we are only checking the most recent request
425 	 * in the context and so may be masking an earlier vip request. It
426 	 * is hoped that under the conditions where nopreempt is used, this
427 	 * will not matter (i.e. all requests to that context will be
428 	 * nopreempt for as long as desired).
429 	 */
430 	if (i915_request_has_nopreempt(rq))
431 		prio = I915_PRIORITY_UNPREEMPTABLE;
432 
433 	return prio;
434 }
435 
436 static int queue_prio(const struct intel_engine_execlists *execlists)
437 {
438 	struct i915_priolist *p;
439 	struct rb_node *rb;
440 
441 	rb = rb_first_cached(&execlists->queue);
442 	if (!rb)
443 		return INT_MIN;
444 
445 	/*
446 	 * As the priolist[] is inverted, with the highest priority in [0],
447 	 * we have to flip the index value to recover the priority.
448 	 */
449 	p = to_priolist(rb);
450 	if (!I915_USER_PRIORITY_SHIFT)
451 		return p->priority;
452 
453 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
454 }
455 
456 static inline bool need_preempt(const struct intel_engine_cs *engine,
457 				const struct i915_request *rq,
458 				struct rb_node *rb)
459 {
460 	int last_prio;
461 
462 	if (!intel_engine_has_semaphores(engine))
463 		return false;
464 
465 	/*
466 	 * Check if the current priority hint merits a preemption attempt.
467 	 *
468 	 * We record the highest value priority we saw during rescheduling
469 	 * prior to this dequeue, therefore we know that if it is strictly
470 	 * less than the current tail of ELSP[0], we do not need to force
471 	 * a preempt-to-idle cycle.
472 	 *
473 	 * However, the priority hint is a mere hint that we may need to
474 	 * preempt. If that hint is stale or we may be trying to preempt
475 	 * ourselves, ignore the request.
476 	 *
477 	 * More naturally we would write
478 	 *      prio >= max(0, last);
479 	 * except that we wish to prevent triggering preemption at the same
480 	 * priority level: the task that is running should remain running
481 	 * to preserve FIFO ordering of dependencies.
482 	 */
483 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
484 	if (engine->execlists.queue_priority_hint <= last_prio)
485 		return false;
486 
487 	/*
488 	 * Check against the first request in ELSP[1]; it will, thanks to the
489 	 * power of PI, be the highest priority of that context.
490 	 */
491 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
492 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
493 		return true;
494 
495 	if (rb) {
496 		struct virtual_engine *ve =
497 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
498 		bool preempt = false;
499 
500 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
501 			struct i915_request *next;
502 
503 			rcu_read_lock();
504 			next = READ_ONCE(ve->request);
505 			if (next)
506 				preempt = rq_prio(next) > last_prio;
507 			rcu_read_unlock();
508 		}
509 
510 		if (preempt)
511 			return preempt;
512 	}
513 
514 	/*
515 	 * If the inflight context did not trigger the preemption, then maybe
516 	 * it was the set of queued requests? Pick the highest priority in
517 	 * the queue (the first active priolist) and see if it deserves to be
518 	 * running instead of ELSP[0].
519 	 *
520 	 * The highest priority request in the queue cannot be either
521 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
522 	 * context, its priority would not exceed ELSP[0] aka last_prio.
523 	 */
524 	return queue_prio(&engine->execlists) > last_prio;
525 }
526 
527 __maybe_unused static inline bool
528 assert_priority_queue(const struct i915_request *prev,
529 		      const struct i915_request *next)
530 {
531 	/*
532 	 * Without preemption, the prev may refer to the still active element
533 	 * which we refuse to let go.
534 	 *
535 	 * Even with preemption, there are times when we think it is better not
536 	 * to preempt and leave an ostensibly lower priority request in flight.
537 	 */
538 	if (i915_request_is_active(prev))
539 		return true;
540 
541 	return rq_prio(prev) >= rq_prio(next);
542 }
543 
544 /*
545  * The context descriptor encodes various attributes of a context,
546  * including its GTT address and some flags. Because it's fairly
547  * expensive to calculate, we'll just do it once and cache the result,
548  * which remains valid until the context is unpinned.
549  *
550  * This is what a descriptor looks like, from LSB to MSB::
551  *
552  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
553  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
554  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
555  *      bits 53-54:    mbz, reserved for use by hardware
556  *      bits 55-63:    group ID, currently unused and set to 0
557  *
558  * Starting from Gen11, the upper dword of the descriptor has a new format:
559  *
560  *      bits 32-36:    reserved
561  *      bits 37-47:    SW context ID
562  *      bits 48-53:    engine instance
563  *      bit 54:        mbz, reserved for use by hardware
564  *      bits 55-60:    SW counter
565  *      bits 61-63:    engine class
566  *
567  * engine info, SW context ID and SW counter need to form a unique number
568  * (Context ID) per lrc.
569  */
570 static u32
571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
572 {
573 	u32 desc;
574 
575 	desc = INTEL_LEGACY_32B_CONTEXT;
576 	if (i915_vm_is_4lvl(ce->vm))
577 		desc = INTEL_LEGACY_64B_CONTEXT;
578 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
579 
580 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
581 	if (IS_GEN(engine->i915, 8))
582 		desc |= GEN8_CTX_L3LLC_COHERENT;
583 
584 	return i915_ggtt_offset(ce->state) | desc;
585 }
586 
587 static inline unsigned int dword_in_page(void *addr)
588 {
589 	return offset_in_page(addr) / sizeof(u32);
590 }
591 
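/*
 * set_offsets() expands the compact byte stream built from the
 * NOP()/LRI()/REG()/REG16()/END() macros below into the MI_NOOP padding
 * and MI_LOAD_REGISTER_IMM blocks of a context image. Register offsets
 * are encoded 7 bits per byte, most-significant group first, with BIT(7)
 * marking a continuation byte. For example, REG16(0x244) emits the bytes
 * 0x81, 0x11 which decode to dword offset 0x91, i.e. register 0x244 from
 * the engine's mmio base, while REG(0x034) is the single byte 0x0d.
 */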
592 static void set_offsets(u32 *regs,
593 			const u8 *data,
594 			const struct intel_engine_cs *engine,
595 			bool clear)
596 #define NOP(x) (BIT(7) | (x))
597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
598 #define POSTED BIT(0)
599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
600 #define REG16(x) \
601 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
602 	(((x) >> 2) & 0x7f)
603 #define END(total_state_size) 0, (total_state_size)
604 {
605 	const u32 base = engine->mmio_base;
606 
607 	while (*data) {
608 		u8 count, flags;
609 
610 		if (*data & BIT(7)) { /* skip */
611 			count = *data++ & ~BIT(7);
612 			if (clear)
613 				memset32(regs, MI_NOOP, count);
614 			regs += count;
615 			continue;
616 		}
617 
618 		count = *data & 0x3f;
619 		flags = *data >> 6;
620 		data++;
621 
622 		*regs = MI_LOAD_REGISTER_IMM(count);
623 		if (flags & POSTED)
624 			*regs |= MI_LRI_FORCE_POSTED;
625 		if (INTEL_GEN(engine->i915) >= 11)
626 			*regs |= MI_LRI_LRM_CS_MMIO;
627 		regs++;
628 
629 		GEM_BUG_ON(!count);
630 		do {
631 			u32 offset = 0;
632 			u8 v;
633 
634 			do {
635 				v = *data++;
636 				offset <<= 7;
637 				offset |= v & ~BIT(7);
638 			} while (v & BIT(7));
639 
640 			regs[0] = base + (offset << 2);
641 			if (clear)
642 				regs[1] = 0;
643 			regs += 2;
644 		} while (--count);
645 	}
646 
647 	if (clear) {
648 		u8 count = *++data;
649 
650 		/* Clear past the tail for HW access */
651 		GEM_BUG_ON(dword_in_page(regs) > count);
652 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
653 
654 		/* Close the batch; used mainly by live_lrc_layout() */
655 		*regs = MI_BATCH_BUFFER_END;
656 		if (INTEL_GEN(engine->i915) >= 10)
657 			*regs |= BIT(0);
658 	}
659 }
660 
661 static const u8 gen8_xcs_offsets[] = {
662 	NOP(1),
663 	LRI(11, 0),
664 	REG16(0x244),
665 	REG(0x034),
666 	REG(0x030),
667 	REG(0x038),
668 	REG(0x03c),
669 	REG(0x168),
670 	REG(0x140),
671 	REG(0x110),
672 	REG(0x11c),
673 	REG(0x114),
674 	REG(0x118),
675 
676 	NOP(9),
677 	LRI(9, 0),
678 	REG16(0x3a8),
679 	REG16(0x28c),
680 	REG16(0x288),
681 	REG16(0x284),
682 	REG16(0x280),
683 	REG16(0x27c),
684 	REG16(0x278),
685 	REG16(0x274),
686 	REG16(0x270),
687 
688 	NOP(13),
689 	LRI(2, 0),
690 	REG16(0x200),
691 	REG(0x028),
692 
693 	END(80)
694 };
695 
696 static const u8 gen9_xcs_offsets[] = {
697 	NOP(1),
698 	LRI(14, POSTED),
699 	REG16(0x244),
700 	REG(0x034),
701 	REG(0x030),
702 	REG(0x038),
703 	REG(0x03c),
704 	REG(0x168),
705 	REG(0x140),
706 	REG(0x110),
707 	REG(0x11c),
708 	REG(0x114),
709 	REG(0x118),
710 	REG(0x1c0),
711 	REG(0x1c4),
712 	REG(0x1c8),
713 
714 	NOP(3),
715 	LRI(9, POSTED),
716 	REG16(0x3a8),
717 	REG16(0x28c),
718 	REG16(0x288),
719 	REG16(0x284),
720 	REG16(0x280),
721 	REG16(0x27c),
722 	REG16(0x278),
723 	REG16(0x274),
724 	REG16(0x270),
725 
726 	NOP(13),
727 	LRI(1, POSTED),
728 	REG16(0x200),
729 
730 	NOP(13),
731 	LRI(44, POSTED),
732 	REG(0x028),
733 	REG(0x09c),
734 	REG(0x0c0),
735 	REG(0x178),
736 	REG(0x17c),
737 	REG16(0x358),
738 	REG(0x170),
739 	REG(0x150),
740 	REG(0x154),
741 	REG(0x158),
742 	REG16(0x41c),
743 	REG16(0x600),
744 	REG16(0x604),
745 	REG16(0x608),
746 	REG16(0x60c),
747 	REG16(0x610),
748 	REG16(0x614),
749 	REG16(0x618),
750 	REG16(0x61c),
751 	REG16(0x620),
752 	REG16(0x624),
753 	REG16(0x628),
754 	REG16(0x62c),
755 	REG16(0x630),
756 	REG16(0x634),
757 	REG16(0x638),
758 	REG16(0x63c),
759 	REG16(0x640),
760 	REG16(0x644),
761 	REG16(0x648),
762 	REG16(0x64c),
763 	REG16(0x650),
764 	REG16(0x654),
765 	REG16(0x658),
766 	REG16(0x65c),
767 	REG16(0x660),
768 	REG16(0x664),
769 	REG16(0x668),
770 	REG16(0x66c),
771 	REG16(0x670),
772 	REG16(0x674),
773 	REG16(0x678),
774 	REG16(0x67c),
775 	REG(0x068),
776 
777 	END(176)
778 };
779 
780 static const u8 gen12_xcs_offsets[] = {
781 	NOP(1),
782 	LRI(13, POSTED),
783 	REG16(0x244),
784 	REG(0x034),
785 	REG(0x030),
786 	REG(0x038),
787 	REG(0x03c),
788 	REG(0x168),
789 	REG(0x140),
790 	REG(0x110),
791 	REG(0x1c0),
792 	REG(0x1c4),
793 	REG(0x1c8),
794 	REG(0x180),
795 	REG16(0x2b4),
796 
797 	NOP(5),
798 	LRI(9, POSTED),
799 	REG16(0x3a8),
800 	REG16(0x28c),
801 	REG16(0x288),
802 	REG16(0x284),
803 	REG16(0x280),
804 	REG16(0x27c),
805 	REG16(0x278),
806 	REG16(0x274),
807 	REG16(0x270),
808 
809 	END(80)
810 };
811 
812 static const u8 gen8_rcs_offsets[] = {
813 	NOP(1),
814 	LRI(14, POSTED),
815 	REG16(0x244),
816 	REG(0x034),
817 	REG(0x030),
818 	REG(0x038),
819 	REG(0x03c),
820 	REG(0x168),
821 	REG(0x140),
822 	REG(0x110),
823 	REG(0x11c),
824 	REG(0x114),
825 	REG(0x118),
826 	REG(0x1c0),
827 	REG(0x1c4),
828 	REG(0x1c8),
829 
830 	NOP(3),
831 	LRI(9, POSTED),
832 	REG16(0x3a8),
833 	REG16(0x28c),
834 	REG16(0x288),
835 	REG16(0x284),
836 	REG16(0x280),
837 	REG16(0x27c),
838 	REG16(0x278),
839 	REG16(0x274),
840 	REG16(0x270),
841 
842 	NOP(13),
843 	LRI(1, 0),
844 	REG(0x0c8),
845 
846 	END(80)
847 };
848 
849 static const u8 gen9_rcs_offsets[] = {
850 	NOP(1),
851 	LRI(14, POSTED),
852 	REG16(0x244),
853 	REG(0x34),
854 	REG(0x30),
855 	REG(0x38),
856 	REG(0x3c),
857 	REG(0x168),
858 	REG(0x140),
859 	REG(0x110),
860 	REG(0x11c),
861 	REG(0x114),
862 	REG(0x118),
863 	REG(0x1c0),
864 	REG(0x1c4),
865 	REG(0x1c8),
866 
867 	NOP(3),
868 	LRI(9, POSTED),
869 	REG16(0x3a8),
870 	REG16(0x28c),
871 	REG16(0x288),
872 	REG16(0x284),
873 	REG16(0x280),
874 	REG16(0x27c),
875 	REG16(0x278),
876 	REG16(0x274),
877 	REG16(0x270),
878 
879 	NOP(13),
880 	LRI(1, 0),
881 	REG(0xc8),
882 
883 	NOP(13),
884 	LRI(44, POSTED),
885 	REG(0x28),
886 	REG(0x9c),
887 	REG(0xc0),
888 	REG(0x178),
889 	REG(0x17c),
890 	REG16(0x358),
891 	REG(0x170),
892 	REG(0x150),
893 	REG(0x154),
894 	REG(0x158),
895 	REG16(0x41c),
896 	REG16(0x600),
897 	REG16(0x604),
898 	REG16(0x608),
899 	REG16(0x60c),
900 	REG16(0x610),
901 	REG16(0x614),
902 	REG16(0x618),
903 	REG16(0x61c),
904 	REG16(0x620),
905 	REG16(0x624),
906 	REG16(0x628),
907 	REG16(0x62c),
908 	REG16(0x630),
909 	REG16(0x634),
910 	REG16(0x638),
911 	REG16(0x63c),
912 	REG16(0x640),
913 	REG16(0x644),
914 	REG16(0x648),
915 	REG16(0x64c),
916 	REG16(0x650),
917 	REG16(0x654),
918 	REG16(0x658),
919 	REG16(0x65c),
920 	REG16(0x660),
921 	REG16(0x664),
922 	REG16(0x668),
923 	REG16(0x66c),
924 	REG16(0x670),
925 	REG16(0x674),
926 	REG16(0x678),
927 	REG16(0x67c),
928 	REG(0x68),
929 
930 	END(176)
931 };
932 
933 static const u8 gen11_rcs_offsets[] = {
934 	NOP(1),
935 	LRI(15, POSTED),
936 	REG16(0x244),
937 	REG(0x034),
938 	REG(0x030),
939 	REG(0x038),
940 	REG(0x03c),
941 	REG(0x168),
942 	REG(0x140),
943 	REG(0x110),
944 	REG(0x11c),
945 	REG(0x114),
946 	REG(0x118),
947 	REG(0x1c0),
948 	REG(0x1c4),
949 	REG(0x1c8),
950 	REG(0x180),
951 
952 	NOP(1),
953 	LRI(9, POSTED),
954 	REG16(0x3a8),
955 	REG16(0x28c),
956 	REG16(0x288),
957 	REG16(0x284),
958 	REG16(0x280),
959 	REG16(0x27c),
960 	REG16(0x278),
961 	REG16(0x274),
962 	REG16(0x270),
963 
964 	LRI(1, POSTED),
965 	REG(0x1b0),
966 
967 	NOP(10),
968 	LRI(1, 0),
969 	REG(0x0c8),
970 
971 	END(80)
972 };
973 
974 static const u8 gen12_rcs_offsets[] = {
975 	NOP(1),
976 	LRI(13, POSTED),
977 	REG16(0x244),
978 	REG(0x034),
979 	REG(0x030),
980 	REG(0x038),
981 	REG(0x03c),
982 	REG(0x168),
983 	REG(0x140),
984 	REG(0x110),
985 	REG(0x1c0),
986 	REG(0x1c4),
987 	REG(0x1c8),
988 	REG(0x180),
989 	REG16(0x2b4),
990 
991 	NOP(5),
992 	LRI(9, POSTED),
993 	REG16(0x3a8),
994 	REG16(0x28c),
995 	REG16(0x288),
996 	REG16(0x284),
997 	REG16(0x280),
998 	REG16(0x27c),
999 	REG16(0x278),
1000 	REG16(0x274),
1001 	REG16(0x270),
1002 
1003 	LRI(3, POSTED),
1004 	REG(0x1b0),
1005 	REG16(0x5a8),
1006 	REG16(0x5ac),
1007 
1008 	NOP(6),
1009 	LRI(1, 0),
1010 	REG(0x0c8),
1011 	NOP(3 + 9 + 1),
1012 
1013 	LRI(51, POSTED),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG16(0x588),
1020 	REG(0x028),
1021 	REG(0x09c),
1022 	REG(0x0c0),
1023 	REG(0x178),
1024 	REG(0x17c),
1025 	REG16(0x358),
1026 	REG(0x170),
1027 	REG(0x150),
1028 	REG(0x154),
1029 	REG(0x158),
1030 	REG16(0x41c),
1031 	REG16(0x600),
1032 	REG16(0x604),
1033 	REG16(0x608),
1034 	REG16(0x60c),
1035 	REG16(0x610),
1036 	REG16(0x614),
1037 	REG16(0x618),
1038 	REG16(0x61c),
1039 	REG16(0x620),
1040 	REG16(0x624),
1041 	REG16(0x628),
1042 	REG16(0x62c),
1043 	REG16(0x630),
1044 	REG16(0x634),
1045 	REG16(0x638),
1046 	REG16(0x63c),
1047 	REG16(0x640),
1048 	REG16(0x644),
1049 	REG16(0x648),
1050 	REG16(0x64c),
1051 	REG16(0x650),
1052 	REG16(0x654),
1053 	REG16(0x658),
1054 	REG16(0x65c),
1055 	REG16(0x660),
1056 	REG16(0x664),
1057 	REG16(0x668),
1058 	REG16(0x66c),
1059 	REG16(0x670),
1060 	REG16(0x674),
1061 	REG16(0x678),
1062 	REG16(0x67c),
1063 	REG(0x068),
1064 	REG(0x084),
1065 	NOP(1),
1066 
1067 	END(192)
1068 };
1069 
1070 #undef END
1071 #undef REG16
1072 #undef REG
1073 #undef LRI
1074 #undef NOP
1075 
1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1077 {
1078 	/*
1079 	 * The gen12+ lists only have the registers we program in the basic
1080 	 * default state. We rely on the context image using relative
1081 	 * addressing to automatically fix up the register state between the
1082 	 * physical engines of a virtual engine.
1083 	 */
1084 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1085 		   !intel_engine_has_relative_mmio(engine));
1086 
1087 	if (engine->class == RENDER_CLASS) {
1088 		if (INTEL_GEN(engine->i915) >= 12)
1089 			return gen12_rcs_offsets;
1090 		else if (INTEL_GEN(engine->i915) >= 11)
1091 			return gen11_rcs_offsets;
1092 		else if (INTEL_GEN(engine->i915) >= 9)
1093 			return gen9_rcs_offsets;
1094 		else
1095 			return gen8_rcs_offsets;
1096 	} else {
1097 		if (INTEL_GEN(engine->i915) >= 12)
1098 			return gen12_xcs_offsets;
1099 		else if (INTEL_GEN(engine->i915) >= 9)
1100 			return gen9_xcs_offsets;
1101 		else
1102 			return gen8_xcs_offsets;
1103 	}
1104 }
1105 
1106 static struct i915_request *
1107 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 {
1109 	struct i915_request *rq, *rn, *active = NULL;
1110 	struct list_head *pl;
1111 	int prio = I915_PRIORITY_INVALID;
1112 
1113 	lockdep_assert_held(&engine->active.lock);
1114 
1115 	list_for_each_entry_safe_reverse(rq, rn,
1116 					 &engine->active.requests,
1117 					 sched.link) {
1118 		if (i915_request_completed(rq))
1119 			continue; /* XXX */
1120 
1121 		__i915_request_unsubmit(rq);
1122 
1123 		/*
1124 		 * Push the request back into the queue for later resubmission.
1125 		 * If this request is not native to this physical engine (i.e.
1126 		 * it came from a virtual source), push it back onto the virtual
1127 		 * engine so that it can be moved across onto another physical
1128 		 * engine as load dictates.
1129 		 */
1130 		if (likely(rq->execution_mask == engine->mask)) {
1131 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1132 			if (rq_prio(rq) != prio) {
1133 				prio = rq_prio(rq);
1134 				pl = i915_sched_lookup_priolist(engine, prio);
1135 			}
1136 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137 
1138 			list_move(&rq->sched.link, pl);
1139 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140 
1141 			/* Check in case we rollback so far we wrap [size/2] */
1142 			if (intel_ring_direction(rq->ring,
1143 						 intel_ring_wrap(rq->ring,
1144 								 rq->tail),
1145 						 rq->ring->tail) > 0)
1146 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1147 
1148 			active = rq;
1149 		} else {
1150 			struct intel_engine_cs *owner = rq->context->engine;
1151 
1152 			WRITE_ONCE(rq->engine, owner);
1153 			owner->submit_request(rq);
1154 			active = NULL;
1155 		}
1156 	}
1157 
1158 	return active;
1159 }
1160 
1161 struct i915_request *
1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1163 {
1164 	struct intel_engine_cs *engine =
1165 		container_of(execlists, typeof(*engine), execlists);
1166 
1167 	return __unwind_incomplete_requests(engine);
1168 }
1169 
1170 static inline void
1171 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1172 {
1173 	/*
1174 	 * This is only used when GVT-g is enabled. When GVT-g is disabled,
1175 	 * the compiler should eliminate this function as dead code.
1176 	 */
1177 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1178 		return;
1179 
1180 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1181 				   status, rq);
1182 }
1183 
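/*
 * Engine busyness accounting: intel_engine_context_in/out() bracket the
 * period during which at least one context is resident on the engine.
 * The first schedule-in samples engine->stats.start and the final
 * schedule-out folds the elapsed time into engine->stats.total; the
 * seqlock lets readers sample a consistent view of those fields while
 * the submission path updates them.
 */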
1184 static void intel_engine_context_in(struct intel_engine_cs *engine)
1185 {
1186 	unsigned long flags;
1187 
1188 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1189 		return;
1190 
1191 	write_seqlock_irqsave(&engine->stats.lock, flags);
1192 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1193 		engine->stats.start = ktime_get();
1194 		atomic_inc(&engine->stats.active);
1195 	}
1196 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1197 }
1198 
1199 static void intel_engine_context_out(struct intel_engine_cs *engine)
1200 {
1201 	unsigned long flags;
1202 
1203 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1204 
1205 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1206 		return;
1207 
1208 	write_seqlock_irqsave(&engine->stats.lock, flags);
1209 	if (atomic_dec_and_test(&engine->stats.active)) {
1210 		engine->stats.total =
1211 			ktime_add(engine->stats.total,
1212 				  ktime_sub(ktime_get(), engine->stats.start));
1213 	}
1214 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1215 }
1216 
1217 static void
1218 execlists_check_context(const struct intel_context *ce,
1219 			const struct intel_engine_cs *engine)
1220 {
1221 	const struct intel_ring *ring = ce->ring;
1222 	u32 *regs = ce->lrc_reg_state;
1223 	bool valid = true;
1224 	int x;
1225 
1226 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1227 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1228 		       engine->name,
1229 		       regs[CTX_RING_START],
1230 		       i915_ggtt_offset(ring->vma));
1231 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1232 		valid = false;
1233 	}
1234 
1235 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1236 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1237 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1238 		       engine->name,
1239 		       regs[CTX_RING_CTL],
1240 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1241 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1242 		valid = false;
1243 	}
1244 
1245 	x = lrc_ring_mi_mode(engine);
1246 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1247 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1248 		       engine->name, regs[x + 1]);
1249 		regs[x + 1] &= ~STOP_RING;
1250 		regs[x + 1] |= STOP_RING << 16;
1251 		valid = false;
1252 	}
1253 
1254 	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1255 }
1256 
1257 static void restore_default_state(struct intel_context *ce,
1258 				  struct intel_engine_cs *engine)
1259 {
1260 	u32 *regs;
1261 
1262 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1263 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1264 
1265 	ce->runtime.last = intel_context_get_runtime(ce);
1266 }
1267 
1268 static void reset_active(struct i915_request *rq,
1269 			 struct intel_engine_cs *engine)
1270 {
1271 	struct intel_context * const ce = rq->context;
1272 	u32 head;
1273 
1274 	/*
1275 	 * The executing context has been cancelled. We want to prevent
1276 	 * further execution along this context and propagate the error on
1277 	 * to anything depending on its results.
1278 	 *
1279 	 * In __i915_request_submit(), we apply the -EIO and remove the
1280 	 * requests' payloads for any banned requests. But first, we must
1281 	 * rewind the context back to the start of the incomplete request so
1282 	 * that we do not jump back into the middle of the batch.
1283 	 *
1284 	 * We preserve the breadcrumbs and semaphores of the incomplete
1285 	 * requests so that inter-timeline dependencies (i.e other timelines)
1286 	 * remain correctly ordered. And we defer to __i915_request_submit()
1287 	 * so that all asynchronous waits are correctly handled.
1288 	 */
1289 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1290 		     rq->fence.context, rq->fence.seqno);
1291 
1292 	/* On resubmission of the active request, payload will be scrubbed */
1293 	if (i915_request_completed(rq))
1294 		head = rq->tail;
1295 	else
1296 		head = active_request(ce->timeline, rq)->head;
1297 	head = intel_ring_wrap(ce->ring, head);
1298 
1299 	/* Scrub the context image to prevent replaying the previous batch */
1300 	restore_default_state(ce, engine);
1301 	__execlists_update_reg_state(ce, engine, head);
1302 
1303 	/* We've switched away, so this should be a no-op, but intent matters */
1304 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1305 }
1306 
1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1308 {
1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1310 	ce->runtime.num_underflow += dt < 0;
1311 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1312 #endif
1313 }
1314 
1315 static void intel_context_update_runtime(struct intel_context *ce)
1316 {
1317 	u32 old;
1318 	s32 dt;
1319 
1320 	if (intel_context_is_barrier(ce))
1321 		return;
1322 
1323 	old = ce->runtime.last;
1324 	ce->runtime.last = intel_context_get_runtime(ce);
1325 	dt = ce->runtime.last - old;
1326 
1327 	if (unlikely(dt <= 0)) {
1328 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1329 			 old, ce->runtime.last, dt);
1330 		st_update_runtime_underflow(ce, dt);
1331 		return;
1332 	}
1333 
1334 	ewma_runtime_add(&ce->runtime.avg, dt);
1335 	ce->runtime.total += dt;
1336 }
1337 
1338 static inline struct intel_engine_cs *
1339 __execlists_schedule_in(struct i915_request *rq)
1340 {
1341 	struct intel_engine_cs * const engine = rq->engine;
1342 	struct intel_context * const ce = rq->context;
1343 
1344 	intel_context_get(ce);
1345 
1346 	if (unlikely(intel_context_is_banned(ce)))
1347 		reset_active(rq, engine);
1348 
1349 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1350 		execlists_check_context(ce, engine);
1351 
1352 	if (ce->tag) {
1353 		/* Use a fixed tag for OA and friends */
1354 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1355 		ce->lrc.ccid = ce->tag;
1356 	} else {
1357 		/* We don't need a strict matching tag, just different values */
1358 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1359 
1360 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1361 		clear_bit(tag - 1, &engine->context_tag);
1362 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1363 
1364 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1365 	}
1366 
1367 	ce->lrc.ccid |= engine->execlists.ccid;
1368 
1369 	__intel_gt_pm_get(engine->gt);
1370 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1371 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1372 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1373 	intel_engine_context_in(engine);
1374 
1375 	return engine;
1376 }
1377 
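/*
 * ce->inflight encodes both the physical engine that currently owns the
 * context and, in the low bits of the pointer, how many ELSP ports the
 * context occupies. The first schedule-in installs the bare engine
 * pointer, subsequent schedule-ins bump the count (ptr_inc), and
 * execlists_schedule_out() only performs the real teardown via
 * __execlists_schedule_out() once the count has drained and the pointer
 * is cleared back to NULL.
 */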
1378 static inline struct i915_request *
1379 execlists_schedule_in(struct i915_request *rq, int idx)
1380 {
1381 	struct intel_context * const ce = rq->context;
1382 	struct intel_engine_cs *old;
1383 
1384 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1385 	trace_i915_request_in(rq, idx);
1386 
1387 	old = READ_ONCE(ce->inflight);
1388 	do {
1389 		if (!old) {
1390 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1391 			break;
1392 		}
1393 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1394 
1395 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1396 	return i915_request_get(rq);
1397 }
1398 
1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1400 {
1401 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1402 	struct i915_request *next = READ_ONCE(ve->request);
1403 
1404 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1405 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1406 }
1407 
1408 static inline void
1409 __execlists_schedule_out(struct i915_request *rq,
1410 			 struct intel_engine_cs * const engine,
1411 			 unsigned int ccid)
1412 {
1413 	struct intel_context * const ce = rq->context;
1414 
1415 	/*
1416 	 * NB process_csb() is not under the engine->active.lock and hence
1417 	 * schedule_out can race with schedule_in meaning that we should
1418 	 * refrain from doing non-trivial work here.
1419 	 */
1420 
1421 	/*
1422 	 * If we have just completed this context, the engine may now be
1423 	 * idle and we want to re-enter powersaving.
1424 	 */
1425 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1426 	    i915_request_completed(rq))
1427 		intel_engine_add_retire(engine, ce->timeline);
1428 
1429 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1430 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1431 	if (ccid < BITS_PER_LONG) {
1432 		GEM_BUG_ON(ccid == 0);
1433 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1434 		set_bit(ccid - 1, &engine->context_tag);
1435 	}
1436 
1437 	intel_context_update_runtime(ce);
1438 	intel_engine_context_out(engine);
1439 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1440 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1441 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1442 	intel_gt_pm_put_async(engine->gt);
1443 
1444 	/*
1445 	 * If this is part of a virtual engine, its next request may
1446 	 * have been blocked waiting for access to the active context.
1447 	 * We have to kick all the siblings again in case we need to
1448 	 * switch (e.g. the next request is not runnable on this
1449 	 * engine). Hopefully, we will already have submitted the next
1450 	 * request before the tasklet runs and do not need to rebuild
1451 	 * each virtual tree and kick everyone again.
1452 	 */
1453 	if (ce->engine != engine)
1454 		kick_siblings(rq, ce);
1455 
1456 	intel_context_put(ce);
1457 }
1458 
1459 static inline void
1460 execlists_schedule_out(struct i915_request *rq)
1461 {
1462 	struct intel_context * const ce = rq->context;
1463 	struct intel_engine_cs *cur, *old;
1464 	u32 ccid;
1465 
1466 	trace_i915_request_out(rq);
1467 
1468 	ccid = rq->context->lrc.ccid;
1469 	old = READ_ONCE(ce->inflight);
1470 	do
1471 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1472 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1473 	if (!cur)
1474 		__execlists_schedule_out(rq, old, ccid);
1475 
1476 	i915_request_put(rq);
1477 }
1478 
1479 static u64 execlists_update_context(struct i915_request *rq)
1480 {
1481 	struct intel_context *ce = rq->context;
1482 	u64 desc = ce->lrc.desc;
1483 	u32 tail, prev;
1484 
1485 	/*
1486 	 * WaIdleLiteRestore:bdw,skl
1487 	 *
1488 	 * We should never submit the context with the same RING_TAIL twice
1489 	 * just in case we submit an empty ring, which confuses the HW.
1490 	 *
1491 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1492 	 * the normal request to be able to always advance the RING_TAIL on
1493 	 * subsequent resubmissions (for lite restore). Should that fail us,
1494 	 * and we try and submit the same tail again, force the context
1495 	 * reload.
1496 	 *
1497 	 * If we need to return to a preempted context, we need to skip the
1498 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1499 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1500 	 * an earlier request.
1501 	 */
1502 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1503 	prev = rq->ring->tail;
1504 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1505 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1506 		desc |= CTX_DESC_FORCE_RESTORE;
1507 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1508 	rq->tail = rq->wa_tail;
1509 
1510 	/*
1511 	 * Make sure the context image is complete before we submit it to HW.
1512 	 *
1513 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1514 	 * an uncached write such as our mmio register access, but the empirical
1515 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1516 	 * may not be visible to the HW prior to the completion of the UC
1517 	 * register write and that we may begin execution from the context
1518 	 * before its image is complete leading to invalid PD chasing.
1519 	 */
1520 	wmb();
1521 
1522 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1523 	return desc;
1524 }
1525 
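/*
 * Write one context descriptor to the hardware. When a submit-queue
 * control register is present (execlists->ctrl_reg), the two halves are
 * staged into the per-port ELSQ slots and only take effect once
 * EL_CTRL_LOAD is written in execlists_submit_ports(); on older engines
 * the descriptor is written directly to the ELSP, upper dword first.
 */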
1526 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1527 {
1528 	if (execlists->ctrl_reg) {
1529 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1530 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1531 	} else {
1532 		writel(upper_32_bits(desc), execlists->submit_reg);
1533 		writel(lower_32_bits(desc), execlists->submit_reg);
1534 	}
1535 }
1536 
1537 static __maybe_unused char *
1538 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1539 {
1540 	if (!rq)
1541 		return "";
1542 
1543 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1544 		 prefix,
1545 		 rq->context->lrc.ccid,
1546 		 rq->fence.context, rq->fence.seqno,
1547 		 i915_request_completed(rq) ? "!" :
1548 		 i915_request_started(rq) ? "*" :
1549 		 "",
1550 		 rq_prio(rq));
1551 
1552 	return buf;
1553 }
1554 
1555 static __maybe_unused void
1556 trace_ports(const struct intel_engine_execlists *execlists,
1557 	    const char *msg,
1558 	    struct i915_request * const *ports)
1559 {
1560 	const struct intel_engine_cs *engine =
1561 		container_of(execlists, typeof(*engine), execlists);
1562 	char __maybe_unused p0[40], p1[40];
1563 
1564 	if (!ports[0])
1565 		return;
1566 
1567 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1568 		     dump_port(p0, sizeof(p0), "", ports[0]),
1569 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1570 }
1571 
1572 static inline bool
1573 reset_in_progress(const struct intel_engine_execlists *execlists)
1574 {
1575 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1576 }
1577 
1578 static __maybe_unused bool
1579 assert_pending_valid(const struct intel_engine_execlists *execlists,
1580 		     const char *msg)
1581 {
1582 	struct intel_engine_cs *engine =
1583 		container_of(execlists, typeof(*engine), execlists);
1584 	struct i915_request * const *port, *rq;
1585 	struct intel_context *ce = NULL;
1586 	bool sentinel = false;
1587 	u32 ccid = -1;
1588 
1589 	trace_ports(execlists, msg, execlists->pending);
1590 
1591 	/* We may be messing around with the lists during reset, lalala */
1592 	if (reset_in_progress(execlists))
1593 		return true;
1594 
1595 	if (!execlists->pending[0]) {
1596 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1597 			      engine->name);
1598 		return false;
1599 	}
1600 
1601 	if (execlists->pending[execlists_num_ports(execlists)]) {
1602 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1603 			      engine->name, execlists_num_ports(execlists));
1604 		return false;
1605 	}
1606 
1607 	for (port = execlists->pending; (rq = *port); port++) {
1608 		unsigned long flags;
1609 		bool ok = true;
1610 
1611 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1612 		GEM_BUG_ON(!i915_request_is_active(rq));
1613 
1614 		if (ce == rq->context) {
1615 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1616 				      engine->name,
1617 				      ce->timeline->fence_context,
1618 				      port - execlists->pending);
1619 			return false;
1620 		}
1621 		ce = rq->context;
1622 
1623 		if (ccid == ce->lrc.ccid) {
1624 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1625 				      engine->name,
1626 				      ccid, ce->timeline->fence_context,
1627 				      port - execlists->pending);
1628 			return false;
1629 		}
1630 		ccid = ce->lrc.ccid;
1631 
1632 		/*
1633 		 * Sentinels are supposed to be the last request so they flush
1634 		 * the current execution off the HW. Check that they are the only
1635 		 * request in the pending submission.
1636 		 */
1637 		if (sentinel) {
1638 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1639 				      engine->name,
1640 				      ce->timeline->fence_context,
1641 				      port - execlists->pending);
1642 			return false;
1643 		}
1644 		sentinel = i915_request_has_sentinel(rq);
1645 
1646 		/* Hold tightly onto the lock to prevent concurrent retires! */
1647 		if (!spin_trylock_irqsave(&rq->lock, flags))
1648 			continue;
1649 
1650 		if (i915_request_completed(rq))
1651 			goto unlock;
1652 
1653 		if (i915_active_is_idle(&ce->active) &&
1654 		    !intel_context_is_barrier(ce)) {
1655 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1656 				      engine->name,
1657 				      ce->timeline->fence_context,
1658 				      port - execlists->pending);
1659 			ok = false;
1660 			goto unlock;
1661 		}
1662 
1663 		if (!i915_vma_is_pinned(ce->state)) {
1664 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1665 				      engine->name,
1666 				      ce->timeline->fence_context,
1667 				      port - execlists->pending);
1668 			ok = false;
1669 			goto unlock;
1670 		}
1671 
1672 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1673 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1674 				      engine->name,
1675 				      ce->timeline->fence_context,
1676 				      port - execlists->pending);
1677 			ok = false;
1678 			goto unlock;
1679 		}
1680 
1681 unlock:
1682 		spin_unlock_irqrestore(&rq->lock, flags);
1683 		if (!ok)
1684 			return false;
1685 	}
1686 
1687 	return ce;
1688 }
1689 
1690 static void execlists_submit_ports(struct intel_engine_cs *engine)
1691 {
1692 	struct intel_engine_execlists *execlists = &engine->execlists;
1693 	unsigned int n;
1694 
1695 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1696 
1697 	/*
1698 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1699 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1700 	 * not be relinquished until the device is idle (see
1701 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1702 	 * that all ELSP are drained i.e. we have processed the CSB,
1703 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1704 	 */
1705 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1706 
1707 	/*
1708 	 * ELSQ note: the submit queue is not cleared after being submitted
1709 	 * to the HW so we need to make sure we always clean it up. This is
1710 	 * currently ensured by the fact that we always write the same number
1711 	 * of elsq entries, keep this in mind before changing the loop below.
1712 	 */
1713 	for (n = execlists_num_ports(execlists); n--; ) {
1714 		struct i915_request *rq = execlists->pending[n];
1715 
1716 		write_desc(execlists,
1717 			   rq ? execlists_update_context(rq) : 0,
1718 			   n);
1719 	}
1720 
1721 	/* we need to manually load the submit queue */
1722 	if (execlists->ctrl_reg)
1723 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1724 }
1725 
1726 static bool ctx_single_port_submission(const struct intel_context *ce)
1727 {
1728 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1729 		intel_context_force_single_submission(ce));
1730 }
1731 
1732 static bool can_merge_ctx(const struct intel_context *prev,
1733 			  const struct intel_context *next)
1734 {
1735 	if (prev != next)
1736 		return false;
1737 
1738 	if (ctx_single_port_submission(prev))
1739 		return false;
1740 
1741 	return true;
1742 }
1743 
1744 static unsigned long i915_request_flags(const struct i915_request *rq)
1745 {
1746 	return READ_ONCE(rq->fence.flags);
1747 }
1748 
1749 static bool can_merge_rq(const struct i915_request *prev,
1750 			 const struct i915_request *next)
1751 {
1752 	GEM_BUG_ON(prev == next);
1753 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1754 
1755 	/*
1756 	 * We do not submit known completed requests. Therefore if the next
1757 	 * request is already completed, we can pretend to merge it in
1758 	 * with the previous context (and we will skip updating the ELSP
1759 	 * and tracking). Thus hopefully keeping the ELSP full with active
1760 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1761 	 * us.
1762 	 */
1763 	if (i915_request_completed(next))
1764 		return true;
1765 
1766 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1767 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1768 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1769 		return false;
1770 
1771 	if (!can_merge_ctx(prev->context, next->context))
1772 		return false;
1773 
1774 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1775 	return true;
1776 }
1777 
1778 static void virtual_update_register_offsets(u32 *regs,
1779 					    struct intel_engine_cs *engine)
1780 {
1781 	set_offsets(regs, reg_offsets(engine), engine, false);
1782 }
1783 
1784 static bool virtual_matches(const struct virtual_engine *ve,
1785 			    const struct i915_request *rq,
1786 			    const struct intel_engine_cs *engine)
1787 {
1788 	const struct intel_engine_cs *inflight;
1789 
1790 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1791 		return false;
1792 
1793 	/*
1794 	 * We track when the HW has completed saving the context image
1795 	 * (i.e. when we have seen the final CS event switching out of
1796 	 * the context) and must not overwrite the context image before
1797 	 * then. This restricts us to only using the active engine
1798 	 * while the previous virtualized request is inflight (so
1799 	 * we reuse the register offsets). This is a very small
1800 	 * hysteresis on the greedy selection algorithm.
1801 	 */
1802 	inflight = intel_context_inflight(&ve->context);
1803 	if (inflight && inflight != engine)
1804 		return false;
1805 
1806 	return true;
1807 }
1808 
1809 static void virtual_xfer_context(struct virtual_engine *ve,
1810 				 struct intel_engine_cs *engine)
1811 {
1812 	unsigned int n;
1813 
1814 	if (likely(engine == ve->siblings[0]))
1815 		return;
1816 
1817 	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1818 	if (!intel_engine_has_relative_mmio(engine))
1819 		virtual_update_register_offsets(ve->context.lrc_reg_state,
1820 						engine);
1821 
1822 	/*
1823 	 * Move the bound engine to the top of the list for
1824 	 * future execution. We then kick this tasklet first
1825 	 * before checking others, so that we preferentially
1826 	 * reuse this set of bound registers.
1827 	 */
1828 	for (n = 1; n < ve->num_siblings; n++) {
1829 		if (ve->siblings[n] == engine) {
1830 			swap(ve->siblings[n], ve->siblings[0]);
1831 			break;
1832 		}
1833 	}
1834 }
1835 
1836 #define for_each_waiter(p__, rq__) \
1837 	list_for_each_entry_lockless(p__, \
1838 				     &(rq__)->sched.waiters_list, \
1839 				     wait_link)
1840 
1841 #define for_each_signaler(p__, rq__) \
1842 	list_for_each_entry_rcu(p__, \
1843 				&(rq__)->sched.signalers_list, \
1844 				signal_link)
1845 
1846 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1847 {
1848 	LIST_HEAD(list);
1849 
1850 	/*
1851 	 * We want to move the interrupted request to the back of
1852 	 * the round-robin list (i.e. its priority level), but
1853 	 * in doing so, we must then move all requests that were in
1854 	 * flight and were waiting for the interrupted request to
1855 	 * be run after it again.
1856 	 */
1857 	do {
1858 		struct i915_dependency *p;
1859 
1860 		GEM_BUG_ON(i915_request_is_active(rq));
1861 		list_move_tail(&rq->sched.link, pl);
1862 
1863 		for_each_waiter(p, rq) {
1864 			struct i915_request *w =
1865 				container_of(p->waiter, typeof(*w), sched);
1866 
1867 			if (p->flags & I915_DEPENDENCY_WEAK)
1868 				continue;
1869 
1870 			/* Leave semaphores spinning on the other engines */
1871 			if (w->engine != rq->engine)
1872 				continue;
1873 
1874 			/* No waiter should start before its signaler */
1875 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1876 				   i915_request_started(w) &&
1877 				   !i915_request_completed(rq));
1878 
1879 			GEM_BUG_ON(i915_request_is_active(w));
1880 			if (!i915_request_is_ready(w))
1881 				continue;
1882 
1883 			if (rq_prio(w) < rq_prio(rq))
1884 				continue;
1885 
1886 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1887 			list_move_tail(&w->sched.link, &list);
1888 		}
1889 
1890 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1891 	} while (rq);
1892 }
1893 
1894 static void defer_active(struct intel_engine_cs *engine)
1895 {
1896 	struct i915_request *rq;
1897 
1898 	rq = __unwind_incomplete_requests(engine);
1899 	if (!rq)
1900 		return;
1901 
1902 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1903 }
1904 
1905 static bool
1906 need_timeslice(const struct intel_engine_cs *engine,
1907 	       const struct i915_request *rq,
1908 	       const struct rb_node *rb)
1909 {
1910 	int hint;
1911 
1912 	if (!intel_engine_has_timeslices(engine))
1913 		return false;
1914 
1915 	hint = engine->execlists.queue_priority_hint;
1916 
1917 	if (rb) {
1918 		const struct virtual_engine *ve =
1919 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1920 		const struct intel_engine_cs *inflight =
1921 			intel_context_inflight(&ve->context);
1922 
1923 		if (!inflight || inflight == engine) {
1924 			struct i915_request *next;
1925 
1926 			rcu_read_lock();
1927 			next = READ_ONCE(ve->request);
1928 			if (next)
1929 				hint = max(hint, rq_prio(next));
1930 			rcu_read_unlock();
1931 		}
1932 	}
1933 
1934 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1935 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1936 
1937 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1938 	return hint >= effective_prio(rq);
1939 }
1940 
1941 static bool
1942 timeslice_yield(const struct intel_engine_execlists *el,
1943 		const struct i915_request *rq)
1944 {
1945 	/*
1946 	 * Once bitten, forever smitten!
1947 	 *
1948 	 * If the active context ever busy-waited on a semaphore,
1949 	 * it will be treated as a hog until the end of its timeslice (i.e.
1950 	 * until it is scheduled out and replaced by a new submission,
1951 	 * possibly even its own lite-restore). The HW only sends an interrupt
1952 	 * on the first miss, and we do not know if that semaphore has been
1953 	 * signaled, or even if it is now stuck on another semaphore. Play
1954 	 * safe, yield if it might be stuck -- it will be given a fresh
1955 	 * timeslice in the near future.
1956 	 */
1957 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1958 }
1959 
1960 static bool
1961 timeslice_expired(const struct intel_engine_execlists *el,
1962 		  const struct i915_request *rq)
1963 {
1964 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1965 }
1966 
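/*
 * Priority of the request that will run next on this engine once the given
 * request completes (falling back to the queue hint if it is the last
 * inflight request).
 */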
1967 static int
1968 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1969 {
1970 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1971 		return engine->execlists.queue_priority_hint;
1972 
1973 	return rq_prio(list_next_entry(rq, sched.link));
1974 }
1975 
1976 static inline unsigned long
1977 timeslice(const struct intel_engine_cs *engine)
1978 {
1979 	return READ_ONCE(engine->props.timeslice_duration_ms);
1980 }
1981 
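/*
 * Only keep timeslicing if there is equal or higher priority work queued
 * behind the active request (as recorded in switch_priority_hint);
 * otherwise return 0 so the timeslice timer is cancelled.
 */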
1982 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1983 {
1984 	const struct intel_engine_execlists *execlists = &engine->execlists;
1985 	const struct i915_request *rq = *execlists->active;
1986 
1987 	if (!rq || i915_request_completed(rq))
1988 		return 0;
1989 
1990 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1991 		return 0;
1992 
1993 	return timeslice(engine);
1994 }
1995 
1996 static void set_timeslice(struct intel_engine_cs *engine)
1997 {
1998 	unsigned long duration;
1999 
2000 	if (!intel_engine_has_timeslices(engine))
2001 		return;
2002 
2003 	duration = active_timeslice(engine);
2004 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2005 
2006 	set_timer_ms(&engine->execlists.timer, duration);
2007 }
2008 
2009 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2010 {
2011 	struct intel_engine_execlists *execlists = &engine->execlists;
2012 	unsigned long duration;
2013 
2014 	if (!intel_engine_has_timeslices(engine))
2015 		return;
2016 
2017 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2018 	if (prio == INT_MIN)
2019 		return;
2020 
2021 	if (timer_pending(&execlists->timer))
2022 		return;
2023 
2024 	duration = timeslice(engine);
2025 	ENGINE_TRACE(engine,
2026 		     "start timeslicing, prio:%d, interval:%lu",
2027 		     prio, duration);
2028 
2029 	set_timer_ms(&execlists->timer, duration);
2030 }
2031 
2032 static void record_preemption(struct intel_engine_execlists *execlists)
2033 {
2034 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2035 }
2036 
2037 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2038 					    const struct i915_request *rq)
2039 {
2040 	if (!rq)
2041 		return 0;
2042 
2043 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2044 	if (unlikely(intel_context_is_banned(rq->context)))
2045 		return 1;
2046 
2047 	return READ_ONCE(engine->props.preempt_timeout_ms);
2048 }
2049 
2050 static void set_preempt_timeout(struct intel_engine_cs *engine,
2051 				const struct i915_request *rq)
2052 {
2053 	if (!intel_engine_has_preempt_reset(engine))
2054 		return;
2055 
2056 	set_timer_ms(&engine->execlists.preempt,
2057 		     active_preempt_timeout(engine, rq));
2058 }
2059 
2060 static inline void clear_ports(struct i915_request **ports, int count)
2061 {
2062 	memset_p((void **)ports, NULL, count);
2063 }
2064 
2065 static inline void
2066 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2067 {
2068 	/* A memcpy_p() would be very useful here! */
2069 	while (count--)
2070 		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2071 }
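
/*
 * Sketch of the memcpy_p() wished for above (hypothetical helper, not an
 * existing kernel API): a pointer-sized copy that, like copy_ports(),
 * avoids tearing of the individual pointer writes.
 */
static inline void example_memcpy_p(void **dst, void * const *src, int count)
{
	while (count--)
		WRITE_ONCE(*dst++, *src++);
}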
2072 
2073 static void execlists_dequeue(struct intel_engine_cs *engine)
2074 {
2075 	struct intel_engine_execlists * const execlists = &engine->execlists;
2076 	struct i915_request **port = execlists->pending;
2077 	struct i915_request ** const last_port = port + execlists->port_mask;
2078 	struct i915_request * const *active;
2079 	struct i915_request *last;
2080 	struct rb_node *rb;
2081 	bool submit = false;
2082 
2083 	/*
2084 	 * Hardware submission is through 2 ports. Conceptually each port
2085 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2086 	 * static for a context, and unique to each, so we only execute
2087 	 * requests belonging to a single context from each ring. RING_HEAD
2088 	 * is maintained by the CS in the context image, it marks the place
2089 	 * where it got up to last time, and through RING_TAIL we tell the CS
2090 	 * where we want to execute up to this time.
2091 	 *
2092 	 * In this list the requests are in order of execution. Consecutive
2093 	 * requests from the same context are adjacent in the ringbuffer. We
2094 	 * can combine these requests into a single RING_TAIL update:
2095 	 *
2096 	 *              RING_HEAD...req1...req2
2097 	 *                                    ^- RING_TAIL
2098 	 * since to execute req2 the CS must first execute req1.
2099 	 *
2100 	 * Our goal then is to point each port to the end of a consecutive
2101 	 * sequence of requests as the optimal (fewest wake ups
2102 	 * and context switches) submission.
2103 	 */
2104 
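	/*
	 * First peek at the virtual engine tree for the highest priority
	 * request that is ready to run on this physical engine.
	 */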
2105 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2106 		struct virtual_engine *ve =
2107 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2108 		struct i915_request *rq = READ_ONCE(ve->request);
2109 
2110 		if (!rq) { /* lazily cleanup after another engine handled rq */
2111 			rb_erase_cached(rb, &execlists->virtual);
2112 			RB_CLEAR_NODE(rb);
2113 			rb = rb_first_cached(&execlists->virtual);
2114 			continue;
2115 		}
2116 
2117 		if (!virtual_matches(ve, rq, engine)) {
2118 			rb = rb_next(rb);
2119 			continue;
2120 		}
2121 
2122 		break;
2123 	}
2124 
2125 	/*
2126 	 * If the queue is higher priority than the last
2127 	 * request in the currently active context, submit afresh.
2128 	 * We will resubmit again afterwards in case we need to split
2129 	 * the active context to interject the preemption request,
2130 	 * i.e. we will retrigger preemption following the ack in case
2131 	 * of trouble.
2132 	 */
2133 	active = READ_ONCE(execlists->active);
2134 
2135 	/*
2136 	 * In theory we can skip over completed contexts that have not
2137 	 * yet been processed by events (as those events are in flight):
2138 	 *
2139 	 * while ((last = *active) && i915_request_completed(last))
2140 	 *	active++;
2141 	 *
2142 	 * However, the GPU cannot handle this as it will ultimately
2143 	 * find itself trying to jump back into a context it has just
2144 	 * completed and barf.
2145 	 */
2146 
2147 	if ((last = *active)) {
2148 		if (need_preempt(engine, last, rb)) {
2149 			if (i915_request_completed(last)) {
2150 				tasklet_hi_schedule(&execlists->tasklet);
2151 				return;
2152 			}
2153 
2154 			ENGINE_TRACE(engine,
2155 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2156 				     last->fence.context,
2157 				     last->fence.seqno,
2158 				     last->sched.attr.priority,
2159 				     execlists->queue_priority_hint);
2160 			record_preemption(execlists);
2161 
2162 			/*
2163 			 * Don't let the RING_HEAD advance past the breadcrumb
2164 			 * as we unwind (and until we resubmit) so that we do
2165 			 * not accidentally tell it to go backwards.
2166 			 */
2167 			ring_set_paused(engine, 1);
2168 
2169 			/*
2170 			 * Note that we have not stopped the GPU at this point,
2171 			 * so we are unwinding the incomplete requests as they
2172 			 * remain inflight and so by the time we do complete
2173 			 * the preemption, some of the unwound requests may
2174 			 * complete!
2175 			 */
2176 			__unwind_incomplete_requests(engine);
2177 
2178 			last = NULL;
2179 		} else if (need_timeslice(engine, last, rb) &&
2180 			   timeslice_expired(execlists, last)) {
2181 			if (i915_request_completed(last)) {
2182 				tasklet_hi_schedule(&execlists->tasklet);
2183 				return;
2184 			}
2185 
2186 			ENGINE_TRACE(engine,
2187 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2188 				     last->fence.context,
2189 				     last->fence.seqno,
2190 				     last->sched.attr.priority,
2191 				     execlists->queue_priority_hint,
2192 				     yesno(timeslice_yield(execlists, last)));
2193 
2194 			ring_set_paused(engine, 1);
2195 			defer_active(engine);
2196 
2197 			/*
2198 			 * Unlike for preemption, if we rewind and continue
2199 			 * executing the same context as previously active,
2200 			 * the order of execution will remain the same and
2201 			 * the tail will only advance. We do not need to
2202 			 * force a full context restore, as a lite-restore
2203 			 * is sufficient to resample the monotonic TAIL.
2204 			 *
2205 			 * If we switch to any other context, similarly we
2206 			 * will not rewind TAIL of current context, and
2207 			 * normal save/restore will preserve state and allow
2208 			 * us to later continue executing the same request.
2209 			 */
2210 			last = NULL;
2211 		} else {
2212 			/*
2213 			 * Otherwise if we already have a request pending
2214 			 * for execution after the current one, we can
2215 			 * just wait until the next CS event before
2216 			 * queuing more. In either case we will force a
2217 			 * lite-restore preemption event, but if we wait
2218 			 * we hopefully coalesce several updates into a single
2219 			 * submission.
2220 			 */
2221 			if (!list_is_last(&last->sched.link,
2222 					  &engine->active.requests)) {
2223 				/*
2224 				 * Even if ELSP[1] is occupied and not worthy
2225 				 * of timeslices, our queue might be.
2226 				 */
2227 				start_timeslice(engine, queue_prio(execlists));
2228 				return;
2229 			}
2230 		}
2231 	}
2232 
2233 	while (rb) { /* XXX virtual is always taking precedence */
2234 		struct virtual_engine *ve =
2235 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2236 		struct i915_request *rq;
2237 
2238 		spin_lock(&ve->base.active.lock);
2239 
2240 		rq = ve->request;
2241 		if (unlikely(!rq)) { /* lost the race to a sibling */
2242 			spin_unlock(&ve->base.active.lock);
2243 			rb_erase_cached(rb, &execlists->virtual);
2244 			RB_CLEAR_NODE(rb);
2245 			rb = rb_first_cached(&execlists->virtual);
2246 			continue;
2247 		}
2248 
2249 		GEM_BUG_ON(rq != ve->request);
2250 		GEM_BUG_ON(rq->engine != &ve->base);
2251 		GEM_BUG_ON(rq->context != &ve->context);
2252 
2253 		if (rq_prio(rq) >= queue_prio(execlists)) {
2254 			if (!virtual_matches(ve, rq, engine)) {
2255 				spin_unlock(&ve->base.active.lock);
2256 				rb = rb_next(rb);
2257 				continue;
2258 			}
2259 
2260 			if (last && !can_merge_rq(last, rq)) {
2261 				spin_unlock(&ve->base.active.lock);
2262 				start_timeslice(engine, rq_prio(rq));
2263 				return; /* leave this for another sibling */
2264 			}
2265 
2266 			ENGINE_TRACE(engine,
2267 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2268 				     rq->fence.context,
2269 				     rq->fence.seqno,
2270 				     i915_request_completed(rq) ? "!" :
2271 				     i915_request_started(rq) ? "*" :
2272 				     "",
2273 				     yesno(engine != ve->siblings[0]));
2274 
2275 			WRITE_ONCE(ve->request, NULL);
2276 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2277 				   INT_MIN);
2278 			rb_erase_cached(rb, &execlists->virtual);
2279 			RB_CLEAR_NODE(rb);
2280 
2281 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2282 			WRITE_ONCE(rq->engine, engine);
2283 
2284 			if (__i915_request_submit(rq)) {
2285 				/*
2286 				 * Only after we confirm that we will submit
2287 				 * this request (i.e. it has not already
2288 				 * completed), do we want to update the context.
2289 				 *
2290 				 * This serves two purposes. It avoids
2291 				 * unnecessary work if we are resubmitting an
2292 				 * already completed request after timeslicing.
2293 				 * But more importantly, it prevents us altering
2294 				 * ve->siblings[] on an idle context, where
2295 				 * we may be using ve->siblings[] in
2296 				 * virtual_context_enter / virtual_context_exit.
2297 				 */
2298 				virtual_xfer_context(ve, engine);
2299 				GEM_BUG_ON(ve->siblings[0] != engine);
2300 
2301 				submit = true;
2302 				last = rq;
2303 			}
2304 			i915_request_put(rq);
2305 
2306 			/*
2307 			 * Hmm, we have a bunch of virtual engine requests,
2308 			 * but the first one was already completed (thanks
2309 			 * preempt-to-busy!). Keep looking at the veng queue
2310 			 * until we have no more relevant requests (i.e.
2311 			 * the normal submit queue has higher priority).
2312 			 */
2313 			if (!submit) {
2314 				spin_unlock(&ve->base.active.lock);
2315 				rb = rb_first_cached(&execlists->virtual);
2316 				continue;
2317 			}
2318 		}
2319 
2320 		spin_unlock(&ve->base.active.lock);
2321 		break;
2322 	}
2323 
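	/*
	 * Then drain the ordinary priority queue, coalescing consecutive
	 * requests from the same context into the remaining ELSP ports.
	 */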
2324 	while ((rb = rb_first_cached(&execlists->queue))) {
2325 		struct i915_priolist *p = to_priolist(rb);
2326 		struct i915_request *rq, *rn;
2327 		int i;
2328 
2329 		priolist_for_each_request_consume(rq, rn, p, i) {
2330 			bool merge = true;
2331 
2332 			/*
2333 			 * Can we combine this request with the current port?
2334 			 * It has to be the same context/ringbuffer and not
2335 			 * have any exceptions (e.g. GVT saying never to
2336 			 * combine contexts).
2337 			 *
2338 			 * If we can combine the requests, we can execute both
2339 			 * by updating the RING_TAIL to point to the end of the
2340 			 * second request, and so we never need to tell the
2341 			 * hardware about the first.
2342 			 */
2343 			if (last && !can_merge_rq(last, rq)) {
2344 				/*
2345 				 * If we are on the second port and cannot
2346 				 * combine this request with the last, then we
2347 				 * are done.
2348 				 */
2349 				if (port == last_port)
2350 					goto done;
2351 
2352 				/*
2353 				 * We must not populate both ELSP[] with the
2354 				 * same LRCA, i.e. we must submit 2 different
2355 				 * contexts if we submit 2 ELSP.
2356 				 */
2357 				if (last->context == rq->context)
2358 					goto done;
2359 
2360 				if (i915_request_has_sentinel(last))
2361 					goto done;
2362 
2363 				/*
2364 				 * If GVT overrides us we only ever submit
2365 				 * port[0], leaving port[1] empty. Note that we
2366 				 * also have to be careful that we don't queue
2367 				 * the same context (even though a different
2368 				 * request) to the second port.
2369 				 */
2370 				if (ctx_single_port_submission(last->context) ||
2371 				    ctx_single_port_submission(rq->context))
2372 					goto done;
2373 
2374 				merge = false;
2375 			}
2376 
2377 			if (__i915_request_submit(rq)) {
2378 				if (!merge) {
2379 					*port = execlists_schedule_in(last, port - execlists->pending);
2380 					port++;
2381 					last = NULL;
2382 				}
2383 
2384 				GEM_BUG_ON(last &&
2385 					   !can_merge_ctx(last->context,
2386 							  rq->context));
2387 				GEM_BUG_ON(last &&
2388 					   i915_seqno_passed(last->fence.seqno,
2389 							     rq->fence.seqno));
2390 
2391 				submit = true;
2392 				last = rq;
2393 			}
2394 		}
2395 
2396 		rb_erase_cached(&p->node, &execlists->queue);
2397 		i915_priolist_free(p);
2398 	}
2399 
2400 done:
2401 	/*
2402 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2403 	 *
2404 	 * We choose the priority hint such that if we add a request of greater
2405 	 * priority than this, we kick the submission tasklet to decide on
2406 	 * the right order of submitting the requests to hardware. We must
2407 	 * also be prepared to reorder requests as they are in-flight on the
2408 	 * HW. We derive the priority hint then as the first "hole" in
2409 	 * the HW submission ports and if there are no available slots,
2410 	 * the priority of the lowest executing request, i.e. last.
2411 	 *
2412 	 * When we do receive a higher priority request ready to run from the
2413 	 * user, see queue_request(), the priority hint is bumped to that
2414 	 * request triggering preemption on the next dequeue (or subsequent
2415 	 * interrupt for secondary ports).
2416 	 */
2417 	execlists->queue_priority_hint = queue_prio(execlists);
2418 
2419 	if (submit) {
2420 		*port = execlists_schedule_in(last, port - execlists->pending);
2421 		execlists->switch_priority_hint =
2422 			switch_prio(engine, *execlists->pending);
2423 
2424 		/*
2425 		 * Skip if we ended up with exactly the same set of requests,
2426 		 * e.g. trying to timeslice a pair of ordered contexts
2427 		 */
2428 		if (!memcmp(active, execlists->pending,
2429 			    (port - execlists->pending + 1) * sizeof(*port))) {
2430 			do
2431 				execlists_schedule_out(fetch_and_zero(port));
2432 			while (port-- != execlists->pending);
2433 
2434 			goto skip_submit;
2435 		}
2436 		clear_ports(port + 1, last_port - port);
2437 
2438 		WRITE_ONCE(execlists->yield, -1);
2439 		set_preempt_timeout(engine, *active);
2440 		execlists_submit_ports(engine);
2441 	} else {
2442 		start_timeslice(engine, execlists->queue_priority_hint);
2443 skip_submit:
2444 		ring_set_paused(engine, 0);
2445 	}
2446 }
2447 
2448 static void
2449 cancel_port_requests(struct intel_engine_execlists * const execlists)
2450 {
2451 	struct i915_request * const *port;
2452 
2453 	for (port = execlists->pending; *port; port++)
2454 		execlists_schedule_out(*port);
2455 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2456 
2457 	/* Mark the end of active before we overwrite *active */
2458 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2459 		execlists_schedule_out(*port);
2460 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2461 
2462 	smp_wmb(); /* complete the seqlock for execlists_active() */
2463 	WRITE_ONCE(execlists->active, execlists->inflight);
2464 }
2465 
2466 static inline void
2467 invalidate_csb_entries(const u64 *first, const u64 *last)
2468 {
2469 	clflush((void *)first);
2470 	clflush((void *)last);
2471 }
2472 
2473 /*
2474  * Starting with Gen12, the status has a new format:
2475  *
2476  *     bit  0:     switched to new queue
2477  *     bit  1:     reserved
2478  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2479  *                 switch detail is set to "wait on semaphore"
2480  *     bits 3-5:   engine class
2481  *     bits 6-11:  engine instance
2482  *     bits 12-14: reserved
2483  *     bits 15-25: sw context id of the lrc the GT switched to
2484  *     bits 26-31: sw counter of the lrc the GT switched to
2485  *     bits 32-35: context switch detail
2486  *                  - 0: ctx complete
2487  *                  - 1: wait on sync flip
2488  *                  - 2: wait on vblank
2489  *                  - 3: wait on scanline
2490  *                  - 4: wait on semaphore
2491  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2492  *                       WAIT_FOR_EVENT)
2493  *     bit  36:    reserved
2494  *     bits 37-43: wait detail (for switch detail 1 to 4)
2495  *     bits 44-46: reserved
2496  *     bits 47-57: sw context id of the lrc the GT switched away from
2497  *     bits 58-63: sw counter of the lrc the GT switched away from
2498  */
2499 static inline bool gen12_csb_parse(const u64 csb)
2500 {
2501 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
2502 	bool new_queue =
2503 		lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2504 
2505 	/*
2506 	 * The context switch detail is not guaranteed to be 5 when a preemption
2507 	 * occurs, so we can't just check for that. The check below works for
2508 	 * all the cases we care about, including preemptions of WAIT
2509 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2510 	 * would require some extra handling, but we don't support that.
2511 	 */
2512 	if (!ctx_away_valid || new_queue) {
2513 		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
2514 		return true;
2515 	}
2516 
2517 	/*
2518 	 * switch detail = 5 is covered by the case above and we do not expect a
2519 	 * context switch on an unsuccessful wait instruction since we always
2520 	 * use polling mode.
2521 	 */
2522 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
2523 	return false;
2524 }
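
/*
 * Illustrative sketch (not used anywhere in the driver): extracting two of
 * the Gen12 CSB fields documented above with plain shifts and masks. The
 * helper names are hypothetical and only serve to make the bit layout
 * concrete.
 */
static inline u32 example_gen12_csb_switch_detail(u64 csb)
{
	/* bits 32-35: context switch detail */
	return (csb >> 32) & GENMASK(3, 0);
}

static inline u32 example_gen12_csb_away_ctx_id(u64 csb)
{
	/* bits 47-57: sw context id of the lrc the GT switched away from */
	return (csb >> 47) & GENMASK(10, 0);
}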
2525 
2526 static inline bool gen8_csb_parse(const u64 csb)
2527 {
2528 	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2529 }
2530 
2531 static noinline u64
2532 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2533 {
2534 	u64 entry;
2535 
2536 	/*
2537 	 * Reading from the HWSP has one particular advantage: we can detect
2538 	 * a stale entry. Since the write into HWSP is broken, we have no reason
2539 	 * to trust the HW at all; the mmio entry may equally be unordered, so
2540 	 * we prefer the path that is self-checking and, as a last resort,
2541 	 * return the mmio value.
2542 	 *
2543 	 * tgl,dg1:HSDES#22011327657
2544 	 */
2545 	preempt_disable();
2546 	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) {
2547 		int idx = csb - engine->execlists.csb_status;
2548 		int status;
2549 
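		/*
		 * The mmio CSB mirror is split across two ranges of six
		 * entries each, so indices 6..11 live in STATUS_BUF2.
		 */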
2550 		status = GEN8_EXECLISTS_STATUS_BUF;
2551 		if (idx >= 6) {
2552 			status = GEN11_EXECLISTS_STATUS_BUF2;
2553 			idx -= 6;
2554 		}
2555 		status += sizeof(u64) * idx;
2556 
2557 		entry = intel_uncore_read64(engine->uncore,
2558 					    _MMIO(engine->mmio_base + status));
2559 	}
2560 	preempt_enable();
2561 
2562 	return entry;
2563 }
2564 
2565 static inline u64
2566 csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2567 {
2568 	u64 entry = READ_ONCE(*csb);
2569 
2570 	/*
2571 	 * Unfortunately, the GPU does not always serialise its write
2572 	 * of the CSB entries before its write of the CSB pointer, at least
2573 	 * from the perspective of the CPU, using what is known as a Global
2574 	 * Observation Point. We may read a new CSB tail pointer, but then
2575 	 * read the stale CSB entries, causing us to misinterpret the
2576 	 * context-switch events, and eventually declare the GPU hung.
2577 	 *
2578 	 * icl:HSDES#1806554093
2579 	 * tgl:HSDES#22011248461
2580 	 */
2581 	if (unlikely(entry == -1))
2582 		entry = wa_csb_read(engine, csb);
2583 
2584 	/* Consume this entry so that we can spot its future reuse. */
2585 	WRITE_ONCE(*csb, -1);
2586 
2587 	/* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
2588 	return entry;
2589 }
2590 
2591 static void process_csb(struct intel_engine_cs *engine)
2592 {
2593 	struct intel_engine_execlists * const execlists = &engine->execlists;
2594 	u64 * const buf = execlists->csb_status;
2595 	const u8 num_entries = execlists->csb_size;
2596 	u8 head, tail;
2597 
2598 	/*
2599 	 * As we modify our execlists state tracking we require exclusive
2600 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2601 	 * and we assume that is only inside the reset paths and so serialised.
2602 	 */
2603 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2604 		   !reset_in_progress(execlists));
2605 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2606 
2607 	/*
2608 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2609 	 * When reading from the csb_write mmio register, we have to be
2610 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2611 	 * the low 4 bits. As it happens we know the next 4 bits are always
2612 	 * zero and so we can simply mask off the low u8 of the register
2613 	 * and treat it identically to reading from the HWSP (without having
2614 	 * to use explicit shifting and masking, and probably bifurcating
2615 	 * the code to handle the legacy mmio read).
2616 	 */
2617 	head = execlists->csb_head;
2618 	tail = READ_ONCE(*execlists->csb_write);
2619 	if (unlikely(head == tail))
2620 		return;
2621 
2622 	/*
2623 	 * We will consume all events from HW, or at least pretend to.
2624 	 *
2625 	 * The sequence of events from the HW is deterministic, and derived
2626 	 * from our writes to the ELSP, with a smidgen of variability for
2627 	 * the arrival of the asynchronous requests wrt the inflight
2628 	 * execution. If the HW sends an event that does not correspond with
2629 	 * the one we are expecting, we have to abandon all hope as we lose
2630 	 * all tracking of what the engine is actually executing. We will
2631 	 * only detect we are out of sequence with the HW when we get an
2632 	 * 'impossible' event because we have already drained our own
2633 	 * preemption/promotion queue. If this occurs, we know that we likely
2634 	 * lost track of execution earlier and must unwind and restart; the
2635 	 * simplest way is to stop processing the event queue and force the
2636 	 * engine to reset.
2637 	 */
2638 	execlists->csb_head = tail;
2639 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2640 
2641 	/*
2642 	 * Hopefully paired with a wmb() in HW!
2643 	 *
2644 	 * We must complete the read of the write pointer before any reads
2645 	 * from the CSB, so that we do not see stale values. Without an rmb
2646 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2647 	 * we perform the READ_ONCE(*csb_write).
2648 	 */
2649 	rmb();
2650 	do {
2651 		bool promote;
2652 		u64 csb;
2653 
2654 		if (++head == num_entries)
2655 			head = 0;
2656 
2657 		/*
2658 		 * We are flying near dragons again.
2659 		 *
2660 		 * We hold a reference to the request in execlist_port[]
2661 		 * but no more than that. We are operating in softirq
2662 		 * context and so cannot hold any mutex or sleep. That
2663 		 * prevents us stopping the requests we are processing
2664 		 * in port[] from being retired simultaneously (the
2665 		 * breadcrumb will be complete before we see the
2666 		 * context-switch). As we only hold the reference to the
2667 		 * request, any pointer chasing underneath the request
2668 		 * is subject to a potential use-after-free. Thus we
2669 		 * store all of the bookkeeping within port[] as
2670 		 * required, and avoid using unguarded pointers beneath
2671 		 * request itself. The same applies to the atomic
2672 		 * status notifier.
2673 		 */
2674 
2675 		csb = csb_read(engine, buf + head);
2676 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2677 			     head, upper_32_bits(csb), lower_32_bits(csb));
2678 
2679 		if (INTEL_GEN(engine->i915) >= 12)
2680 			promote = gen12_csb_parse(csb);
2681 		else
2682 			promote = gen8_csb_parse(csb);
2683 		if (promote) {
2684 			struct i915_request * const *old = execlists->active;
2685 
2686 			if (GEM_WARN_ON(!*execlists->pending)) {
2687 				execlists->error_interrupt |= ERROR_CSB;
2688 				break;
2689 			}
2690 
2691 			ring_set_paused(engine, 0);
2692 
2693 			/* Point active to the new ELSP; prevent overwriting */
2694 			WRITE_ONCE(execlists->active, execlists->pending);
2695 			smp_wmb(); /* notify execlists_active() */
2696 
2697 			/* cancel old inflight, prepare for switch */
2698 			trace_ports(execlists, "preempted", old);
2699 			while (*old)
2700 				execlists_schedule_out(*old++);
2701 
2702 			/* switch pending to inflight */
2703 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2704 			copy_ports(execlists->inflight,
2705 				   execlists->pending,
2706 				   execlists_num_ports(execlists));
2707 			smp_wmb(); /* complete the seqlock */
2708 			WRITE_ONCE(execlists->active, execlists->inflight);
2709 
2710 			/* XXX Magic delay for tgl */
2711 			ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2712 
2713 			WRITE_ONCE(execlists->pending[0], NULL);
2714 		} else {
2715 			if (GEM_WARN_ON(!*execlists->active)) {
2716 				execlists->error_interrupt |= ERROR_CSB;
2717 				break;
2718 			}
2719 
2720 			/* port0 completed, advanced to port1 */
2721 			trace_ports(execlists, "completed", execlists->active);
2722 
2723 			/*
2724 			 * We rely on the hardware being strongly
2725 			 * ordered, that the breadcrumb write is
2726 			 * coherent (visible from the CPU) before the
2727 			 * user interrupt is processed. One might assume
2728 			 * that the breadcrumb write being before the
2729 			 * user interrupt and the CS event for the context
2730 			 * switch would therefore be before the CS event
2731 			 * itself...
2732 			 */
2733 			if (GEM_SHOW_DEBUG() &&
2734 			    !i915_request_completed(*execlists->active)) {
2735 				struct i915_request *rq = *execlists->active;
2736 				const u32 *regs __maybe_unused =
2737 					rq->context->lrc_reg_state;
2738 
2739 				ENGINE_TRACE(engine,
2740 					     "context completed before request!\n");
2741 				ENGINE_TRACE(engine,
2742 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2743 					     ENGINE_READ(engine, RING_START),
2744 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2745 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2746 					     ENGINE_READ(engine, RING_CTL),
2747 					     ENGINE_READ(engine, RING_MI_MODE));
2748 				ENGINE_TRACE(engine,
2749 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2750 					     i915_ggtt_offset(rq->ring->vma),
2751 					     rq->head, rq->tail,
2752 					     rq->fence.context,
2753 					     lower_32_bits(rq->fence.seqno),
2754 					     hwsp_seqno(rq));
2755 				ENGINE_TRACE(engine,
2756 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2757 					     regs[CTX_RING_START],
2758 					     regs[CTX_RING_HEAD],
2759 					     regs[CTX_RING_TAIL]);
2760 			}
2761 
2762 			execlists_schedule_out(*execlists->active++);
2763 
2764 			GEM_BUG_ON(execlists->active - execlists->inflight >
2765 				   execlists_num_ports(execlists));
2766 		}
2767 	} while (head != tail);
2768 
2769 	set_timeslice(engine);
2770 
2771 	/*
2772 	 * Gen11 has proven to fail wrt global observation point between
2773 	 * entry and tail update, failing on the ordering and thus
2774 	 * we see an old entry in the context status buffer.
2775 	 *
2776 	 * Forcibly evict the cached entries before the next GPU CSB update,
2777 	 * to increase the odds that we read fresh entries from the
2778 	 * non-working hardware. The cost of doing so comes out mostly in
2779 	 * the wash, as the hardware, working or not, will need to do the
2780 	 * invalidation beforehand anyway.
2781 	 */
2782 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2783 }
2784 
2785 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2786 {
2787 	lockdep_assert_held(&engine->active.lock);
2788 	if (!READ_ONCE(engine->execlists.pending[0])) {
2789 		rcu_read_lock(); /* protect peeking at execlists->active */
2790 		execlists_dequeue(engine);
2791 		rcu_read_unlock();
2792 	}
2793 }
2794 
2795 static void __execlists_hold(struct i915_request *rq)
2796 {
2797 	LIST_HEAD(list);
2798 
2799 	do {
2800 		struct i915_dependency *p;
2801 
2802 		if (i915_request_is_active(rq))
2803 			__i915_request_unsubmit(rq);
2804 
2805 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2806 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2807 		i915_request_set_hold(rq);
2808 		RQ_TRACE(rq, "on hold\n");
2809 
2810 		for_each_waiter(p, rq) {
2811 			struct i915_request *w =
2812 				container_of(p->waiter, typeof(*w), sched);
2813 
2814 			/* Leave semaphores spinning on the other engines */
2815 			if (w->engine != rq->engine)
2816 				continue;
2817 
2818 			if (!i915_request_is_ready(w))
2819 				continue;
2820 
2821 			if (i915_request_completed(w))
2822 				continue;
2823 
2824 			if (i915_request_on_hold(w))
2825 				continue;
2826 
2827 			list_move_tail(&w->sched.link, &list);
2828 		}
2829 
2830 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2831 	} while (rq);
2832 }
2833 
2834 static bool execlists_hold(struct intel_engine_cs *engine,
2835 			   struct i915_request *rq)
2836 {
2837 	spin_lock_irq(&engine->active.lock);
2838 
2839 	if (i915_request_completed(rq)) { /* too late! */
2840 		rq = NULL;
2841 		goto unlock;
2842 	}
2843 
2844 	if (rq->engine != engine) { /* preempted virtual engine */
2845 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2846 
2847 		/*
2848 		 * intel_context_inflight() is only protected by virtue
2849 		 * of process_csb() being called only by the tasklet (or
2850 		 * directly from inside reset while the tasklet is suspended).
2851 		 * Assert that neither of those are allowed to run while we
2852 		 * poke at the request queues.
2853 		 */
2854 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2855 
2856 		/*
2857 		 * An unsubmitted request along a virtual engine will
2858 		 * remain on the active (this) engine until we are able
2859 		 * to process the context switch away (and so mark the
2860 		 * context as no longer in flight). That cannot have happened
2861 		 * yet, otherwise we would not be hanging!
2862 		 */
2863 		spin_lock(&ve->base.active.lock);
2864 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2865 		GEM_BUG_ON(ve->request != rq);
2866 		ve->request = NULL;
2867 		spin_unlock(&ve->base.active.lock);
2868 		i915_request_put(rq);
2869 
2870 		rq->engine = engine;
2871 	}
2872 
2873 	/*
2874 	 * Transfer this request onto the hold queue to prevent it
2875 	 * being resubmitted to HW (and potentially completed) before we have
2876 	 * released it. Since we may have already submitted following
2877 	 * requests, we need to remove those as well.
2878 	 */
2879 	GEM_BUG_ON(i915_request_on_hold(rq));
2880 	GEM_BUG_ON(rq->engine != engine);
2881 	__execlists_hold(rq);
2882 	GEM_BUG_ON(list_empty(&engine->active.hold));
2883 
2884 unlock:
2885 	spin_unlock_irq(&engine->active.lock);
2886 	return rq;
2887 }
2888 
2889 static bool hold_request(const struct i915_request *rq)
2890 {
2891 	struct i915_dependency *p;
2892 	bool result = false;
2893 
2894 	/*
2895 	 * If one of our ancestors is on hold, we must also be on hold,
2896 	 * otherwise we will bypass it and execute before it.
2897 	 */
2898 	rcu_read_lock();
2899 	for_each_signaler(p, rq) {
2900 		const struct i915_request *s =
2901 			container_of(p->signaler, typeof(*s), sched);
2902 
2903 		if (s->engine != rq->engine)
2904 			continue;
2905 
2906 		result = i915_request_on_hold(s);
2907 		if (result)
2908 			break;
2909 	}
2910 	rcu_read_unlock();
2911 
2912 	return result;
2913 }
2914 
2915 static void __execlists_unhold(struct i915_request *rq)
2916 {
2917 	LIST_HEAD(list);
2918 
2919 	do {
2920 		struct i915_dependency *p;
2921 
2922 		RQ_TRACE(rq, "hold release\n");
2923 
2924 		GEM_BUG_ON(!i915_request_on_hold(rq));
2925 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2926 
2927 		i915_request_clear_hold(rq);
2928 		list_move_tail(&rq->sched.link,
2929 			       i915_sched_lookup_priolist(rq->engine,
2930 							  rq_prio(rq)));
2931 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2932 
2933 		/* Also release any children on this engine that are ready */
2934 		for_each_waiter(p, rq) {
2935 			struct i915_request *w =
2936 				container_of(p->waiter, typeof(*w), sched);
2937 
2938 			/* Propagate any change in error status */
2939 			if (rq->fence.error)
2940 				i915_request_set_error_once(w, rq->fence.error);
2941 
2942 			if (w->engine != rq->engine)
2943 				continue;
2944 
2945 			if (!i915_request_on_hold(w))
2946 				continue;
2947 
2948 			/* Check that no other parents are also on hold */
2949 			if (hold_request(w))
2950 				continue;
2951 
2952 			list_move_tail(&w->sched.link, &list);
2953 		}
2954 
2955 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2956 	} while (rq);
2957 }
2958 
2959 static void execlists_unhold(struct intel_engine_cs *engine,
2960 			     struct i915_request *rq)
2961 {
2962 	spin_lock_irq(&engine->active.lock);
2963 
2964 	/*
2965 	 * Move this request back to the priority queue, and all of its
2966 	 * children and grandchildren that were suspended along with it.
2967 	 */
2968 	__execlists_unhold(rq);
2969 
2970 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2971 		engine->execlists.queue_priority_hint = rq_prio(rq);
2972 		tasklet_hi_schedule(&engine->execlists.tasklet);
2973 	}
2974 
2975 	spin_unlock_irq(&engine->active.lock);
2976 }
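
/*
 * Note: within this file the hold/unhold pair brackets error capture --
 * execlists_capture() parks the guilty request with execlists_hold() and
 * execlists_capture_work() releases it with execlists_unhold() once the
 * coredump has been published.
 */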
2977 
2978 struct execlists_capture {
2979 	struct work_struct work;
2980 	struct i915_request *rq;
2981 	struct i915_gpu_coredump *error;
2982 };
2983 
2984 static void execlists_capture_work(struct work_struct *work)
2985 {
2986 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2987 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2988 	struct intel_engine_cs *engine = cap->rq->engine;
2989 	struct intel_gt_coredump *gt = cap->error->gt;
2990 	struct intel_engine_capture_vma *vma;
2991 
2992 	/* Compress all the objects attached to the request, slow! */
2993 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2994 	if (vma) {
2995 		struct i915_vma_compress *compress =
2996 			i915_vma_capture_prepare(gt);
2997 
2998 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2999 		i915_vma_capture_finish(gt, compress);
3000 	}
3001 
3002 	gt->simulated = gt->engine->simulated;
3003 	cap->error->simulated = gt->simulated;
3004 
3005 	/* Publish the error state, and announce it to the world */
3006 	i915_error_state_store(cap->error);
3007 	i915_gpu_coredump_put(cap->error);
3008 
3009 	/* Return this request and all that depend upon it for signaling */
3010 	execlists_unhold(engine, cap->rq);
3011 	i915_request_put(cap->rq);
3012 
3013 	kfree(cap);
3014 }
3015 
3016 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
3017 {
3018 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
3019 	struct execlists_capture *cap;
3020 
3021 	cap = kmalloc(sizeof(*cap), gfp);
3022 	if (!cap)
3023 		return NULL;
3024 
3025 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
3026 	if (!cap->error)
3027 		goto err_cap;
3028 
3029 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
3030 	if (!cap->error->gt)
3031 		goto err_gpu;
3032 
3033 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
3034 	if (!cap->error->gt->engine)
3035 		goto err_gt;
3036 
3037 	return cap;
3038 
3039 err_gt:
3040 	kfree(cap->error->gt);
3041 err_gpu:
3042 	kfree(cap->error);
3043 err_cap:
3044 	kfree(cap);
3045 	return NULL;
3046 }
3047 
3048 static struct i915_request *
3049 active_context(struct intel_engine_cs *engine, u32 ccid)
3050 {
3051 	const struct intel_engine_execlists * const el = &engine->execlists;
3052 	struct i915_request * const *port, *rq;
3053 
3054 	/*
3055 	 * Use the most recent result from process_csb(), but just in case
3056 	 * we trigger an error (via interrupt) before the first CS event has
3057 	 * been written, peek at the next submission.
3058 	 */
3059 
3060 	for (port = el->active; (rq = *port); port++) {
3061 		if (rq->context->lrc.ccid == ccid) {
3062 			ENGINE_TRACE(engine,
3063 				     "ccid found at active:%zd\n",
3064 				     port - el->active);
3065 			return rq;
3066 		}
3067 	}
3068 
3069 	for (port = el->pending; (rq = *port); port++) {
3070 		if (rq->context->lrc.ccid == ccid) {
3071 			ENGINE_TRACE(engine,
3072 				     "ccid found at pending:%zd\n",
3073 				     port - el->pending);
3074 			return rq;
3075 		}
3076 	}
3077 
3078 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3079 	return NULL;
3080 }
3081 
3082 static u32 active_ccid(struct intel_engine_cs *engine)
3083 {
3084 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3085 }
3086 
3087 static void execlists_capture(struct intel_engine_cs *engine)
3088 {
3089 	struct execlists_capture *cap;
3090 
3091 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3092 		return;
3093 
3094 	/*
3095 	 * We need to _quickly_ capture the engine state before we reset.
3096 	 * We are inside an atomic section (softirq) here and we are delaying
3097 	 * the forced preemption event.
3098 	 */
3099 	cap = capture_regs(engine);
3100 	if (!cap)
3101 		return;
3102 
3103 	spin_lock_irq(&engine->active.lock);
3104 	cap->rq = active_context(engine, active_ccid(engine));
3105 	if (cap->rq) {
3106 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3107 		cap->rq = i915_request_get_rcu(cap->rq);
3108 	}
3109 	spin_unlock_irq(&engine->active.lock);
3110 	if (!cap->rq)
3111 		goto err_free;
3112 
3113 	/*
3114 	 * Remove the request from the execlists queue, and take ownership
3115 	 * of the request. We pass it to our worker who will _slowly_ compress
3116 	 * all the pages the _user_ requested for debugging their batch, after
3117 	 * which we return it to the queue for signaling.
3118 	 *
3119 	 * By removing them from the execlists queue, we also prevent the
3120 	 * requests from being processed by __unwind_incomplete_requests()
3121 	 * during the intel_engine_reset(), and so they will *not* be replayed
3122 	 * afterwards.
3123 	 *
3124 	 * Note that because we have not yet reset the engine at this point,
3125 	 * it is possible that the request we have identified as being
3126 	 * guilty did in fact complete, and we will then hit an arbitration
3127 	 * point allowing the outstanding preemption to succeed. The likelihood
3128 	 * of that is very low (as capturing of the engine registers should be
3129 	 * fast enough to run inside an irq-off atomic section!), so we will
3130 	 * simply hold that request accountable for being non-preemptible
3131 	 * long enough to force the reset.
3132 	 */
3133 	if (!execlists_hold(engine, cap->rq))
3134 		goto err_rq;
3135 
3136 	INIT_WORK(&cap->work, execlists_capture_work);
3137 	schedule_work(&cap->work);
3138 	return;
3139 
3140 err_rq:
3141 	i915_request_put(cap->rq);
3142 err_free:
3143 	i915_gpu_coredump_put(cap->error);
3144 	kfree(cap);
3145 }
3146 
3147 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3148 {
3149 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3150 	unsigned long *lock = &engine->gt->reset.flags;
3151 
3152 	if (!intel_has_reset_engine(engine->gt))
3153 		return;
3154 
3155 	if (test_and_set_bit(bit, lock))
3156 		return;
3157 
3158 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3159 
3160 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3161 	tasklet_disable_nosync(&engine->execlists.tasklet);
3162 
3163 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3164 	execlists_capture(engine);
3165 	intel_engine_reset(engine, msg);
3166 
3167 	tasklet_enable(&engine->execlists.tasklet);
3168 	clear_and_wake_up_bit(bit, lock);
3169 }
3170 
3171 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3172 {
3173 	const struct timer_list *t = &engine->execlists.preempt;
3174 
3175 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3176 		return false;
3177 
3178 	if (!timer_expired(t))
3179 		return false;
3180 
3181 	return READ_ONCE(engine->execlists.pending[0]);
3182 }
3183 
3184 /*
3185  * Check the unread Context Status Buffers and manage the submission of new
3186  * contexts to the ELSP accordingly.
3187  */
3188 static void execlists_submission_tasklet(unsigned long data)
3189 {
3190 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3191 	bool timeout = preempt_timeout(engine);
3192 
3193 	process_csb(engine);
3194 
3195 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3196 		const char *msg;
3197 
3198 		/* Generate the error message in priority wrt the user! */
3199 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3200 			msg = "CS error"; /* thrown by a user payload */
3201 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3202 			msg = "invalid CSB event";
3203 		else
3204 			msg = "internal error";
3205 
3206 		engine->execlists.error_interrupt = 0;
3207 		execlists_reset(engine, msg);
3208 	}
3209 
3210 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3211 		unsigned long flags;
3212 
3213 		spin_lock_irqsave(&engine->active.lock, flags);
3214 		__execlists_submission_tasklet(engine);
3215 		spin_unlock_irqrestore(&engine->active.lock, flags);
3216 
3217 		/* Recheck after serialising with direct-submission */
3218 		if (unlikely(timeout && preempt_timeout(engine)))
3219 			execlists_reset(engine, "preemption time out");
3220 	}
3221 }
3222 
3223 static void __execlists_kick(struct intel_engine_execlists *execlists)
3224 {
3225 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3226 	tasklet_hi_schedule(&execlists->tasklet);
3227 }
3228 
3229 #define execlists_kick(t, member) \
3230 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3231 
3232 static void execlists_timeslice(struct timer_list *timer)
3233 {
3234 	execlists_kick(timer, timer);
3235 }
3236 
3237 static void execlists_preempt(struct timer_list *timer)
3238 {
3239 	execlists_kick(timer, preempt);
3240 }
3241 
3242 static void queue_request(struct intel_engine_cs *engine,
3243 			  struct i915_request *rq)
3244 {
3245 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3246 	list_add_tail(&rq->sched.link,
3247 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3248 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3249 }
3250 
3251 static void __submit_queue_imm(struct intel_engine_cs *engine)
3252 {
3253 	struct intel_engine_execlists * const execlists = &engine->execlists;
3254 
3255 	if (reset_in_progress(execlists))
3256 		return; /* defer until we restart the engine following reset */
3257 
3258 	__execlists_submission_tasklet(engine);
3259 }
3260 
3261 static void submit_queue(struct intel_engine_cs *engine,
3262 			 const struct i915_request *rq)
3263 {
3264 	struct intel_engine_execlists *execlists = &engine->execlists;
3265 
3266 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3267 		return;
3268 
3269 	execlists->queue_priority_hint = rq_prio(rq);
3270 	__submit_queue_imm(engine);
3271 }
3272 
3273 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3274 			     const struct i915_request *rq)
3275 {
3276 	GEM_BUG_ON(i915_request_on_hold(rq));
3277 	return !list_empty(&engine->active.hold) && hold_request(rq);
3278 }
3279 
3280 static void flush_csb(struct intel_engine_cs *engine)
3281 {
3282 	struct intel_engine_execlists *el = &engine->execlists;
3283 
3284 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3285 		if (!reset_in_progress(el))
3286 			process_csb(engine);
3287 		tasklet_unlock(&el->tasklet);
3288 	}
3289 }
3290 
3291 static void execlists_submit_request(struct i915_request *request)
3292 {
3293 	struct intel_engine_cs *engine = request->engine;
3294 	unsigned long flags;
3295 
3296 	/* Hopefully we clear execlists->pending[] to let us through */
3297 	flush_csb(engine);
3298 
3299 	/* Will be called from irq-context when using foreign fences. */
3300 	spin_lock_irqsave(&engine->active.lock, flags);
3301 
3302 	if (unlikely(ancestor_on_hold(engine, request))) {
3303 		RQ_TRACE(request, "ancestor on hold\n");
3304 		list_add_tail(&request->sched.link, &engine->active.hold);
3305 		i915_request_set_hold(request);
3306 	} else {
3307 		queue_request(engine, request);
3308 
3309 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3310 		GEM_BUG_ON(list_empty(&request->sched.link));
3311 
3312 		submit_queue(engine, request);
3313 	}
3314 
3315 	spin_unlock_irqrestore(&engine->active.lock, flags);
3316 }
3317 
3318 static void __execlists_context_fini(struct intel_context *ce)
3319 {
3320 	intel_ring_put(ce->ring);
3321 	i915_vma_put(ce->state);
3322 }
3323 
3324 static void execlists_context_destroy(struct kref *kref)
3325 {
3326 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3327 
3328 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3329 	GEM_BUG_ON(intel_context_is_pinned(ce));
3330 
3331 	if (ce->state)
3332 		__execlists_context_fini(ce);
3333 
3334 	intel_context_fini(ce);
3335 	intel_context_free(ce);
3336 }
3337 
3338 static void
3339 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3340 {
3341 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3342 		return;
3343 
3344 	vaddr += engine->context_size;
3345 
3346 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3347 }
3348 
3349 static void
3350 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3351 {
3352 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3353 		return;
3354 
3355 	vaddr += engine->context_size;
3356 
3357 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3358 		drm_err_once(&engine->i915->drm,
3359 			     "%s context redzone overwritten!\n",
3360 			     engine->name);
3361 }
3362 
3363 static void execlists_context_unpin(struct intel_context *ce)
3364 {
3365 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3366 		      ce->engine);
3367 }
3368 
3369 static void execlists_context_post_unpin(struct intel_context *ce)
3370 {
3371 	i915_gem_object_unpin_map(ce->state->obj);
3372 }
3373 
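/*
 * Reload RING_CTX_TIMESTAMP from the value saved in the context image,
 * staged through CS GPR0 (the register-register move is issued twice).
 */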
3374 static u32 *
3375 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3376 {
3377 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3378 		MI_SRM_LRM_GLOBAL_GTT |
3379 		MI_LRI_LRM_CS_MMIO;
3380 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3381 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3382 		CTX_TIMESTAMP * sizeof(u32);
3383 	*cs++ = 0;
3384 
3385 	*cs++ = MI_LOAD_REGISTER_REG |
3386 		MI_LRR_SOURCE_CS_MMIO |
3387 		MI_LRI_LRM_CS_MMIO;
3388 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3389 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3390 
3391 	*cs++ = MI_LOAD_REGISTER_REG |
3392 		MI_LRR_SOURCE_CS_MMIO |
3393 		MI_LRI_LRM_CS_MMIO;
3394 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3395 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3396 
3397 	return cs;
3398 }
3399 
3400 static u32 *
3401 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3402 {
3403 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3404 
3405 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3406 		MI_SRM_LRM_GLOBAL_GTT |
3407 		MI_LRI_LRM_CS_MMIO;
3408 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3409 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3410 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3411 	*cs++ = 0;
3412 
3413 	return cs;
3414 }
3415 
3416 static u32 *
3417 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3418 {
3419 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3420 
3421 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3422 		MI_SRM_LRM_GLOBAL_GTT |
3423 		MI_LRI_LRM_CS_MMIO;
3424 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3425 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3426 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3427 	*cs++ = 0;
3428 
3429 	*cs++ = MI_LOAD_REGISTER_REG |
3430 		MI_LRR_SOURCE_CS_MMIO |
3431 		MI_LRI_LRM_CS_MMIO;
3432 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3433 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3434 
3435 	return cs;
3436 }
3437 
3438 static u32 *
3439 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3440 {
3441 	cs = gen12_emit_timestamp_wa(ce, cs);
3442 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3443 	cs = gen12_emit_restore_scratch(ce, cs);
3444 
3445 	return cs;
3446 }
3447 
3448 static u32 *
3449 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3450 {
3451 	cs = gen12_emit_timestamp_wa(ce, cs);
3452 	cs = gen12_emit_restore_scratch(ce, cs);
3453 
3454 	return cs;
3455 }
3456 
3457 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3458 {
3459 	return PAGE_SIZE * ce->wa_bb_page;
3460 }
3461 
3462 static u32 *context_indirect_bb(const struct intel_context *ce)
3463 {
3464 	void *ptr;
3465 
3466 	GEM_BUG_ON(!ce->wa_bb_page);
3467 
3468 	ptr = ce->lrc_reg_state;
3469 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3470 	ptr += context_wa_bb_offset(ce);
3471 
3472 	return ptr;
3473 }
3474 
3475 static void
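/*
 * Emit the per-context indirect context buffer into the reserved wa_bb
 * page, pad to a cacheline with MI_NOOPs, and point the context image's
 * INDIRECT_CTX registers at it.
 */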
3476 setup_indirect_ctx_bb(const struct intel_context *ce,
3477 		      const struct intel_engine_cs *engine,
3478 		      u32 *(*emit)(const struct intel_context *, u32 *))
3479 {
3480 	u32 * const start = context_indirect_bb(ce);
3481 	u32 *cs;
3482 
3483 	cs = emit(ce, start);
3484 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3485 	while ((unsigned long)cs % CACHELINE_BYTES)
3486 		*cs++ = MI_NOOP;
3487 
3488 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3489 				    i915_ggtt_offset(ce->state) +
3490 				    context_wa_bb_offset(ce),
3491 				    (cs - start) * sizeof(*cs));
3492 }
3493 
3494 static void
3495 __execlists_update_reg_state(const struct intel_context *ce,
3496 			     const struct intel_engine_cs *engine,
3497 			     u32 head)
3498 {
3499 	struct intel_ring *ring = ce->ring;
3500 	u32 *regs = ce->lrc_reg_state;
3501 
3502 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3503 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3504 
3505 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3506 	regs[CTX_RING_HEAD] = head;
3507 	regs[CTX_RING_TAIL] = ring->tail;
3508 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3509 
3510 	/* RPCS */
3511 	if (engine->class == RENDER_CLASS) {
3512 		regs[CTX_R_PWR_CLK_STATE] =
3513 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3514 
3515 		i915_oa_init_reg_state(ce, engine);
3516 	}
3517 
3518 	if (ce->wa_bb_page) {
3519 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3520 
3521 		fn = gen12_emit_indirect_ctx_xcs;
3522 		if (ce->engine->class == RENDER_CLASS)
3523 			fn = gen12_emit_indirect_ctx_rcs;
3524 
3525 		/* Mutually exclusive wrt the global indirect bb */
3526 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3527 		setup_indirect_ctx_bb(ce, engine, fn);
3528 	}
3529 }
3530 
3531 static int
3532 execlists_context_pre_pin(struct intel_context *ce,
3533 			  struct i915_gem_ww_ctx *ww, void **vaddr)
3534 {
3535 	GEM_BUG_ON(!ce->state);
3536 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3537 
3538 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
3539 					i915_coherent_map_type(ce->engine->i915) |
3540 					I915_MAP_OVERRIDE);
3541 
3542 	return PTR_ERR_OR_ZERO(*vaddr);
3543 }
3544 
3545 static int
3546 __execlists_context_pin(struct intel_context *ce,
3547 			struct intel_engine_cs *engine,
3548 			void *vaddr)
3549 {
3550 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3551 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3552 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3553 
3554 	return 0;
3555 }
3556 
3557 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3558 {
3559 	return __execlists_context_pin(ce, ce->engine, vaddr);
3560 }
3561 
3562 static int execlists_context_alloc(struct intel_context *ce)
3563 {
3564 	return __execlists_context_alloc(ce, ce->engine);
3565 }
3566 
3567 static void execlists_context_reset(struct intel_context *ce)
3568 {
3569 	CE_TRACE(ce, "reset\n");
3570 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3571 
3572 	intel_ring_reset(ce->ring, ce->ring->emit);
3573 
3574 	/* Scrub away the garbage */
3575 	execlists_init_reg_state(ce->lrc_reg_state,
3576 				 ce, ce->engine, ce->ring, true);
3577 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3578 
3579 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3580 }
3581 
3582 static const struct intel_context_ops execlists_context_ops = {
3583 	.alloc = execlists_context_alloc,
3584 
3585 	.pre_pin = execlists_context_pre_pin,
3586 	.pin = execlists_context_pin,
3587 	.unpin = execlists_context_unpin,
3588 	.post_unpin = execlists_context_post_unpin,
3589 
3590 	.enter = intel_context_enter_engine,
3591 	.exit = intel_context_exit_engine,
3592 
3593 	.reset = execlists_context_reset,
3594 	.destroy = execlists_context_destroy,
3595 };
3596 
3597 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3598 {
3599 	u32 *cs;
3600 
3601 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3602 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3603 		return 0;
3604 
3605 	cs = intel_ring_begin(rq, 6);
3606 	if (IS_ERR(cs))
3607 		return PTR_ERR(cs);
3608 
3609 	/*
3610 	 * Check if we have been preempted before we even get started.
3611 	 *
3612 	 * After this point i915_request_started() reports true, even if
3613 	 * we get preempted and so are no longer running.
3614 	 */
3615 	*cs++ = MI_ARB_CHECK;
3616 	*cs++ = MI_NOOP;
3617 
3618 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3619 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3620 	*cs++ = 0;
3621 	*cs++ = rq->fence.seqno - 1;
3622 
3623 	intel_ring_advance(rq, cs);
3624 
3625 	/* Record the updated position of the request's payload */
3626 	rq->infix = intel_ring_offset(rq, cs);
3627 
3628 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3629 
3630 	return 0;
3631 }
3632 
3633 static int emit_pdps(struct i915_request *rq)
3634 {
3635 	const struct intel_engine_cs * const engine = rq->engine;
3636 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3637 	int err, i;
3638 	u32 *cs;
3639 
3640 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3641 
3642 	/*
3643 	 * Beware ye of the dragons, this sequence is magic!
3644 	 *
3645 	 * Small changes to this sequence can cause anything from
3646 	 * GPU hangs to forcewake errors and machine lockups!
3647 	 */
3648 
3649 	/* Flush any residual operations from the context load */
3650 	err = engine->emit_flush(rq, EMIT_FLUSH);
3651 	if (err)
3652 		return err;
3653 
3654 	/* Magic required to prevent forcewake errors! */
3655 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3656 	if (err)
3657 		return err;
3658 
3659 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3660 	if (IS_ERR(cs))
3661 		return PTR_ERR(cs);
3662 
3663 	/* Ensure the LRI has landed before we invalidate & continue */
3664 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3665 	for (i = GEN8_3LVL_PDPES; i--; ) {
3666 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3667 		u32 base = engine->mmio_base;
3668 
3669 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3670 		*cs++ = upper_32_bits(pd_daddr);
3671 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3672 		*cs++ = lower_32_bits(pd_daddr);
3673 	}
3674 	*cs++ = MI_NOOP;
3675 
3676 	intel_ring_advance(rq, cs);
3677 
3678 	return 0;
3679 }
3680 
3681 static int execlists_request_alloc(struct i915_request *request)
3682 {
3683 	int ret;
3684 
3685 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3686 
3687 	/*
3688 	 * Flush enough space to reduce the likelihood of waiting after
3689 	 * we start building the request - in which case we will just
3690 	 * have to repeat work.
3691 	 */
3692 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3693 
3694 	/*
3695 	 * Note that after this point, we have committed to using
3696 	 * this request as it is being used to both track the
3697 	 * state of engine initialisation and liveness of the
3698 	 * golden renderstate above. Think twice before you try
3699 	 * to cancel/unwind this request now.
3700 	 */
3701 
3702 	if (!i915_vm_is_4lvl(request->context->vm)) {
3703 		ret = emit_pdps(request);
3704 		if (ret)
3705 			return ret;
3706 	}
3707 
3708 	/* Unconditionally invalidate GPU caches and TLBs. */
3709 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3710 	if (ret)
3711 		return ret;
3712 
3713 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3714 	return 0;
3715 }
3716 
3717 /*
3718  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3719  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3720  * but there is a slight complication as this is applied in a WA batch where the
3721  * values are only initialized once so we cannot take the register value at the
3722  * beginning and reuse it further; hence we save its value to memory, upload a
3723  * constant value with bit 21 set and then restore it back with the saved value.
3724  * To simplify the WA, a constant value is formed by using the default value
3725  * of this register. This shouldn't be a problem because we are only modifying
3726  * it for a short period and this batch is non-preemptible. We could of course
3727  * use additional instructions that read the actual value of the register
3728  * at that time and set our bit of interest but it makes the WA complicated.
3729  *
3730  * This WA is also required for Gen9 so extracting as a function avoids
3731  * code duplication.
3732  */
3733 static u32 *
3734 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3735 {
3736 	/* NB no one else is allowed to scribble over scratch + 256! */
3737 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3738 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3739 	*batch++ = intel_gt_scratch_offset(engine->gt,
3740 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3741 	*batch++ = 0;
3742 
3743 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3744 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3745 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3746 
3747 	batch = gen8_emit_pipe_control(batch,
3748 				       PIPE_CONTROL_CS_STALL |
3749 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3750 				       0);
3751 
3752 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3753 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3754 	*batch++ = intel_gt_scratch_offset(engine->gt,
3755 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3756 	*batch++ = 0;
3757 
3758 	return batch;
3759 }
3760 
3761 /*
3762  * Typically we only have one indirect_ctx and per_ctx batch buffer, which are
3763  * initialized at the beginning and shared across all contexts, but this field
3764  * helps us to have multiple batches at different offsets and select them based
3765  * on some criterion. At the moment this batch always starts at the beginning of
3766  * the page and at this point we don't have multiple wa_ctx batch buffers.
3767  *
3768  * The number of WAs applied is not known at the beginning; we use this field
3769  * to return the number of DWORDS written.
3770  *
3771  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3772  * so it adds NOOPs as padding to make it cacheline aligned.
3773  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
3774  * together make a complete batch buffer.
3775  */
3776 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3777 {
3778 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3779 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3780 
3781 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3782 	if (IS_BROADWELL(engine->i915))
3783 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3784 
3785 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3786 	/* Actual scratch location is at 128 bytes offset */
3787 	batch = gen8_emit_pipe_control(batch,
3788 				       PIPE_CONTROL_FLUSH_L3 |
3789 				       PIPE_CONTROL_STORE_DATA_INDEX |
3790 				       PIPE_CONTROL_CS_STALL |
3791 				       PIPE_CONTROL_QW_WRITE,
3792 				       LRC_PPHWSP_SCRATCH_ADDR);
3793 
3794 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3795 
3796 	/* Pad to end of cacheline */
3797 	while ((unsigned long)batch % CACHELINE_BYTES)
3798 		*batch++ = MI_NOOP;
3799 
3800 	/*
3801 	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
3802 	 * execution depends on the length specified in terms of cache lines
3803 	 * in the register CTX_RCS_INDIRECT_CTX
3804 	 */
3805 
3806 	return batch;
3807 }
3808 
3809 struct lri {
3810 	i915_reg_t reg;
3811 	u32 value;
3812 };
3813 
3814 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3815 {
3816 	GEM_BUG_ON(!count || count > 63);
3817 
3818 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3819 	do {
3820 		*batch++ = i915_mmio_reg_offset(lri->reg);
3821 		*batch++ = lri->value;
3822 	} while (lri++, --count);
3823 	*batch++ = MI_NOOP;
3824 
3825 	return batch;
3826 }
3827 
3828 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3829 {
3830 	static const struct lri lri[] = {
3831 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3832 		{
3833 			COMMON_SLICE_CHICKEN2,
3834 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3835 				       0),
3836 		},
3837 
3838 		/* BSpec: 11391 */
3839 		{
3840 			FF_SLICE_CHICKEN,
3841 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3842 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3843 		},
3844 
3845 		/* BSpec: 11299 */
3846 		{
3847 			_3D_CHICKEN3,
3848 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3849 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3850 		}
3851 	};
3852 
3853 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3854 
3855 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3856 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3857 
3858 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3859 	batch = gen8_emit_pipe_control(batch,
3860 				       PIPE_CONTROL_FLUSH_L3 |
3861 				       PIPE_CONTROL_STORE_DATA_INDEX |
3862 				       PIPE_CONTROL_CS_STALL |
3863 				       PIPE_CONTROL_QW_WRITE,
3864 				       LRC_PPHWSP_SCRATCH_ADDR);
3865 
3866 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3867 
3868 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3869 	if (HAS_POOLED_EU(engine->i915)) {
3870 		/*
3871 		 * EU pool configuration is set up along with the golden context
3872 		 * during context initialization. This value depends on the
3873 		 * device type (2x6 or 3x6) and needs to be updated based
3874 		 * on which subslice is disabled, especially for 2x6
3875 		 * devices; however, it is safe to load the default
3876 		 * configuration of a 3x6 device instead of masking off the
3877 		 * corresponding bits because the HW ignores the bits of a
3878 		 * disabled subslice and drops down to the appropriate config.
3879 		 * Please see render_state_setup() in i915_gem_render_state.c
3880 		 * for the possible configurations; to avoid duplication they
3881 		 * are not shown here again.
3882 		 */
3883 		*batch++ = GEN9_MEDIA_POOL_STATE;
3884 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3885 		*batch++ = 0x00777000;
3886 		*batch++ = 0;
3887 		*batch++ = 0;
3888 		*batch++ = 0;
3889 	}
3890 
3891 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3892 
3893 	/* Pad to end of cacheline */
3894 	while ((unsigned long)batch % CACHELINE_BYTES)
3895 		*batch++ = MI_NOOP;
3896 
3897 	return batch;
3898 }
3899 
3900 static u32 *
3901 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3902 {
3903 	int i;
3904 
3905 	/*
3906 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3907 	 *
3908 	 * Ensure the engine is idle prior to programming a
3909 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3910 	 */
3911 	batch = gen8_emit_pipe_control(batch,
3912 				       PIPE_CONTROL_CS_STALL,
3913 				       0);
3914 	/*
3915 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3916 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3917 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3918 	 * confusing. Since gen8_emit_pipe_control() already advances the
3919 	 * batch by 6 dwords, we advance the other 10 here, completing a
3920 	 * cacheline. It's not clear if the workaround requires this padding
3921 	 * before other commands, or if it's just the regular padding we would
3922 	 * already have for the workaround bb, so leave it here for now.
3923 	 */
3924 	for (i = 0; i < 10; i++)
3925 		*batch++ = MI_NOOP;
3926 
3927 	/* Pad to end of cacheline */
3928 	while ((unsigned long)batch % CACHELINE_BYTES)
3929 		*batch++ = MI_NOOP;
3930 
3931 	return batch;
3932 }
3933 
3934 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3935 
3936 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3937 {
3938 	struct drm_i915_gem_object *obj;
3939 	struct i915_vma *vma;
3940 	int err;
3941 
3942 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3943 	if (IS_ERR(obj))
3944 		return PTR_ERR(obj);
3945 
3946 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3947 	if (IS_ERR(vma)) {
3948 		err = PTR_ERR(vma);
3949 		goto err;
3950 	}
3951 
3952 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3953 	if (err)
3954 		goto err;
3955 
3956 	engine->wa_ctx.vma = vma;
3957 	return 0;
3958 
3959 err:
3960 	i915_gem_object_put(obj);
3961 	return err;
3962 }
3963 
3964 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3965 {
3966 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3967 }
3968 
3969 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3970 
3971 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3972 {
3973 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3974 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3975 					    &wa_ctx->per_ctx };
3976 	wa_bb_func_t wa_bb_fn[2];
3977 	void *batch, *batch_ptr;
3978 	unsigned int i;
3979 	int ret;
3980 
3981 	if (engine->class != RENDER_CLASS)
3982 		return 0;
3983 
3984 	switch (INTEL_GEN(engine->i915)) {
3985 	case 12:
3986 	case 11:
3987 		return 0;
3988 	case 10:
3989 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3990 		wa_bb_fn[1] = NULL;
3991 		break;
3992 	case 9:
3993 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3994 		wa_bb_fn[1] = NULL;
3995 		break;
3996 	case 8:
3997 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3998 		wa_bb_fn[1] = NULL;
3999 		break;
4000 	default:
4001 		MISSING_CASE(INTEL_GEN(engine->i915));
4002 		return 0;
4003 	}
4004 
4005 	ret = lrc_setup_wa_ctx(engine);
4006 	if (ret) {
4007 		drm_dbg(&engine->i915->drm,
4008 			"Failed to setup context WA page: %d\n", ret);
4009 		return ret;
4010 	}
4011 
4012 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		/* Don't write batch commands through an ERR_PTR below. */
		lrc_destroy_wa_ctx(engine);
		return PTR_ERR(batch);
	}
4013 
4014 	/*
4015 	 * Emit the two workaround batch buffers, recording the offset from the
4016 	 * start of the workaround batch buffer object for each and their
4017 	 * respective sizes.
4018 	 */
4019 	batch_ptr = batch;
4020 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
4021 		wa_bb[i]->offset = batch_ptr - batch;
4022 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
4023 						  CACHELINE_BYTES))) {
4024 			ret = -EINVAL;
4025 			break;
4026 		}
4027 		if (wa_bb_fn[i])
4028 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4029 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4030 	}
4031 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4032 
4033 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4034 	__i915_gem_object_release_map(wa_ctx->vma->obj);
4035 	if (ret)
4036 		lrc_destroy_wa_ctx(engine);
4037 
4038 	return ret;
4039 }
4040 
4041 static void reset_csb_pointers(struct intel_engine_cs *engine)
4042 {
4043 	struct intel_engine_execlists * const execlists = &engine->execlists;
4044 	const unsigned int reset_value = execlists->csb_size - 1;
4045 
4046 	ring_set_paused(engine, 0);
4047 
4048 	/*
4049 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4050 	 * Bludgeon them with an mmio update to be sure.
4051 	 */
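	/*
	 * The high word of RING_CONTEXT_STATUS_PTR acts as a write mask, so
	 * 0xffff << 16 lets a single write update both the read and write
	 * pointer fields to the reset value.
	 */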
4052 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4053 		     0xffff << 16 | reset_value << 8 | reset_value);
4054 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4055 
4056 	/*
4057 	 * After a reset, the HW starts writing into CSB entry [0]. We
4058 	 * therefore have to set our HEAD pointer back one entry so that
4059 	 * the *first* entry we check is entry 0. To complicate this further,
4060 	 * as we don't wait for the first interrupt after reset, we have to
4061 	 * fake the HW write to point back to the last entry so that our
4062 	 * inline comparison of our cached head position against the last HW
4063 	 * write works even before the first interrupt.
4064 	 */
4065 	execlists->csb_head = reset_value;
4066 	WRITE_ONCE(*execlists->csb_write, reset_value);
4067 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4068 
4069 	/* Check that the GPU does indeed update the CSB entries! */
4070 	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4071 	invalidate_csb_entries(&execlists->csb_status[0],
4072 			       &execlists->csb_status[reset_value]);
4073 
4074 	/* Once more for luck and our trusty paranoia */
4075 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4076 		     0xffff << 16 | reset_value << 8 | reset_value);
4077 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4078 
4079 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4080 }
4081 
4082 static void execlists_sanitize(struct intel_engine_cs *engine)
4083 {
4084 	/*
4085 	 * Poison residual state on resume, in case the suspend didn't!
4086 	 *
4087 	 * We have to assume that across suspend/resume (or other loss
4088 	 * of control) that the contents of our pinned buffers have been
4089 	 * lost, replaced by garbage. Since this doesn't always happen,
4090 	 * let's poison such state so that we more quickly spot when
4091 	 * we falsely assume it has been preserved.
4092 	 */
4093 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4094 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4095 
4096 	reset_csb_pointers(engine);
4097 
4098 	/*
4099 	 * The kernel_context HWSP is stored in the status_page. As above,
4100 	 * that may be lost on resume/initialisation, and so we need to
4101 	 * reset the value in the HWSP.
4102 	 */
4103 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4104 
4105 	/* And scrub the dirty cachelines for the HWSP */
4106 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4107 }
4108 
4109 static void enable_error_interrupt(struct intel_engine_cs *engine)
4110 {
4111 	u32 status;
4112 
4113 	engine->execlists.error_interrupt = 0;
4114 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4115 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4116 
4117 	status = ENGINE_READ(engine, RING_ESR);
4118 	if (unlikely(status)) {
4119 		drm_err(&engine->i915->drm,
4120 			"engine '%s' resumed still in error: %08x\n",
4121 			engine->name, status);
4122 		__intel_gt_reset(engine->gt, engine->mask);
4123 	}
4124 
4125 	/*
4126 	 * On current gen8+, we have 2 signals to play with
4127 	 *
4128 	 * - I915_ERROR_INSTRUCTION (bit 0)
4129 	 *
4130 	 *    Generate an error if the command parser encounters an invalid
4131 	 *    instruction
4132 	 *
4133 	 *    This is a fatal error.
4134 	 *
4135 	 * - CP_PRIV (bit 2)
4136 	 *
4137 	 *    Generate an error on privilege violation (where the CP replaces
4138 	 *    the instruction with a no-op). This also fires for writes into
4139 	 *    read-only scratch pages.
4140 	 *
4141 	 *    This is a non-fatal error, parsing continues.
4142 	 *
4143 	 * * there are a few others defined for odd HW that we do not use
4144 	 *
4145 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4146 	 * error (as the HW is validating and suppressing the mistakes), we
4147 	 * only unmask the instruction error bit.
4148 	 */
4149 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4150 }
4151 
4152 static void enable_execlists(struct intel_engine_cs *engine)
4153 {
4154 	u32 mode;
4155 
4156 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4157 
4158 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4159 
4160 	if (INTEL_GEN(engine->i915) >= 11)
4161 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4162 	else
4163 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4164 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4165 
4166 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4167 
4168 	ENGINE_WRITE_FW(engine,
4169 			RING_HWS_PGA,
4170 			i915_ggtt_offset(engine->status_page.vma));
4171 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4172 
4173 	enable_error_interrupt(engine);
4174 
4175 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4176 }
4177 
4178 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4179 {
4180 	bool unexpected = false;
4181 
4182 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4183 		drm_dbg(&engine->i915->drm,
4184 			"STOP_RING still set in RING_MI_MODE\n");
4185 		unexpected = true;
4186 	}
4187 
4188 	return unexpected;
4189 }
4190 
4191 static int execlists_resume(struct intel_engine_cs *engine)
4192 {
4193 	intel_mocs_init_engine(engine);
4194 
4195 	intel_breadcrumbs_reset(engine->breadcrumbs);
4196 
4197 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4198 		struct drm_printer p = drm_debug_printer(__func__);
4199 
4200 		intel_engine_dump(engine, &p, NULL);
4201 	}
4202 
4203 	enable_execlists(engine);
4204 
4205 	return 0;
4206 }
4207 
4208 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4209 {
4210 	struct intel_engine_execlists * const execlists = &engine->execlists;
4211 	unsigned long flags;
4212 
4213 	ENGINE_TRACE(engine, "depth<-%d\n",
4214 		     atomic_read(&execlists->tasklet.count));
4215 
4216 	/*
4217 	 * Prevent request submission to the hardware until we have
4218 	 * completed the reset in i915_gem_reset_finish(). If a request
4219 	 * is completed by one engine, it may then queue a request
4220 	 * to a second via its execlists->tasklet *just* as we are
4221 	 * calling engine->resume() and also writing the ELSP.
4222 	 * Turning off the execlists->tasklet until the reset is over
4223 	 * prevents the race.
4224 	 */
4225 	__tasklet_disable_sync_once(&execlists->tasklet);
4226 	GEM_BUG_ON(!reset_in_progress(execlists));
4227 
4228 	/* And flush any current direct submission. */
4229 	spin_lock_irqsave(&engine->active.lock, flags);
4230 	spin_unlock_irqrestore(&engine->active.lock, flags);
4231 
4232 	/*
4233 	 * We stop the engines; otherwise we might get a failed reset and a
4234 	 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer a
4235 	 * system hang if a batchbuffer is in progress when
4236 	 * the reset is issued, regardless of the READY_TO_RESET ack.
4237 	 * Thus we assume it is best to stop the engines on all gens
4238 	 * where we have a gpu reset.
4239 	 *
4240 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4241 	 *
4242 	 * FIXME: Wa for more modern gens needs to be validated
4243 	 */
4244 	ring_set_paused(engine, 1);
4245 	intel_engine_stop_cs(engine);
4246 
4247 	engine->execlists.reset_ccid = active_ccid(engine);
4248 }
4249 
4250 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4251 {
4252 	int x;
4253 
4254 	x = lrc_ring_mi_mode(engine);
4255 	if (x != -1) {
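		/*
		 * RING_MI_MODE is a masked register: setting STOP_RING << 16
		 * with the value bit clear (i.e. _MASKED_BIT_DISABLE) asks
		 * the context restore to clear STOP_RING.
		 */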
4256 		regs[x + 1] &= ~STOP_RING;
4257 		regs[x + 1] |= STOP_RING << 16;
4258 	}
4259 }
4260 
4261 static void __execlists_reset_reg_state(const struct intel_context *ce,
4262 					const struct intel_engine_cs *engine)
4263 {
4264 	u32 *regs = ce->lrc_reg_state;
4265 
4266 	__reset_stop_ring(regs, engine);
4267 }
4268 
4269 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4270 {
4271 	struct intel_engine_execlists * const execlists = &engine->execlists;
4272 	struct intel_context *ce;
4273 	struct i915_request *rq;
4274 	u32 head;
4275 
4276 	mb(); /* paranoia: read the CSB pointers from after the reset */
4277 	clflush(execlists->csb_write);
4278 	mb();
4279 
4280 	process_csb(engine); /* drain preemption events */
4281 
4282 	/* Following the reset, we need to reload the CSB read/write pointers */
4283 	reset_csb_pointers(engine);
4284 
4285 	/*
4286 	 * Save the currently executing context: even if we completed
4287 	 * its request, it was still running at the time of the
4288 	 * reset and will have been clobbered.
4289 	 */
4290 	rq = active_context(engine, engine->execlists.reset_ccid);
4291 	if (!rq)
4292 		goto unwind;
4293 
4294 	ce = rq->context;
4295 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4296 
4297 	if (i915_request_completed(rq)) {
4298 		/* Idle context; tidy up the ring so we can restart afresh */
4299 		head = intel_ring_wrap(ce->ring, rq->tail);
4300 		goto out_replay;
4301 	}
4302 
4303 	/* We still have requests in-flight; the engine should be active */
4304 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4305 
4306 	/* Context has requests still in-flight; it should not be idle! */
4307 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4308 
4309 	rq = active_request(ce->timeline, rq);
4310 	head = intel_ring_wrap(ce->ring, rq->head);
4311 	GEM_BUG_ON(head == ce->ring->tail);
4312 
4313 	/*
4314 	 * If this request hasn't started yet, e.g. it is waiting on a
4315 	 * semaphore, we need to avoid skipping the request or else we
4316 	 * break the signaling chain. However, if the context is corrupt
4317 	 * the request will not restart and we will be stuck with a wedged
4318 	 * device. It is quite often the case that if we issue a reset
4319 	 * while the GPU is loading the context image, the context
4320 	 * image becomes corrupt.
4321 	 *
4322 	 * Otherwise, if we have not started yet, the request should replay
4323 	 * perfectly and we do not need to flag the result as being erroneous.
4324 	 */
4325 	if (!i915_request_started(rq))
4326 		goto out_replay;
4327 
4328 	/*
4329 	 * If the request was innocent, we leave the request in the ELSP
4330 	 * and will try to replay it on restarting. The context image may
4331 	 * have been corrupted by the reset, in which case we may have
4332 	 * to service a new GPU hang, but more likely we can continue on
4333 	 * without impact.
4334 	 *
4335 	 * If the request was guilty, we presume the context is corrupt
4336 	 * and have to at least restore the RING register in the context
4337 	 * image back to the expected values to skip over the guilty request.
4338 	 */
4339 	__i915_request_reset(rq, stalled);
4340 
4341 	/*
4342 	 * We want a simple context + ring to execute the breadcrumb update.
4343 	 * We cannot rely on the context being intact across the GPU hang,
4344 	 * so clear it and rebuild just what we need for the breadcrumb.
4345 	 * All pending requests for this context will be zapped, and any
4346 	 * future request will be after userspace has had the opportunity
4347 	 * to recreate its own state.
4348 	 */
4349 out_replay:
4350 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4351 		     head, ce->ring->tail);
4352 	__execlists_reset_reg_state(ce, engine);
4353 	__execlists_update_reg_state(ce, engine, head);
4354 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4355 
4356 unwind:
4357 	/* Push back any incomplete requests for replay after the reset. */
4358 	cancel_port_requests(execlists);
4359 	__unwind_incomplete_requests(engine);
4360 }
4361 
4362 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4363 {
4364 	unsigned long flags;
4365 
4366 	ENGINE_TRACE(engine, "\n");
4367 
4368 	spin_lock_irqsave(&engine->active.lock, flags);
4369 
4370 	__execlists_reset(engine, stalled);
4371 
4372 	spin_unlock_irqrestore(&engine->active.lock, flags);
4373 }
4374 
4375 static void nop_submission_tasklet(unsigned long data)
4376 {
4377 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4378 
4379 	/* The driver is wedged; don't process any more events. */
4380 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4381 }
4382 
4383 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4384 {
4385 	struct intel_engine_execlists * const execlists = &engine->execlists;
4386 	struct i915_request *rq, *rn;
4387 	struct rb_node *rb;
4388 	unsigned long flags;
4389 
4390 	ENGINE_TRACE(engine, "\n");
4391 
4392 	/*
4393 	 * Before we call engine->cancel_requests(), we should have exclusive
4394 	 * access to the submission state. This is arranged for us by the
4395 	 * caller disabling the interrupt generation, the tasklet and other
4396 	 * threads that may then access the same state, giving us a free hand
4397 	 * to reset state. However, we still need to let lockdep be aware that
4398 	 * we know this state may be accessed in hardirq context, so we
4399 	 * disable the irq around this manipulation and we want to keep
4400 	 * the spinlock focused on its duties and not accidentally conflate
4401 	 * coverage to the submission's irq state. (Similarly, although we
4402 	 * shouldn't need to disable irq around the manipulation of the
4403 	 * submission's irq state, we also wish to remind ourselves that
4404 	 * it is irq state.)
4405 	 */
4406 	spin_lock_irqsave(&engine->active.lock, flags);
4407 
4408 	__execlists_reset(engine, true);
4409 
4410 	/* Mark all executing requests as skipped. */
4411 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4412 		mark_eio(rq);
4413 	intel_engine_signal_breadcrumbs(engine);
4414 
4415 	/* Flush the queued requests to the timeline list (for retiring). */
4416 	while ((rb = rb_first_cached(&execlists->queue))) {
4417 		struct i915_priolist *p = to_priolist(rb);
4418 		int i;
4419 
4420 		priolist_for_each_request_consume(rq, rn, p, i) {
4421 			mark_eio(rq);
4422 			__i915_request_submit(rq);
4423 		}
4424 
4425 		rb_erase_cached(&p->node, &execlists->queue);
4426 		i915_priolist_free(p);
4427 	}
4428 
4429 	/* On-hold requests will be flushed to timeline upon their release */
4430 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4431 		mark_eio(rq);
4432 
4433 	/* Cancel all attached virtual engines */
4434 	while ((rb = rb_first_cached(&execlists->virtual))) {
4435 		struct virtual_engine *ve =
4436 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4437 
4438 		rb_erase_cached(rb, &execlists->virtual);
4439 		RB_CLEAR_NODE(rb);
4440 
4441 		spin_lock(&ve->base.active.lock);
4442 		rq = fetch_and_zero(&ve->request);
4443 		if (rq) {
4444 			mark_eio(rq);
4445 
4446 			rq->engine = engine;
4447 			__i915_request_submit(rq);
4448 			i915_request_put(rq);
4449 
4450 			ve->base.execlists.queue_priority_hint = INT_MIN;
4451 		}
4452 		spin_unlock(&ve->base.active.lock);
4453 	}
4454 
4455 	/* Remaining _unready_ requests will be nop'ed when submitted */
4456 
4457 	execlists->queue_priority_hint = INT_MIN;
4458 	execlists->queue = RB_ROOT_CACHED;
4459 
4460 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4461 	execlists->tasklet.func = nop_submission_tasklet;
4462 
4463 	spin_unlock_irqrestore(&engine->active.lock, flags);
4464 }
4465 
4466 static void execlists_reset_finish(struct intel_engine_cs *engine)
4467 {
4468 	struct intel_engine_execlists * const execlists = &engine->execlists;
4469 
4470 	/*
4471 	 * After a GPU reset, we may have requests to replay. Do so now while
4472 	 * we still have the forcewake to be sure that the GPU is not allowed
4473 	 * to sleep before we restart and reload a context.
4474 	 */
4475 	GEM_BUG_ON(!reset_in_progress(execlists));
4476 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4477 		execlists->tasklet.func(execlists->tasklet.data);
4478 
4479 	if (__tasklet_enable(&execlists->tasklet))
4480 		/* And kick in case we missed a new request submission. */
4481 		tasklet_hi_schedule(&execlists->tasklet);
4482 	ENGINE_TRACE(engine, "depth->%d\n",
4483 		     atomic_read(&execlists->tasklet.count));
4484 }
4485 
4486 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4487 				    u64 offset, u32 len,
4488 				    const unsigned int flags)
4489 {
4490 	u32 *cs;
4491 
4492 	cs = intel_ring_begin(rq, 4);
4493 	if (IS_ERR(cs))
4494 		return PTR_ERR(cs);
4495 
4496 	/*
4497 	 * WaDisableCtxRestoreArbitration:bdw,chv
4498 	 *
4499 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4500 	 * particular on all the gens that do not need the w/a at all!), if we
4501 	 * took care to make sure that on every switch into this context
4502 	 * (both ordinary and for preemption) arbitration was enabled
4503 	 * we would be fine.  However, for gen8 there is another w/a that
4504 	 * requires us to not preempt inside GPGPU execution, so we keep
4505 	 * arbitration disabled for gen8 batches. Arbitration will be
4506 	 * re-enabled before we close the request
4507 	 * (engine->emit_fini_breadcrumb).
4508 	 */
4509 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4510 
4511 	/* FIXME(BDW+): Address space and security selectors. */
4512 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4513 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4514 	*cs++ = lower_32_bits(offset);
4515 	*cs++ = upper_32_bits(offset);
4516 
4517 	intel_ring_advance(rq, cs);
4518 
4519 	return 0;
4520 }
4521 
4522 static int gen8_emit_bb_start(struct i915_request *rq,
4523 			      u64 offset, u32 len,
4524 			      const unsigned int flags)
4525 {
4526 	u32 *cs;
4527 
4528 	cs = intel_ring_begin(rq, 6);
4529 	if (IS_ERR(cs))
4530 		return PTR_ERR(cs);
4531 
4532 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4533 
4534 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4535 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4536 	*cs++ = lower_32_bits(offset);
4537 	*cs++ = upper_32_bits(offset);
4538 
4539 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4540 	*cs++ = MI_NOOP;
4541 
4542 	intel_ring_advance(rq, cs);
4543 
4544 	return 0;
4545 }
4546 
4547 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4548 {
4549 	ENGINE_WRITE(engine, RING_IMR,
4550 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4551 	ENGINE_POSTING_READ(engine, RING_IMR);
4552 }
4553 
4554 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4555 {
4556 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4557 }
4558 
4559 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4560 {
4561 	u32 cmd, *cs;
4562 
4563 	cs = intel_ring_begin(request, 4);
4564 	if (IS_ERR(cs))
4565 		return PTR_ERR(cs);
4566 
4567 	cmd = MI_FLUSH_DW + 1;
4568 
4569 	/* We always require a command barrier so that subsequent
4570 	 * commands, such as breadcrumb interrupts, are strictly ordered
4571 	 * wrt the contents of the write cache being flushed to memory
4572 	 * (and thus being coherent from the CPU).
4573 	 */
4574 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4575 
4576 	if (mode & EMIT_INVALIDATE) {
4577 		cmd |= MI_INVALIDATE_TLB;
4578 		if (request->engine->class == VIDEO_DECODE_CLASS)
4579 			cmd |= MI_INVALIDATE_BSD;
4580 	}
4581 
4582 	*cs++ = cmd;
4583 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4584 	*cs++ = 0; /* upper addr */
4585 	*cs++ = 0; /* value */
4586 	intel_ring_advance(request, cs);
4587 
4588 	return 0;
4589 }
4590 
4591 static int gen8_emit_flush_render(struct i915_request *request,
4592 				  u32 mode)
4593 {
4594 	bool vf_flush_wa = false, dc_flush_wa = false;
4595 	u32 *cs, flags = 0;
4596 	int len;
4597 
4598 	flags |= PIPE_CONTROL_CS_STALL;
4599 
4600 	if (mode & EMIT_FLUSH) {
4601 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4602 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4603 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4604 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4605 	}
4606 
4607 	if (mode & EMIT_INVALIDATE) {
4608 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4609 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4610 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4611 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4612 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4613 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4614 		flags |= PIPE_CONTROL_QW_WRITE;
4615 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4616 
4617 		/*
4618 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4619 		 * pipe control.
4620 		 */
4621 		if (IS_GEN(request->engine->i915, 9))
4622 			vf_flush_wa = true;
4623 
4624 		/* WaForGAMHang:kbl */
4625 		if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4626 			dc_flush_wa = true;
4627 	}
4628 
4629 	len = 6;
4630 
4631 	if (vf_flush_wa)
4632 		len += 6;
4633 
4634 	if (dc_flush_wa)
4635 		len += 12;
4636 
4637 	cs = intel_ring_begin(request, len);
4638 	if (IS_ERR(cs))
4639 		return PTR_ERR(cs);
4640 
4641 	if (vf_flush_wa)
4642 		cs = gen8_emit_pipe_control(cs, 0, 0);
4643 
4644 	if (dc_flush_wa)
4645 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4646 					    0);
4647 
4648 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4649 
4650 	if (dc_flush_wa)
4651 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4652 
4653 	intel_ring_advance(request, cs);
4654 
4655 	return 0;
4656 }
4657 
4658 static int gen11_emit_flush_render(struct i915_request *request,
4659 				   u32 mode)
4660 {
4661 	if (mode & EMIT_FLUSH) {
4662 		u32 *cs;
4663 		u32 flags = 0;
4664 
4665 		flags |= PIPE_CONTROL_CS_STALL;
4666 
4667 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4668 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4669 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4670 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4671 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4672 		flags |= PIPE_CONTROL_QW_WRITE;
4673 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4674 
4675 		cs = intel_ring_begin(request, 6);
4676 		if (IS_ERR(cs))
4677 			return PTR_ERR(cs);
4678 
4679 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4680 		intel_ring_advance(request, cs);
4681 	}
4682 
4683 	if (mode & EMIT_INVALIDATE) {
4684 		u32 *cs;
4685 		u32 flags = 0;
4686 
4687 		flags |= PIPE_CONTROL_CS_STALL;
4688 
4689 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4690 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4691 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4692 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4693 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4694 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4695 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4696 		flags |= PIPE_CONTROL_QW_WRITE;
4697 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4698 
4699 		cs = intel_ring_begin(request, 6);
4700 		if (IS_ERR(cs))
4701 			return PTR_ERR(cs);
4702 
4703 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4704 		intel_ring_advance(request, cs);
4705 	}
4706 
4707 	return 0;
4708 }
4709 
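/*
 * Toggle the gen12+ command streamer pre-parser (which, as noted further
 * below, can pre-fetch across batch boundaries). Reading the encoding used
 * here, bit 8 acts as the mask for the pre-fetch disable value carried in
 * bit 0, in the usual masked-bit style. Callers bracket TLB invalidates
 * with a disable/enable pair; see gen12_emit_flush{,_render}().
 */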
4710 static u32 preparser_disable(bool state)
4711 {
4712 	return MI_ARB_CHECK | 1 << 8 | state;
4713 }
4714 
4715 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4716 {
4717 	static const i915_reg_t vd[] = {
4718 		GEN12_VD0_AUX_NV,
4719 		GEN12_VD1_AUX_NV,
4720 		GEN12_VD2_AUX_NV,
4721 		GEN12_VD3_AUX_NV,
4722 	};
4723 
4724 	static const i915_reg_t ve[] = {
4725 		GEN12_VE0_AUX_NV,
4726 		GEN12_VE1_AUX_NV,
4727 	};
4728 
4729 	if (engine->class == VIDEO_DECODE_CLASS)
4730 		return vd[engine->instance];
4731 
4732 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4733 		return ve[engine->instance];
4734 
4735 	GEM_BUG_ON("unknown aux_inv_reg\n");
4736 
4737 	return INVALID_MMIO_REG;
4738 }
4739 
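/*
 * Single-register LRI writing AUX_INV into the unit's AUX_NV register,
 * requesting an invalidation of its auxiliary (AUX) table alongside the
 * TLB invalidation (hsdes: 1809175790; see the call sites for context).
 */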
4740 static u32 *
4741 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4742 {
4743 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4744 	*cs++ = i915_mmio_reg_offset(inv_reg);
4745 	*cs++ = AUX_INV;
4746 	*cs++ = MI_NOOP;
4747 
4748 	return cs;
4749 }
4750 
4751 static int gen12_emit_flush_render(struct i915_request *request,
4752 				   u32 mode)
4753 {
4754 	if (mode & EMIT_FLUSH) {
4755 		u32 flags = 0;
4756 		u32 *cs;
4757 
4758 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4759 		flags |= PIPE_CONTROL_FLUSH_L3;
4760 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4761 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4762 		/* Wa_1409600907:tgl */
4763 		flags |= PIPE_CONTROL_DEPTH_STALL;
4764 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4765 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4766 
4767 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4768 		flags |= PIPE_CONTROL_QW_WRITE;
4769 
4770 		flags |= PIPE_CONTROL_CS_STALL;
4771 
4772 		cs = intel_ring_begin(request, 6);
4773 		if (IS_ERR(cs))
4774 			return PTR_ERR(cs);
4775 
4776 		cs = gen12_emit_pipe_control(cs,
4777 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4778 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4779 		intel_ring_advance(request, cs);
4780 	}
4781 
4782 	if (mode & EMIT_INVALIDATE) {
4783 		u32 flags = 0;
4784 		u32 *cs;
4785 
4786 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4787 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4788 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4789 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4790 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4791 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4792 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4793 
4794 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4795 		flags |= PIPE_CONTROL_QW_WRITE;
4796 
4797 		flags |= PIPE_CONTROL_CS_STALL;
4798 
4799 		cs = intel_ring_begin(request, 8 + 4);
4800 		if (IS_ERR(cs))
4801 			return PTR_ERR(cs);
4802 
4803 		/*
4804 		 * Prevent the pre-parser from skipping past the TLB
4805 		 * invalidate and loading a stale page for the batch
4806 		 * buffer / request payload.
4807 		 */
4808 		*cs++ = preparser_disable(true);
4809 
4810 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4811 
4812 		/* hsdes: 1809175790 */
4813 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4814 
4815 		*cs++ = preparser_disable(false);
4816 		intel_ring_advance(request, cs);
4817 	}
4818 
4819 	return 0;
4820 }
4821 
4822 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4823 {
4824 	intel_engine_mask_t aux_inv = 0;
4825 	u32 cmd, *cs;
4826 
4827 	cmd = 4;
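	/*
	 * 'cmd' is reused as the ring dword count before becoming the
	 * MI_FLUSH_DW opcode: 4 for the flush itself, +2 for the preparser
	 * disable/enable pair, and 2 per engine plus an LRI header and a
	 * trailing NOOP for the aux-table invalidates.
	 */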
4828 	if (mode & EMIT_INVALIDATE)
4829 		cmd += 2;
4830 	if (mode & EMIT_INVALIDATE)
4831 		aux_inv = request->engine->mask & ~BIT(BCS0);
4832 	if (aux_inv)
4833 		cmd += 2 * hweight8(aux_inv) + 2;
4834 
4835 	cs = intel_ring_begin(request, cmd);
4836 	if (IS_ERR(cs))
4837 		return PTR_ERR(cs);
4838 
4839 	if (mode & EMIT_INVALIDATE)
4840 		*cs++ = preparser_disable(true);
4841 
4842 	cmd = MI_FLUSH_DW + 1;
4843 
4844 	/* We always require a command barrier so that subsequent
4845 	 * commands, such as breadcrumb interrupts, are strictly ordered
4846 	 * wrt the contents of the write cache being flushed to memory
4847 	 * (and thus being coherent from the CPU).
4848 	 */
4849 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4850 
4851 	if (mode & EMIT_INVALIDATE) {
4852 		cmd |= MI_INVALIDATE_TLB;
4853 		if (request->engine->class == VIDEO_DECODE_CLASS)
4854 			cmd |= MI_INVALIDATE_BSD;
4855 	}
4856 
4857 	*cs++ = cmd;
4858 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4859 	*cs++ = 0; /* upper addr */
4860 	*cs++ = 0; /* value */
4861 
4862 	if (aux_inv) { /* hsdes: 1809175790 */
4863 		struct intel_engine_cs *engine;
4864 		unsigned int tmp;
4865 
4866 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4867 		for_each_engine_masked(engine, request->engine->gt,
4868 				       aux_inv, tmp) {
4869 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4870 			*cs++ = AUX_INV;
4871 		}
4872 		*cs++ = MI_NOOP;
4873 	}
4874 
4875 	if (mode & EMIT_INVALIDATE)
4876 		*cs++ = preparser_disable(false);
4877 
4878 	intel_ring_advance(request, cs);
4879 
4880 	return 0;
4881 }
4882 
4883 static void assert_request_valid(struct i915_request *rq)
4884 {
4885 	struct intel_ring *ring __maybe_unused = rq->ring;
4886 
4887 	/* Can we unwind this request without appearing to go forwards? */
4888 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4889 }
4890 
4891 /*
4892  * Reserve space for 2 NOOPs at the end of each request to be
4893  * used as a workaround for not being allowed to do lite
4894  * restore with HEAD==TAIL (WaIdleLiteRestore).
4895  */
4896 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4897 {
4898 	/* Ensure there's always at least one preemption point per-request. */
4899 	*cs++ = MI_ARB_CHECK;
4900 	*cs++ = MI_NOOP;
4901 	request->wa_tail = intel_ring_offset(request, cs);
4902 
4903 	/* Check that entire request is less than half the ring */
4904 	assert_request_valid(request);
4905 
4906 	return cs;
4907 }
4908 
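/*
 * Busy-wait on the per-engine preemption semaphore in the HWSP (armed via
 * ring_set_paused()) until it reads back as zero, holding the ring at the
 * end of the request while the driver processes a pending preemption.
 */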
4909 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4910 {
4911 	*cs++ = MI_SEMAPHORE_WAIT |
4912 		MI_SEMAPHORE_GLOBAL_GTT |
4913 		MI_SEMAPHORE_POLL |
4914 		MI_SEMAPHORE_SAD_EQ_SDD;
4915 	*cs++ = 0;
4916 	*cs++ = intel_hws_preempt_address(request->engine);
4917 	*cs++ = 0;
4918 
4919 	return cs;
4920 }
4921 
4922 static __always_inline u32*
4923 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4924 {
4925 	*cs++ = MI_USER_INTERRUPT;
4926 
4927 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4928 	if (intel_engine_has_semaphores(request->engine))
4929 		cs = emit_preempt_busywait(request, cs);
4930 
4931 	request->tail = intel_ring_offset(request, cs);
4932 	assert_ring_tail_valid(request->ring, request->tail);
4933 
4934 	return gen8_emit_wa_tail(request, cs);
4935 }
4936 
4937 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4938 {
4939 	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4940 
4941 	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4942 }
4943 
4944 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4945 {
4946 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4947 }
4948 
4949 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4950 {
4951 	cs = gen8_emit_pipe_control(cs,
4952 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4953 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4954 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4955 				    0);
4956 
4957 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4958 	cs = gen8_emit_ggtt_write_rcs(cs,
4959 				      request->fence.seqno,
4960 				      i915_request_active_timeline(request)->hwsp_offset,
4961 				      PIPE_CONTROL_FLUSH_ENABLE |
4962 				      PIPE_CONTROL_CS_STALL);
4963 
4964 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4965 }
4966 
4967 static u32 *
4968 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4969 {
4970 	cs = gen8_emit_ggtt_write_rcs(cs,
4971 				      request->fence.seqno,
4972 				      i915_request_active_timeline(request)->hwsp_offset,
4973 				      PIPE_CONTROL_CS_STALL |
4974 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4975 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4976 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4977 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4978 				      PIPE_CONTROL_FLUSH_ENABLE);
4979 
4980 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4981 }
4982 
4983 /*
4984  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4985  * flush and will continue pre-fetching the instructions after it before the
4986  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4987  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4988  * of the next request before the memory has been flushed, we're guaranteed that
4989  * we won't access the batch itself too early.
4990  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4991  * so, if the current request is modifying an instruction in the next request on
4992  * the same intel_context, we might pre-fetch and then execute the pre-update
4993  * instruction. To avoid this, the users of self-modifying code should either
4994  * disable the parser around the code emitting the memory writes, via a new flag
4995  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4996  * the in-kernel use-cases we've opted to use a separate context, see
4997  * reloc_gpu() as an example.
4998  * All the above applies only to the instructions themselves. Non-inline data
4999  * used by the instructions is not pre-fetched.
5000  */
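/*
 * Illustration only (not an actual code path in this file): the bracketing
 * described above, using the preparser_disable() helper, would look like
 * ('len' is a placeholder for the ring space actually required):
 *
 *	cs = intel_ring_begin(rq, len);
 *	*cs++ = preparser_disable(true);
 *	... emit the writes that patch instructions in the following batch ...
 *	*cs++ = preparser_disable(false);
 *	intel_ring_advance(rq, cs);
 *
 * gen12_emit_flush_render() above applies the same bracketing around its
 * TLB invalidation so the pre-parser cannot load stale pages.
 */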
5001 
5002 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
5003 {
5004 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
5005 		MI_SEMAPHORE_GLOBAL_GTT |
5006 		MI_SEMAPHORE_POLL |
5007 		MI_SEMAPHORE_SAD_EQ_SDD;
5008 	*cs++ = 0;
5009 	*cs++ = intel_hws_preempt_address(request->engine);
5010 	*cs++ = 0;
5011 	*cs++ = 0;
5012 	*cs++ = MI_NOOP;
5013 
5014 	return cs;
5015 }
5016 
5017 static __always_inline u32*
5018 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
5019 {
5020 	*cs++ = MI_USER_INTERRUPT;
5021 
5022 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
5023 	if (intel_engine_has_semaphores(request->engine))
5024 		cs = gen12_emit_preempt_busywait(request, cs);
5025 
5026 	request->tail = intel_ring_offset(request, cs);
5027 	assert_ring_tail_valid(request->ring, request->tail);
5028 
5029 	return gen8_emit_wa_tail(request, cs);
5030 }
5031 
5032 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5033 {
5034 	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
5035 }
5036 
5037 static u32 *
5038 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5039 {
5040 	cs = gen12_emit_ggtt_write_rcs(cs,
5041 				       request->fence.seqno,
5042 				       i915_request_active_timeline(request)->hwsp_offset,
5043 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5044 				       PIPE_CONTROL_CS_STALL |
5045 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
5046 				       PIPE_CONTROL_FLUSH_L3 |
5047 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5048 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5049 				       /* Wa_1409600907:tgl */
5050 				       PIPE_CONTROL_DEPTH_STALL |
5051 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
5052 				       PIPE_CONTROL_FLUSH_ENABLE);
5053 
5054 	return gen12_emit_fini_breadcrumb_tail(request, cs);
5055 }
5056 
5057 static void execlists_park(struct intel_engine_cs *engine)
5058 {
5059 	cancel_timer(&engine->execlists.timer);
5060 	cancel_timer(&engine->execlists.preempt);
5061 }
5062 
5063 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5064 {
5065 	engine->submit_request = execlists_submit_request;
5066 	engine->schedule = i915_schedule;
5067 	engine->execlists.tasklet.func = execlists_submission_tasklet;
5068 
5069 	engine->reset.prepare = execlists_reset_prepare;
5070 	engine->reset.rewind = execlists_reset_rewind;
5071 	engine->reset.cancel = execlists_reset_cancel;
5072 	engine->reset.finish = execlists_reset_finish;
5073 
5074 	engine->park = execlists_park;
5075 	engine->unpark = NULL;
5076 
5077 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5078 	if (!intel_vgpu_active(engine->i915)) {
5079 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5080 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5081 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5082 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5083 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5084 		}
5085 	}
5086 
5087 	if (INTEL_GEN(engine->i915) >= 12)
5088 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5089 
5090 	if (intel_engine_has_preemption(engine))
5091 		engine->emit_bb_start = gen8_emit_bb_start;
5092 	else
5093 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5094 }
5095 
5096 static void execlists_shutdown(struct intel_engine_cs *engine)
5097 {
5098 	/* Synchronise with residual timers and any softirq they raise */
5099 	del_timer_sync(&engine->execlists.timer);
5100 	del_timer_sync(&engine->execlists.preempt);
5101 	tasklet_kill(&engine->execlists.tasklet);
5102 }
5103 
5104 static void execlists_release(struct intel_engine_cs *engine)
5105 {
5106 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5107 
5108 	execlists_shutdown(engine);
5109 
5110 	intel_engine_cleanup_common(engine);
5111 	lrc_destroy_wa_ctx(engine);
5112 }
5113 
5114 static void
5115 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5116 {
5117 	/* Default vfuncs which can be overridden by each engine. */
5118 
5119 	engine->resume = execlists_resume;
5120 
5121 	engine->cops = &execlists_context_ops;
5122 	engine->request_alloc = execlists_request_alloc;
5123 
5124 	engine->emit_flush = gen8_emit_flush;
5125 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5126 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5127 	if (INTEL_GEN(engine->i915) >= 12) {
5128 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5129 		engine->emit_flush = gen12_emit_flush;
5130 	}
5131 	engine->set_default_submission = intel_execlists_set_default_submission;
5132 
5133 	if (INTEL_GEN(engine->i915) < 11) {
5134 		engine->irq_enable = gen8_logical_ring_enable_irq;
5135 		engine->irq_disable = gen8_logical_ring_disable_irq;
5136 	} else {
5137 		/*
5138 		 * TODO: On Gen11 interrupt masks need to be clear
5139 		 * to allow C6 entry. Keep interrupts enabled
5140 		 * and take the hit of generating extra interrupts
5141 		 * until a more refined solution exists.
5142 		 */
5143 	}
5144 }
5145 
5146 static inline void
5147 logical_ring_default_irqs(struct intel_engine_cs *engine)
5148 {
5149 	unsigned int shift = 0;
5150 
5151 	if (INTEL_GEN(engine->i915) < 11) {
5152 		const u8 irq_shifts[] = {
5153 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5154 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5155 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5156 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5157 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5158 		};
5159 
5160 		shift = irq_shifts[engine->id];
5161 	}
5162 
5163 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5164 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5165 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5166 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5167 }
5168 
5169 static void rcs_submission_override(struct intel_engine_cs *engine)
5170 {
5171 	switch (INTEL_GEN(engine->i915)) {
5172 	case 12:
5173 		engine->emit_flush = gen12_emit_flush_render;
5174 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5175 		break;
5176 	case 11:
5177 		engine->emit_flush = gen11_emit_flush_render;
5178 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5179 		break;
5180 	default:
5181 		engine->emit_flush = gen8_emit_flush_render;
5182 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5183 		break;
5184 	}
5185 }
5186 
5187 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5188 {
5189 	struct intel_engine_execlists * const execlists = &engine->execlists;
5190 	struct drm_i915_private *i915 = engine->i915;
5191 	struct intel_uncore *uncore = engine->uncore;
5192 	u32 base = engine->mmio_base;
5193 
5194 	tasklet_init(&engine->execlists.tasklet,
5195 		     execlists_submission_tasklet, (unsigned long)engine);
5196 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5197 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5198 
5199 	logical_ring_default_vfuncs(engine);
5200 	logical_ring_default_irqs(engine);
5201 
5202 	if (engine->class == RENDER_CLASS)
5203 		rcs_submission_override(engine);
5204 
5205 	if (intel_init_workaround_bb(engine))
5206 		/*
5207 		 * We continue even if we fail to initialize the WA batch
5208 		 * because we only expect rare glitches and nothing
5209 		 * critical enough to prevent us from using the GPU.
5210 		 */
5211 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5212 
5213 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5214 		execlists->submit_reg = uncore->regs +
5215 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5216 		execlists->ctrl_reg = uncore->regs +
5217 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5218 	} else {
5219 		execlists->submit_reg = uncore->regs +
5220 			i915_mmio_reg_offset(RING_ELSP(base));
5221 	}
5222 
5223 	execlists->csb_status =
5224 		(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5225 
5226 	execlists->csb_write =
5227 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5228 
5229 	if (INTEL_GEN(i915) < 11)
5230 		execlists->csb_size = GEN8_CSB_ENTRIES;
5231 	else
5232 		execlists->csb_size = GEN11_CSB_ENTRIES;
5233 
5234 	if (INTEL_GEN(engine->i915) >= 11) {
5235 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5236 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5237 	}
5238 
5239 	/* Finally, take ownership and responsibility for cleanup! */
5240 	engine->sanitize = execlists_sanitize;
5241 	engine->release = execlists_release;
5242 
5243 	return 0;
5244 }
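
/*
 * Usage sketch (assumed caller, not part of this file): an engine is
 * switched to execlists submission during engine init with something like
 *
 *	err = intel_execlists_submission_setup(engine);
 *	if (err)
 *		return err;
 *
 * after which engine->sanitize/engine->release point at the execlists
 * hooks above and the CSB read/write pointers are ready for the
 * submission tasklet.
 */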
5245 
5246 static void init_common_reg_state(u32 * const regs,
5247 				  const struct intel_engine_cs *engine,
5248 				  const struct intel_ring *ring,
5249 				  bool inhibit)
5250 {
5251 	u32 ctl;
5252 
5253 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5254 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5255 	if (inhibit)
5256 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5257 	if (INTEL_GEN(engine->i915) < 11)
5258 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5259 					   CTX_CTRL_RS_CTX_ENABLE);
5260 	regs[CTX_CONTEXT_CONTROL] = ctl;
5261 
5262 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5263 	regs[CTX_TIMESTAMP] = 0;
5264 }
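
/*
 * Background note (general i915 convention, not specific to this
 * function): RING_CONTEXT_CONTROL is a masked register, so every write
 * carries a write-enable mask in the upper 16 bits, e.g.
 *
 *	_MASKED_BIT_ENABLE(bit)  == ((bit) << 16) | (bit)
 *	_MASKED_BIT_DISABLE(bit) == ((bit) << 16)
 *
 * which is why the enable/disable values above can simply be OR'ed
 * together into a single CTX_CONTEXT_CONTROL dword.
 */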
5265 
5266 static void init_wa_bb_reg_state(u32 * const regs,
5267 				 const struct intel_engine_cs *engine)
5268 {
5269 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5270 
5271 	if (wa_ctx->per_ctx.size) {
5272 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5273 
5274 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5275 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5276 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5277 	}
5278 
5279 	if (wa_ctx->indirect_ctx.size) {
5280 		lrc_ring_setup_indirect_ctx(regs, engine,
5281 					    i915_ggtt_offset(wa_ctx->vma) +
5282 					    wa_ctx->indirect_ctx.offset,
5283 					    wa_ctx->indirect_ctx.size);
5284 	}
5285 }
5286 
5287 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5288 {
5289 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5290 		/* 64b PPGTT (48bit canonical):
5291 		 * PDP0_DESCRIPTOR contains the base address of the PML4;
5292 		 * the other PDP descriptors are ignored.
5293 		 */
5294 		ASSIGN_CTX_PML4(ppgtt, regs);
5295 	} else {
5296 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5297 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5298 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5299 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5300 	}
5301 }
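
/*
 * Sketch of what the two branches emit (ASSIGN_CTX_PML4/ASSIGN_CTX_PDP
 * are defined alongside this code): with a 4-level PPGTT only the PDP0
 * slot is populated, the PML4 address being split across the
 * CTX_PDP0_UDW/CTX_PDP0_LDW dwords; with a 3-level PPGTT all four
 * PDP0..PDP3 upper/lower dword pairs are written with their respective
 * page-directory addresses.
 */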
5302 
5303 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5304 {
5305 	if (i915_is_ggtt(vm))
5306 		return i915_vm_to_ggtt(vm)->alias;
5307 	else
5308 		return i915_vm_to_ppgtt(vm);
5309 }
5310 
5311 static void execlists_init_reg_state(u32 *regs,
5312 				     const struct intel_context *ce,
5313 				     const struct intel_engine_cs *engine,
5314 				     const struct intel_ring *ring,
5315 				     bool inhibit)
5316 {
5317 	/*
5318 	 * A context is actually a big batch buffer with several
5319 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5320 	 * values we are setting here are only for the first context restore:
5321 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5322 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5323 	 * we are not initializing here).
5324 	 *
5325 	 * Must keep consistent with virtual_update_register_offsets().
5326 	 */
5327 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5328 
5329 	init_common_reg_state(regs, engine, ring, inhibit);
5330 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5331 
5332 	init_wa_bb_reg_state(regs, engine);
5333 
5334 	__reset_stop_ring(regs, engine);
5335 }
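
/*
 * Illustrative shape of the resulting state (sketch): set_offsets()
 * lays down the MI_LOAD_REGISTER_IMM stream and its (reg, value) slots,
 * then the helpers above patch in the live values, roughly
 *
 *	MI_LOAD_REGISTER_IMM(N)
 *	    RING_CONTEXT_CONTROL <- inhibit/save-restore bits
 *	    RING_CTL             <- RING_CTL_SIZE(ring->size) | RING_VALID
 *	    PDP/PML4 descriptors <- from the context's PPGTT
 *	    wa_bb pointers       <- per-ctx/indirect batch offsets
 *
 * Everything else is left for the GPU to fill in on its first context
 * save.
 */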
5336 
5337 static int
5338 populate_lr_context(struct intel_context *ce,
5339 		    struct drm_i915_gem_object *ctx_obj,
5340 		    struct intel_engine_cs *engine,
5341 		    struct intel_ring *ring)
5342 {
5343 	bool inhibit = true;
5344 	void *vaddr;
5345 
5346 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5347 	if (IS_ERR(vaddr)) {
5348 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5349 		return PTR_ERR(vaddr);
5350 	}
5351 
5352 	set_redzone(vaddr, engine);
5353 
5354 	if (engine->default_state) {
5355 		shmem_read(engine->default_state, 0,
5356 			   vaddr, engine->context_size);
5357 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5358 		inhibit = false;
5359 	}
5360 
5361 	/* Clear the ppHWSP (inc. per-context counters) */
5362 	memset(vaddr, 0, PAGE_SIZE);
5363 
5364 	/*
5365 	 * The second page of the context object contains some registers which
5366 	 * must be set up prior to the first execution.
5367 	 */
5368 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5369 				 ce, engine, ring, inhibit);
5370 
5371 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5372 	i915_gem_object_unpin_map(ctx_obj);
5373 	return 0;
5374 }
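
/*
 * Resulting layout of the context object (sketch, offsets per the
 * constants used above): page 0 is the per-process HWSP and is zeroed
 * here (including the per-context counters), while the register state
 * written by execlists_init_reg_state() starts at LRC_STATE_OFFSET;
 * any default_state image copied in beforehand provides the remainder
 * of the context image and lets us drop the restore-inhibit.
 */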
5375 
5376 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5377 {
5378 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5379 
5380 	return intel_timeline_create_from_engine(ce->engine,
5381 						 page_unmask_bits(tl));
5382 }
5383 
5384 static int __execlists_context_alloc(struct intel_context *ce,
5385 				     struct intel_engine_cs *engine)
5386 {
5387 	struct drm_i915_gem_object *ctx_obj;
5388 	struct intel_ring *ring;
5389 	struct i915_vma *vma;
5390 	u32 context_size;
5391 	int ret;
5392 
5393 	GEM_BUG_ON(ce->state);
5394 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5395 
5396 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5397 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5398 
5399 	if (INTEL_GEN(engine->i915) == 12) {
5400 		ce->wa_bb_page = context_size / PAGE_SIZE;
5401 		context_size += PAGE_SIZE;
5402 	}
5403 
5404 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5405 	if (IS_ERR(ctx_obj))
5406 		return PTR_ERR(ctx_obj);
5407 
5408 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5409 	if (IS_ERR(vma)) {
5410 		ret = PTR_ERR(vma);
5411 		goto error_deref_obj;
5412 	}
5413 
5414 	if (!page_mask_bits(ce->timeline)) {
5415 		struct intel_timeline *tl;
5416 
5417 		/*
5418 		 * Use the static global HWSP for the kernel context, and
5419 		 * a dynamically allocated cacheline for everyone else.
5420 		 */
5421 		if (unlikely(ce->timeline))
5422 			tl = pinned_timeline(ce);
5423 		else
5424 			tl = intel_timeline_create(engine->gt);
5425 		if (IS_ERR(tl)) {
5426 			ret = PTR_ERR(tl);
5427 			goto error_deref_obj;
5428 		}
5429 
5430 		ce->timeline = tl;
5431 	}
5432 
5433 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5434 	if (IS_ERR(ring)) {
5435 		ret = PTR_ERR(ring);
5436 		goto error_deref_obj;
5437 	}
5438 
5439 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5440 	if (ret) {
5441 		drm_dbg(&engine->i915->drm,
5442 			"Failed to populate LRC: %d\n", ret);
5443 		goto error_ring_free;
5444 	}
5445 
5446 	ce->ring = ring;
5447 	ce->state = vma;
5448 
5449 	return 0;
5450 
5451 error_ring_free:
5452 	intel_ring_put(ring);
5453 error_deref_obj:
5454 	i915_gem_object_put(ctx_obj);
5455 	return ret;
5456 }
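
/*
 * Sizing sketch for the backing object (mirrors the code above):
 *
 *	context_size  = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
 *	context_size += I915_GTT_PAGE_SIZE;	// debug builds only: redzone
 *	context_size += PAGE_SIZE;		// Gen12 only: wa_bb page
 *
 * i.e. the optional pages are appended after the HW context image, and
 * ce->wa_bb_page records the page index of the Gen12 workaround batch.
 */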
5457 
5458 static struct list_head *virtual_queue(struct virtual_engine *ve)
5459 {
5460 	return &ve->base.execlists.default_priolist.requests[0];
5461 }
5462 
5463 static void virtual_context_destroy(struct kref *kref)
5464 {
5465 	struct virtual_engine *ve =
5466 		container_of(kref, typeof(*ve), context.ref);
5467 	unsigned int n;
5468 
5469 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5470 	GEM_BUG_ON(ve->request);
5471 	GEM_BUG_ON(ve->context.inflight);
5472 
5473 	for (n = 0; n < ve->num_siblings; n++) {
5474 		struct intel_engine_cs *sibling = ve->siblings[n];
5475 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5476 		unsigned long flags;
5477 
5478 		if (RB_EMPTY_NODE(node))
5479 			continue;
5480 
5481 		spin_lock_irqsave(&sibling->active.lock, flags);
5482 
5483 		/* Detachment is lazily performed in the execlists tasklet */
5484 		if (!RB_EMPTY_NODE(node))
5485 			rb_erase_cached(node, &sibling->execlists.virtual);
5486 
5487 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5488 	}
5489 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5490 
5491 	if (ve->context.state)
5492 		__execlists_context_fini(&ve->context);
5493 	intel_context_fini(&ve->context);
5494 
5495 	intel_engine_free_request_pool(&ve->base);
5496 
5497 	kfree(ve->bonds);
5498 	kfree(ve);
5499 }
5500 
5501 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5502 {
5503 	int swp;
5504 
5505 	/*
5506 	 * Pick a random sibling on starting to help spread the load around.
5507 	 *
5508 	 * New contexts are typically created with exactly the same order
5509 	 * of siblings, and often started in batches. Due to the way we iterate
5510 	 * the array of siblings when submitting requests, sibling[0] is
5511 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5512 	 * randomised across the system, we also help spread the load by the
5513 	 * first engine we inspect being different each time.
5514 	 *
5515 	 * NB This does not force us to execute on this engine, it will just
5516 	 * typically be the first we inspect for submission.
5517 	 */
5518 	swp = prandom_u32_max(ve->num_siblings);
5519 	if (swp)
5520 		swap(ve->siblings[swp], ve->siblings[0]);
5521 }
5522 
5523 static int virtual_context_alloc(struct intel_context *ce)
5524 {
5525 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5526 
5527 	return __execlists_context_alloc(ce, ve->siblings[0]);
5528 }
5529 
5530 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5531 {
5532 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5533 
5534 	/* Note: we must use a real engine class for setting up reg state */
5535 	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5536 }
5537 
5538 static void virtual_context_enter(struct intel_context *ce)
5539 {
5540 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5541 	unsigned int n;
5542 
5543 	for (n = 0; n < ve->num_siblings; n++)
5544 		intel_engine_pm_get(ve->siblings[n]);
5545 
5546 	intel_timeline_enter(ce->timeline);
5547 }
5548 
5549 static void virtual_context_exit(struct intel_context *ce)
5550 {
5551 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5552 	unsigned int n;
5553 
5554 	intel_timeline_exit(ce->timeline);
5555 
5556 	for (n = 0; n < ve->num_siblings; n++)
5557 		intel_engine_pm_put(ve->siblings[n]);
5558 }
5559 
5560 static const struct intel_context_ops virtual_context_ops = {
5561 	.alloc = virtual_context_alloc,
5562 
5563 	.pre_pin = execlists_context_pre_pin,
5564 	.pin = virtual_context_pin,
5565 	.unpin = execlists_context_unpin,
5566 	.post_unpin = execlists_context_post_unpin,
5567 
5568 	.enter = virtual_context_enter,
5569 	.exit = virtual_context_exit,
5570 
5571 	.destroy = virtual_context_destroy,
5572 };
5573 
5574 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5575 {
5576 	struct i915_request *rq;
5577 	intel_engine_mask_t mask;
5578 
5579 	rq = READ_ONCE(ve->request);
5580 	if (!rq)
5581 		return 0;
5582 
5583 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5584 	mask = rq->execution_mask;
5585 	if (unlikely(!mask)) {
5586 		/* Invalid selection, submit to a random engine in error */
5587 		i915_request_set_error_once(rq, -ENODEV);
5588 		mask = ve->siblings[0]->mask;
5589 	}
5590 
5591 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5592 		     rq->fence.context, rq->fence.seqno,
5593 		     mask, ve->base.execlists.queue_priority_hint);
5594 
5595 	return mask;
5596 }
5597 
5598 static void virtual_submission_tasklet(unsigned long data)
5599 {
5600 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5601 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5602 	intel_engine_mask_t mask;
5603 	unsigned int n;
5604 
5605 	rcu_read_lock();
5606 	mask = virtual_submission_mask(ve);
5607 	rcu_read_unlock();
5608 	if (unlikely(!mask))
5609 		return;
5610 
5611 	local_irq_disable();
5612 	for (n = 0; n < ve->num_siblings; n++) {
5613 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5614 		struct ve_node * const node = &ve->nodes[sibling->id];
5615 		struct rb_node **parent, *rb;
5616 		bool first;
5617 
5618 		if (!READ_ONCE(ve->request))
5619 			break; /* already handled by a sibling's tasklet */
5620 
5621 		if (unlikely(!(mask & sibling->mask))) {
5622 			if (!RB_EMPTY_NODE(&node->rb)) {
5623 				spin_lock(&sibling->active.lock);
5624 				rb_erase_cached(&node->rb,
5625 						&sibling->execlists.virtual);
5626 				RB_CLEAR_NODE(&node->rb);
5627 				spin_unlock(&sibling->active.lock);
5628 			}
5629 			continue;
5630 		}
5631 
5632 		spin_lock(&sibling->active.lock);
5633 
5634 		if (!RB_EMPTY_NODE(&node->rb)) {
5635 			/*
5636 			 * Cheat and avoid rebalancing the tree if we can
5637 			 * reuse this node in situ.
5638 			 */
5639 			first = rb_first_cached(&sibling->execlists.virtual) ==
5640 				&node->rb;
5641 			if (prio == node->prio || (prio > node->prio && first))
5642 				goto submit_engine;
5643 
5644 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5645 		}
5646 
5647 		rb = NULL;
5648 		first = true;
5649 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5650 		while (*parent) {
5651 			struct ve_node *other;
5652 
5653 			rb = *parent;
5654 			other = rb_entry(rb, typeof(*other), rb);
5655 			if (prio > other->prio) {
5656 				parent = &rb->rb_left;
5657 			} else {
5658 				parent = &rb->rb_right;
5659 				first = false;
5660 			}
5661 		}
5662 
5663 		rb_link_node(&node->rb, rb, parent);
5664 		rb_insert_color_cached(&node->rb,
5665 				       &sibling->execlists.virtual,
5666 				       first);
5667 
5668 submit_engine:
5669 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5670 		node->prio = prio;
5671 		if (first && prio > sibling->execlists.queue_priority_hint)
5672 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5673 
5674 		spin_unlock(&sibling->active.lock);
5675 	}
5676 	local_irq_enable();
5677 }
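
/*
 * Note on the data structure used above (restating the code, no new
 * behaviour): each physical sibling keeps a cached rbtree of ve_nodes
 * ordered by priority (leftmost == highest). The tasklet either reuses
 * a node in place when its priority still sorts correctly, or erases
 * and re-inserts it, and only kicks the sibling's execlists tasklet
 * when the virtual request would beat that engine's
 * queue_priority_hint.
 */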
5678 
5679 static void virtual_submit_request(struct i915_request *rq)
5680 {
5681 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5682 	struct i915_request *old;
5683 	unsigned long flags;
5684 
5685 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5686 		     rq->fence.context,
5687 		     rq->fence.seqno);
5688 
5689 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5690 
5691 	spin_lock_irqsave(&ve->base.active.lock, flags);
5692 
5693 	old = ve->request;
5694 	if (old) { /* background completion event from preempt-to-busy */
5695 		GEM_BUG_ON(!i915_request_completed(old));
5696 		__i915_request_submit(old);
5697 		i915_request_put(old);
5698 	}
5699 
5700 	if (i915_request_completed(rq)) {
5701 		__i915_request_submit(rq);
5702 
5703 		ve->base.execlists.queue_priority_hint = INT_MIN;
5704 		ve->request = NULL;
5705 	} else {
5706 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5707 		ve->request = i915_request_get(rq);
5708 
5709 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5710 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5711 
5712 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5713 	}
5714 
5715 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5716 }
5717 
5718 static struct ve_bond *
5719 virtual_find_bond(struct virtual_engine *ve,
5720 		  const struct intel_engine_cs *master)
5721 {
5722 	int i;
5723 
5724 	for (i = 0; i < ve->num_bonds; i++) {
5725 		if (ve->bonds[i].master == master)
5726 			return &ve->bonds[i];
5727 	}
5728 
5729 	return NULL;
5730 }
5731 
5732 static void
5733 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5734 {
5735 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5736 	intel_engine_mask_t allowed, exec;
5737 	struct ve_bond *bond;
5738 
5739 	allowed = ~to_request(signal)->engine->mask;
5740 
5741 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5742 	if (bond)
5743 		allowed &= bond->sibling_mask;
5744 
5745 	/* Restrict the bonded request to run on only the available engines */
5746 	exec = READ_ONCE(rq->execution_mask);
5747 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5748 		;
5749 
5750 	/* Prevent the master from being re-run on the bonded engines */
5751 	to_request(signal)->execution_mask &= ~allowed;
5752 }
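
/*
 * Worked example (sketch, engine masks purely illustrative): if the
 * master signal ran on an engine with mask 0x1 and the bond restricts
 * the bonded request to siblings with mask 0x6, then
 *
 *	allowed = ~0x1 & 0x6 = 0x6;
 *	rq->execution_mask &= 0x6;		// bonded request
 *	master->execution_mask &= ~0x6;		// keep master off those engines
 *
 * The try_cmpxchg() loop above simply makes the first update atomic
 * against concurrent modification of rq->execution_mask.
 */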
5753 
5754 struct intel_context *
5755 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5756 			       unsigned int count)
5757 {
5758 	struct virtual_engine *ve;
5759 	unsigned int n;
5760 	int err;
5761 
5762 	if (count == 0)
5763 		return ERR_PTR(-EINVAL);
5764 
5765 	if (count == 1)
5766 		return intel_context_create(siblings[0]);
5767 
5768 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5769 	if (!ve)
5770 		return ERR_PTR(-ENOMEM);
5771 
5772 	ve->base.i915 = siblings[0]->i915;
5773 	ve->base.gt = siblings[0]->gt;
5774 	ve->base.uncore = siblings[0]->uncore;
5775 	ve->base.id = -1;
5776 
5777 	ve->base.class = OTHER_CLASS;
5778 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5779 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5780 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5781 
5782 	/*
5783 	 * The decision on whether to submit a request using semaphores
5784 	 * depends on the saturated state of the engine. We only compute
5785 	 * this during HW submission of the request, and we need this
5786 	 * state to be globally applied to all requests being submitted
5787 	 * to this engine. Virtual engines encompass more than one physical
5788 	 * engine and so we cannot accurately tell in advance if one of those
5789 	 * engines is already saturated and so cannot afford to use a semaphore
5790 	 * and be pessimized in priority for doing so -- if we are the only
5791 	 * context using semaphores after all other clients have stopped, we
5792 	 * will be starved on the saturated system. Such a global switch for
5793 	 * semaphores is less than ideal, but alas is the current compromise.
5794 	 */
5795 	ve->base.saturated = ALL_ENGINES;
5796 
5797 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5798 
5799 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5800 	intel_engine_init_execlists(&ve->base);
5801 
5802 	ve->base.cops = &virtual_context_ops;
5803 	ve->base.request_alloc = execlists_request_alloc;
5804 
5805 	ve->base.schedule = i915_schedule;
5806 	ve->base.submit_request = virtual_submit_request;
5807 	ve->base.bond_execute = virtual_bond_execute;
5808 
5809 	INIT_LIST_HEAD(virtual_queue(ve));
5810 	ve->base.execlists.queue_priority_hint = INT_MIN;
5811 	tasklet_init(&ve->base.execlists.tasklet,
5812 		     virtual_submission_tasklet,
5813 		     (unsigned long)ve);
5814 
5815 	intel_context_init(&ve->context, &ve->base);
5816 
5817 	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5818 	if (!ve->base.breadcrumbs) {
5819 		err = -ENOMEM;
5820 		goto err_put;
5821 	}
5822 
5823 	for (n = 0; n < count; n++) {
5824 		struct intel_engine_cs *sibling = siblings[n];
5825 
5826 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5827 		if (sibling->mask & ve->base.mask) {
5828 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5829 				  sibling->name);
5830 			err = -EINVAL;
5831 			goto err_put;
5832 		}
5833 
5834 		/*
5835 		 * The virtual engine implementation is tightly coupled to
5836 		 * the execlists backend -- we push requests directly
5837 		 * into a tree inside each physical engine. We could support
5838 		 * layering if we handle cloning of the requests and
5839 		 * submitting a copy into each backend.
5840 		 */
5841 		if (sibling->execlists.tasklet.func !=
5842 		    execlists_submission_tasklet) {
5843 			err = -ENODEV;
5844 			goto err_put;
5845 		}
5846 
5847 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5848 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5849 
5850 		ve->siblings[ve->num_siblings++] = sibling;
5851 		ve->base.mask |= sibling->mask;
5852 
5853 		/*
5854 		 * All physical engines must be compatible for their emission
5855 		 * functions (as we build the instructions during request
5856 		 * construction and do not alter them before submission
5857 		 * on the physical engine). We use the engine class as a guide
5858 		 * here, although that could be refined.
5859 		 */
5860 		if (ve->base.class != OTHER_CLASS) {
5861 			if (ve->base.class != sibling->class) {
5862 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5863 					  sibling->class, ve->base.class);
5864 				err = -EINVAL;
5865 				goto err_put;
5866 			}
5867 			continue;
5868 		}
5869 
5870 		ve->base.class = sibling->class;
5871 		ve->base.uabi_class = sibling->uabi_class;
5872 		snprintf(ve->base.name, sizeof(ve->base.name),
5873 			 "v%dx%d", ve->base.class, count);
5874 		ve->base.context_size = sibling->context_size;
5875 
5876 		ve->base.emit_bb_start = sibling->emit_bb_start;
5877 		ve->base.emit_flush = sibling->emit_flush;
5878 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5879 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5880 		ve->base.emit_fini_breadcrumb_dw =
5881 			sibling->emit_fini_breadcrumb_dw;
5882 
5883 		ve->base.flags = sibling->flags;
5884 	}
5885 
5886 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5887 
5888 	virtual_engine_initial_hint(ve);
5889 	return &ve->context;
5890 
5891 err_put:
5892 	intel_context_put(&ve->context);
5893 	return ERR_PTR(err);
5894 }
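
/*
 * Usage sketch (assumed caller, the sibling lookup is illustrative):
 *
 *	struct intel_engine_cs *siblings[] = {
 *		gt->engine_class[VIDEO_DECODE_CLASS][0],
 *		gt->engine_class[VIDEO_DECODE_CLASS][1],
 *	};
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 * With count == 1 this degenerates to a plain context on siblings[0];
 * with more, requests on ce may run on whichever sibling is free.
 */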
5895 
5896 struct intel_context *
5897 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5898 {
5899 	struct virtual_engine *se = to_virtual_engine(src);
5900 	struct intel_context *dst;
5901 
5902 	dst = intel_execlists_create_virtual(se->siblings,
5903 					     se->num_siblings);
5904 	if (IS_ERR(dst))
5905 		return dst;
5906 
5907 	if (se->num_bonds) {
5908 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5909 
5910 		de->bonds = kmemdup(se->bonds,
5911 				    sizeof(*se->bonds) * se->num_bonds,
5912 				    GFP_KERNEL);
5913 		if (!de->bonds) {
5914 			intel_context_put(dst);
5915 			return ERR_PTR(-ENOMEM);
5916 		}
5917 
5918 		de->num_bonds = se->num_bonds;
5919 	}
5920 
5921 	return dst;
5922 }
5923 
5924 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5925 				     const struct intel_engine_cs *master,
5926 				     const struct intel_engine_cs *sibling)
5927 {
5928 	struct virtual_engine *ve = to_virtual_engine(engine);
5929 	struct ve_bond *bond;
5930 	int n;
5931 
5932 	/* Sanity check the sibling is part of the virtual engine */
5933 	for (n = 0; n < ve->num_siblings; n++)
5934 		if (sibling == ve->siblings[n])
5935 			break;
5936 	if (n == ve->num_siblings)
5937 		return -EINVAL;
5938 
5939 	bond = virtual_find_bond(ve, master);
5940 	if (bond) {
5941 		bond->sibling_mask |= sibling->mask;
5942 		return 0;
5943 	}
5944 
5945 	bond = krealloc(ve->bonds,
5946 			sizeof(*bond) * (ve->num_bonds + 1),
5947 			GFP_KERNEL);
5948 	if (!bond)
5949 		return -ENOMEM;
5950 
5951 	bond[ve->num_bonds].master = master;
5952 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5953 
5954 	ve->bonds = bond;
5955 	ve->num_bonds++;
5956 
5957 	return 0;
5958 }
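
/*
 * Usage sketch (assumed caller, engine names illustrative): a caller
 * wiring up bonds would make repeated calls of the form
 *
 *	err = intel_virtual_engine_attach_bond(ve_engine, rcs0, vcs1);
 *
 * where further calls for the same master simply OR additional
 * sibling->mask bits into the existing bond, as the early return above
 * shows.
 */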
5959 
5960 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5961 				   struct drm_printer *m,
5962 				   void (*show_request)(struct drm_printer *m,
5963 							struct i915_request *rq,
5964 							const char *prefix),
5965 				   unsigned int max)
5966 {
5967 	const struct intel_engine_execlists *execlists = &engine->execlists;
5968 	struct i915_request *rq, *last;
5969 	unsigned long flags;
5970 	unsigned int count;
5971 	struct rb_node *rb;
5972 
5973 	spin_lock_irqsave(&engine->active.lock, flags);
5974 
5975 	last = NULL;
5976 	count = 0;
5977 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5978 		if (count++ < max - 1)
5979 			show_request(m, rq, "\t\tE ");
5980 		else
5981 			last = rq;
5982 	}
5983 	if (last) {
5984 		if (count > max) {
5985 			drm_printf(m,
5986 				   "\t\t...skipping %d executing requests...\n",
5987 				   count - max);
5988 		}
5989 		show_request(m, last, "\t\tE ");
5990 	}
5991 
5992 	if (execlists->switch_priority_hint != INT_MIN)
5993 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5994 			   READ_ONCE(execlists->switch_priority_hint));
5995 	if (execlists->queue_priority_hint != INT_MIN)
5996 		drm_printf(m, "\t\tQueue priority hint: %d\n",
5997 			   READ_ONCE(execlists->queue_priority_hint));
5998 
5999 	last = NULL;
6000 	count = 0;
6001 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6002 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6003 		int i;
6004 
6005 		priolist_for_each_request(rq, p, i) {
6006 			if (count++ < max - 1)
6007 				show_request(m, rq, "\t\tQ ");
6008 			else
6009 				last = rq;
6010 		}
6011 	}
6012 	if (last) {
6013 		if (count > max) {
6014 			drm_printf(m,
6015 				   "\t\t...skipping %d queued requests...\n",
6016 				   count - max);
6017 		}
6018 		show_request(m, last, "\t\tQ ");
6019 	}
6020 
6021 	last = NULL;
6022 	count = 0;
6023 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6024 		struct virtual_engine *ve =
6025 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6026 		struct i915_request *rq = READ_ONCE(ve->request);
6027 
6028 		if (rq) {
6029 			if (count++ < max - 1)
6030 				show_request(m, rq, "\t\tV ");
6031 			else
6032 				last = rq;
6033 		}
6034 	}
6035 	if (last) {
6036 		if (count > max) {
6037 			drm_printf(m,
6038 				   "\t\t...skipping %d virtual requests...\n",
6039 				   count - max);
6040 		}
6041 		show_request(m, last, "\t\tV ");
6042 	}
6043 
6044 	spin_unlock_irqrestore(&engine->active.lock, flags);
6045 }
6046 
6047 void intel_lr_context_reset(struct intel_engine_cs *engine,
6048 			    struct intel_context *ce,
6049 			    u32 head,
6050 			    bool scrub)
6051 {
6052 	GEM_BUG_ON(!intel_context_is_pinned(ce));
6053 
6054 	/*
6055 	 * We want a simple context + ring to execute the breadcrumb update.
6056 	 * We cannot rely on the context being intact across the GPU hang,
6057 	 * so clear it and rebuild just what we need for the breadcrumb.
6058 	 * All pending requests for this context will be zapped, and any
6059 	 * future request will be after userspace has had the opportunity
6060 	 * to recreate its own state.
6061 	 */
6062 	if (scrub)
6063 		restore_default_state(ce, engine);
6064 
6065 	/* Rerun the request; its payload has been neutered (if guilty). */
6066 	__execlists_update_reg_state(ce, engine, head);
6067 }
6068 
6069 bool
6070 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6071 {
6072 	return engine->set_default_submission ==
6073 	       intel_execlists_set_default_submission;
6074 }
6075 
6076 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6077 #include "selftest_lrc.c"
6078 #endif
6079