xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision b5b349b9)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
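/*
 * Illustrative sketch of the submission flow described above (simplified
 * pseudo-code only, not the driver's actual functions or names):
 *
 *	submit(rq):
 *		write rq's commands and breadcrumb into its context's ring;
 *		add rq (recording the new ring tail) to the engine's queue;
 *		if the queue was idle, write up to two context descriptors
 *		into the ELSP to start execution;
 *
 *	csb_interrupt():
 *		for each context-complete event in the context status buffer,
 *		retire the matching request at the head of the queue;
 *		if requests remain, coalesce same-context requests and submit
 *		a fresh pair of descriptors to the ELSP;
 */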
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152 
153 #define RING_EXECLIST_QFULL		(1 << 0x2)
154 #define RING_EXECLIST1_VALID		(1 << 0x3)
155 #define RING_EXECLIST0_VALID		(1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
159 
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
166 
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169 
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171 
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID		0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
178 
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181 
182 struct virtual_engine {
183 	struct intel_engine_cs base;
184 	struct intel_context context;
185 
186 	/*
187 	 * We allow only a single request through the virtual engine at a time
188 	 * (each request in the timeline waits for the completion fence of
189 	 * the previous before being submitted). By restricting ourselves to
190 	 * only submitting a single request, each request is placed on to a
191 	 * physical engine to maximise load spreading (by virtue of the late greedy
192 	 * scheduling -- each real engine takes the next available request
193 	 * upon idling).
194 	 */
195 	struct i915_request *request;
196 
197 	/*
198 	 * We keep a rbtree of available virtual engines inside each physical
199 	 * engine, sorted by priority. Here we preallocate the nodes we need
200 	 * for the virtual engine, indexed by physical_engine->id.
201 	 */
202 	struct ve_node {
203 		struct rb_node rb;
204 		int prio;
205 	} nodes[I915_NUM_ENGINES];
206 
207 	/*
208 	 * Keep track of bonded pairs -- restrictions upon our selection
209 	 * of physical engines any particular request may be submitted to.
210 	 * If we receive a submit-fence from a master engine, we will only
211 	 * use one of sibling_mask physical engines.
212 	 */
213 	struct ve_bond {
214 		const struct intel_engine_cs *master;
215 		intel_engine_mask_t sibling_mask;
216 	} *bonds;
217 	unsigned int num_bonds;
218 
219 	/* And finally, which physical engines this virtual engine maps onto. */
220 	unsigned int num_siblings;
221 	struct intel_engine_cs *siblings[];
222 };
223 
224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 {
226 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 	return container_of(engine, struct virtual_engine, base);
228 }
229 
230 static int __execlists_context_alloc(struct intel_context *ce,
231 				     struct intel_engine_cs *engine);
232 
233 static void execlists_init_reg_state(u32 *reg_state,
234 				     const struct intel_context *ce,
235 				     const struct intel_engine_cs *engine,
236 				     const struct intel_ring *ring,
237 				     bool close);
238 static void
239 __execlists_update_reg_state(const struct intel_context *ce,
240 			     const struct intel_engine_cs *engine,
241 			     u32 head);
242 
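/*
 * Most of the lrc_ring_*() helpers below return the dword index at which
 * the named register's offset is stored in the logical ring context image
 * (the register's value then lives at regs[index + 1]); a return of -1
 * means the register is not part of the context layout for this
 * engine/gen.
 */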
243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
244 {
245 	if (INTEL_GEN(engine->i915) >= 12)
246 		return 0x60;
247 	else if (INTEL_GEN(engine->i915) >= 9)
248 		return 0x54;
249 	else if (engine->class == RENDER_CLASS)
250 		return 0x58;
251 	else
252 		return -1;
253 }
254 
255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
256 {
257 	if (INTEL_GEN(engine->i915) >= 12)
258 		return 0x74;
259 	else if (INTEL_GEN(engine->i915) >= 9)
260 		return 0x68;
261 	else if (engine->class == RENDER_CLASS)
262 		return 0xd8;
263 	else
264 		return -1;
265 }
266 
267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
268 {
269 	if (INTEL_GEN(engine->i915) >= 12)
270 		return 0x12;
271 	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
272 		return 0x18;
273 	else
274 		return -1;
275 }
276 
277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
278 {
279 	int x;
280 
281 	x = lrc_ring_wa_bb_per_ctx(engine);
282 	if (x < 0)
283 		return x;
284 
285 	return x + 2;
286 }
287 
288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
289 {
290 	int x;
291 
292 	x = lrc_ring_indirect_ptr(engine);
293 	if (x < 0)
294 		return x;
295 
296 	return x + 2;
297 }
298 
299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
300 {
301 	if (engine->class != RENDER_CLASS)
302 		return -1;
303 
304 	if (INTEL_GEN(engine->i915) >= 12)
305 		return 0xb6;
306 	else if (INTEL_GEN(engine->i915) >= 11)
307 		return 0xaa;
308 	else
309 		return -1;
310 }
311 
312 static u32
313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
314 {
315 	switch (INTEL_GEN(engine->i915)) {
316 	default:
317 		MISSING_CASE(INTEL_GEN(engine->i915));
318 		fallthrough;
319 	case 12:
320 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
321 	case 11:
322 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
323 	case 10:
324 		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
325 	case 9:
326 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
327 	case 8:
328 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
329 	}
330 }
331 
332 static void
333 lrc_ring_setup_indirect_ctx(u32 *regs,
334 			    const struct intel_engine_cs *engine,
335 			    u32 ctx_bb_ggtt_addr,
336 			    u32 size)
337 {
338 	GEM_BUG_ON(!size);
339 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
340 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
341 	regs[lrc_ring_indirect_ptr(engine) + 1] =
342 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
343 
344 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
345 	regs[lrc_ring_indirect_offset(engine) + 1] =
346 		lrc_ring_indirect_offset_default(engine) << 6;
347 }
348 
349 static u32 intel_context_get_runtime(const struct intel_context *ce)
350 {
351 	/*
352 	 * We can use either ppHWSP[16] which is recorded before the context
353 	 * switch (and so excludes the cost of context switches) or use the
354 	 * value from the context image itself, which is saved/restored earlier
355 	 * and so includes the cost of the save.
356 	 */
357 	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
358 }
359 
360 static void mark_eio(struct i915_request *rq)
361 {
362 	if (i915_request_completed(rq))
363 		return;
364 
365 	GEM_BUG_ON(i915_request_signaled(rq));
366 
367 	i915_request_set_error_once(rq, -EIO);
368 	i915_request_mark_complete(rq);
369 }
370 
371 static struct i915_request *
372 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
373 {
374 	struct i915_request *active = rq;
375 
376 	rcu_read_lock();
377 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
378 		if (i915_request_completed(rq))
379 			break;
380 
381 		active = rq;
382 	}
383 	rcu_read_unlock();
384 
385 	return active;
386 }
387 
388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
389 {
390 	return (i915_ggtt_offset(engine->status_page.vma) +
391 		I915_GEM_HWS_PREEMPT_ADDR);
392 }
393 
394 static inline void
395 ring_set_paused(const struct intel_engine_cs *engine, int state)
396 {
397 	/*
398 	 * We inspect HWS_PREEMPT with a semaphore inside
399 	 * engine->emit_fini_breadcrumb. If the dword is true,
400 	 * the ring is paused as the semaphore will busywait
401 	 * until the dword is false.
402 	 */
403 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
404 	if (state)
405 		wmb();
406 }
407 
408 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
409 {
410 	return rb_entry(rb, struct i915_priolist, node);
411 }
412 
413 static inline int rq_prio(const struct i915_request *rq)
414 {
415 	return READ_ONCE(rq->sched.attr.priority);
416 }
417 
418 static int effective_prio(const struct i915_request *rq)
419 {
420 	int prio = rq_prio(rq);
421 
422 	/*
423 	 * If this request is special and must not be interrupted at any
424 	 * cost, so be it. Note we are only checking the most recent request
425 	 * in the context and so may be masking an earlier vip request. It
426 	 * is hoped that under the conditions where nopreempt is used, this
427 	 * will not matter (i.e. all requests to that context will be
428 	 * nopreempt for as long as desired).
429 	 */
430 	if (i915_request_has_nopreempt(rq))
431 		prio = I915_PRIORITY_UNPREEMPTABLE;
432 
433 	return prio;
434 }
435 
436 static int queue_prio(const struct intel_engine_execlists *execlists)
437 {
438 	struct i915_priolist *p;
439 	struct rb_node *rb;
440 
441 	rb = rb_first_cached(&execlists->queue);
442 	if (!rb)
443 		return INT_MIN;
444 
445 	/*
446 	 * As the priolist[] are inverted, with the highest priority in [0],
447 	 * we have to flip the index value back into a priority.
448 	 */
449 	p = to_priolist(rb);
450 	if (!I915_USER_PRIORITY_SHIFT)
451 		return p->priority;
452 
453 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
454 }
455 
456 static inline bool need_preempt(const struct intel_engine_cs *engine,
457 				const struct i915_request *rq,
458 				struct rb_node *rb)
459 {
460 	int last_prio;
461 
462 	if (!intel_engine_has_semaphores(engine))
463 		return false;
464 
465 	/*
466 	 * Check if the current priority hint merits a preemption attempt.
467 	 *
468 	 * We record the highest value priority we saw during rescheduling
469 	 * prior to this dequeue, therefore we know that if it is strictly
470 	 * less than the current tail of ELSP[0], we do not need to force
471 	 * a preempt-to-idle cycle.
472 	 *
473 	 * However, the priority hint is a mere hint that we may need to
474 	 * preempt. If that hint is stale or we may be trying to preempt
475 	 * ourselves, ignore the request.
476 	 *
477 	 * More naturally we would write
478 	 *      prio >= max(0, last);
479 	 * except that we wish to prevent triggering preemption at the same
480 	 * priority level: the task that is running should remain running
481 	 * to preserve FIFO ordering of dependencies.
482 	 */
483 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
484 	if (engine->execlists.queue_priority_hint <= last_prio)
485 		return false;
486 
487 	/*
488 	 * Check against the first request in ELSP[1]; it will, thanks to the
489 	 * power of PI, be the highest priority of that context.
490 	 */
491 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
492 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
493 		return true;
494 
495 	if (rb) {
496 		struct virtual_engine *ve =
497 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
498 		bool preempt = false;
499 
500 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
501 			struct i915_request *next;
502 
503 			rcu_read_lock();
504 			next = READ_ONCE(ve->request);
505 			if (next)
506 				preempt = rq_prio(next) > last_prio;
507 			rcu_read_unlock();
508 		}
509 
510 		if (preempt)
511 			return preempt;
512 	}
513 
514 	/*
515 	 * If the inflight context did not trigger the preemption, then maybe
516 	 * it was the set of queued requests? Pick the highest priority in
517 	 * the queue (the first active priolist) and see if it deserves to be
518 	 * running instead of ELSP[0].
519 	 *
520 	 * The highest priority request in the queue can not be either
521 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
522 	 * context, its priority would not exceed ELSP[0] aka last_prio.
523 	 */
524 	return queue_prio(&engine->execlists) > last_prio;
525 }
526 
527 __maybe_unused static inline bool
528 assert_priority_queue(const struct i915_request *prev,
529 		      const struct i915_request *next)
530 {
531 	/*
532 	 * Without preemption, the prev may refer to the still active element
533 	 * which we refuse to let go.
534 	 *
535 	 * Even with preemption, there are times when we think it is better not
536 	 * to preempt and leave an ostensibly lower priority request in flight.
537 	 */
538 	if (i915_request_is_active(prev))
539 		return true;
540 
541 	return rq_prio(prev) >= rq_prio(next);
542 }
543 
544 /*
545  * The context descriptor encodes various attributes of a context,
546  * including its GTT address and some flags. Because it's fairly
547  * expensive to calculate, we'll just do it once and cache the result,
548  * which remains valid until the context is unpinned.
549  *
550  * This is what a descriptor looks like, from LSB to MSB::
551  *
552  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
553  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
554  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
555  *      bits 53-54:    mbz, reserved for use by hardware
556  *      bits 55-63:    group ID, currently unused and set to 0
557  *
558  * Starting from Gen11, the upper dword of the descriptor has a new format:
559  *
560  *      bits 32-36:    reserved
561  *      bits 37-47:    SW context ID
562  *      bits 48-53:    engine instance
563  *      bit 54:        mbz, reserved for use by hardware
564  *      bits 55-60:    SW counter
565  *      bits 61-63:    engine class
566  *
567  * engine info, SW context ID and SW counter need to form a unique number
568  * (Context ID) per lrc.
569  */
570 static u32
571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
572 {
573 	u32 desc;
574 
575 	desc = INTEL_LEGACY_32B_CONTEXT;
576 	if (i915_vm_is_4lvl(ce->vm))
577 		desc = INTEL_LEGACY_64B_CONTEXT;
578 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
579 
580 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
581 	if (IS_GEN(engine->i915, 8))
582 		desc |= GEN8_CTX_L3LLC_COHERENT;
583 
584 	return i915_ggtt_offset(ce->state) | desc;
585 }
586 
587 static inline unsigned int dword_in_page(void *addr)
588 {
589 	return offset_in_page(addr) / sizeof(u32);
590 }
591 
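/*
 * The reg_offsets tables below use a compact byte encoding which
 * set_offsets() decodes to build the MI_LOAD_REGISTER_IMM sequences of a
 * default context image:
 *
 *	NOP(n)    - skip n dwords of the image (filled with MI_NOOP when
 *	            clearing)
 *	LRI(n, f) - start an MI_LOAD_REGISTER_IMM(n) with flags f (e.g. POSTED)
 *	REG(x)    - a register offset below 0x200, encoded in a single byte
 *	REG16(x)  - a larger register offset, encoded in two bytes
 *	END(sz)   - terminator carrying the total state size in dwords, used
 *	            to clear the remainder of the image
 *
 * Offsets are relative to the engine's mmio base; set_offsets() writes out
 * the absolute register addresses.
 */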
592 static void set_offsets(u32 *regs,
593 			const u8 *data,
594 			const struct intel_engine_cs *engine,
595 			bool clear)
596 #define NOP(x) (BIT(7) | (x))
597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
598 #define POSTED BIT(0)
599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
600 #define REG16(x) \
601 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
602 	(((x) >> 2) & 0x7f)
603 #define END(total_state_size) 0, (total_state_size)
604 {
605 	const u32 base = engine->mmio_base;
606 
607 	while (*data) {
608 		u8 count, flags;
609 
610 		if (*data & BIT(7)) { /* skip */
611 			count = *data++ & ~BIT(7);
612 			if (clear)
613 				memset32(regs, MI_NOOP, count);
614 			regs += count;
615 			continue;
616 		}
617 
618 		count = *data & 0x3f;
619 		flags = *data >> 6;
620 		data++;
621 
622 		*regs = MI_LOAD_REGISTER_IMM(count);
623 		if (flags & POSTED)
624 			*regs |= MI_LRI_FORCE_POSTED;
625 		if (INTEL_GEN(engine->i915) >= 11)
626 			*regs |= MI_LRI_LRM_CS_MMIO;
627 		regs++;
628 
629 		GEM_BUG_ON(!count);
630 		do {
631 			u32 offset = 0;
632 			u8 v;
633 
634 			do {
635 				v = *data++;
636 				offset <<= 7;
637 				offset |= v & ~BIT(7);
638 			} while (v & BIT(7));
639 
640 			regs[0] = base + (offset << 2);
641 			if (clear)
642 				regs[1] = 0;
643 			regs += 2;
644 		} while (--count);
645 	}
646 
647 	if (clear) {
648 		u8 count = *++data;
649 
650 		/* Clear past the tail for HW access */
651 		GEM_BUG_ON(dword_in_page(regs) > count);
652 		memset32(regs, MI_NOOP, count - dword_in_page(regs));
653 
654 		/* Close the batch; used mainly by live_lrc_layout() */
655 		*regs = MI_BATCH_BUFFER_END;
656 		if (INTEL_GEN(engine->i915) >= 10)
657 			*regs |= BIT(0);
658 	}
659 }
660 
661 static const u8 gen8_xcs_offsets[] = {
662 	NOP(1),
663 	LRI(11, 0),
664 	REG16(0x244),
665 	REG(0x034),
666 	REG(0x030),
667 	REG(0x038),
668 	REG(0x03c),
669 	REG(0x168),
670 	REG(0x140),
671 	REG(0x110),
672 	REG(0x11c),
673 	REG(0x114),
674 	REG(0x118),
675 
676 	NOP(9),
677 	LRI(9, 0),
678 	REG16(0x3a8),
679 	REG16(0x28c),
680 	REG16(0x288),
681 	REG16(0x284),
682 	REG16(0x280),
683 	REG16(0x27c),
684 	REG16(0x278),
685 	REG16(0x274),
686 	REG16(0x270),
687 
688 	NOP(13),
689 	LRI(2, 0),
690 	REG16(0x200),
691 	REG(0x028),
692 
693 	END(80)
694 };
695 
696 static const u8 gen9_xcs_offsets[] = {
697 	NOP(1),
698 	LRI(14, POSTED),
699 	REG16(0x244),
700 	REG(0x034),
701 	REG(0x030),
702 	REG(0x038),
703 	REG(0x03c),
704 	REG(0x168),
705 	REG(0x140),
706 	REG(0x110),
707 	REG(0x11c),
708 	REG(0x114),
709 	REG(0x118),
710 	REG(0x1c0),
711 	REG(0x1c4),
712 	REG(0x1c8),
713 
714 	NOP(3),
715 	LRI(9, POSTED),
716 	REG16(0x3a8),
717 	REG16(0x28c),
718 	REG16(0x288),
719 	REG16(0x284),
720 	REG16(0x280),
721 	REG16(0x27c),
722 	REG16(0x278),
723 	REG16(0x274),
724 	REG16(0x270),
725 
726 	NOP(13),
727 	LRI(1, POSTED),
728 	REG16(0x200),
729 
730 	NOP(13),
731 	LRI(44, POSTED),
732 	REG(0x028),
733 	REG(0x09c),
734 	REG(0x0c0),
735 	REG(0x178),
736 	REG(0x17c),
737 	REG16(0x358),
738 	REG(0x170),
739 	REG(0x150),
740 	REG(0x154),
741 	REG(0x158),
742 	REG16(0x41c),
743 	REG16(0x600),
744 	REG16(0x604),
745 	REG16(0x608),
746 	REG16(0x60c),
747 	REG16(0x610),
748 	REG16(0x614),
749 	REG16(0x618),
750 	REG16(0x61c),
751 	REG16(0x620),
752 	REG16(0x624),
753 	REG16(0x628),
754 	REG16(0x62c),
755 	REG16(0x630),
756 	REG16(0x634),
757 	REG16(0x638),
758 	REG16(0x63c),
759 	REG16(0x640),
760 	REG16(0x644),
761 	REG16(0x648),
762 	REG16(0x64c),
763 	REG16(0x650),
764 	REG16(0x654),
765 	REG16(0x658),
766 	REG16(0x65c),
767 	REG16(0x660),
768 	REG16(0x664),
769 	REG16(0x668),
770 	REG16(0x66c),
771 	REG16(0x670),
772 	REG16(0x674),
773 	REG16(0x678),
774 	REG16(0x67c),
775 	REG(0x068),
776 
777 	END(176)
778 };
779 
780 static const u8 gen12_xcs_offsets[] = {
781 	NOP(1),
782 	LRI(13, POSTED),
783 	REG16(0x244),
784 	REG(0x034),
785 	REG(0x030),
786 	REG(0x038),
787 	REG(0x03c),
788 	REG(0x168),
789 	REG(0x140),
790 	REG(0x110),
791 	REG(0x1c0),
792 	REG(0x1c4),
793 	REG(0x1c8),
794 	REG(0x180),
795 	REG16(0x2b4),
796 
797 	NOP(5),
798 	LRI(9, POSTED),
799 	REG16(0x3a8),
800 	REG16(0x28c),
801 	REG16(0x288),
802 	REG16(0x284),
803 	REG16(0x280),
804 	REG16(0x27c),
805 	REG16(0x278),
806 	REG16(0x274),
807 	REG16(0x270),
808 
809 	END(80)
810 };
811 
812 static const u8 gen8_rcs_offsets[] = {
813 	NOP(1),
814 	LRI(14, POSTED),
815 	REG16(0x244),
816 	REG(0x034),
817 	REG(0x030),
818 	REG(0x038),
819 	REG(0x03c),
820 	REG(0x168),
821 	REG(0x140),
822 	REG(0x110),
823 	REG(0x11c),
824 	REG(0x114),
825 	REG(0x118),
826 	REG(0x1c0),
827 	REG(0x1c4),
828 	REG(0x1c8),
829 
830 	NOP(3),
831 	LRI(9, POSTED),
832 	REG16(0x3a8),
833 	REG16(0x28c),
834 	REG16(0x288),
835 	REG16(0x284),
836 	REG16(0x280),
837 	REG16(0x27c),
838 	REG16(0x278),
839 	REG16(0x274),
840 	REG16(0x270),
841 
842 	NOP(13),
843 	LRI(1, 0),
844 	REG(0x0c8),
845 
846 	END(80)
847 };
848 
849 static const u8 gen9_rcs_offsets[] = {
850 	NOP(1),
851 	LRI(14, POSTED),
852 	REG16(0x244),
853 	REG(0x34),
854 	REG(0x30),
855 	REG(0x38),
856 	REG(0x3c),
857 	REG(0x168),
858 	REG(0x140),
859 	REG(0x110),
860 	REG(0x11c),
861 	REG(0x114),
862 	REG(0x118),
863 	REG(0x1c0),
864 	REG(0x1c4),
865 	REG(0x1c8),
866 
867 	NOP(3),
868 	LRI(9, POSTED),
869 	REG16(0x3a8),
870 	REG16(0x28c),
871 	REG16(0x288),
872 	REG16(0x284),
873 	REG16(0x280),
874 	REG16(0x27c),
875 	REG16(0x278),
876 	REG16(0x274),
877 	REG16(0x270),
878 
879 	NOP(13),
880 	LRI(1, 0),
881 	REG(0xc8),
882 
883 	NOP(13),
884 	LRI(44, POSTED),
885 	REG(0x28),
886 	REG(0x9c),
887 	REG(0xc0),
888 	REG(0x178),
889 	REG(0x17c),
890 	REG16(0x358),
891 	REG(0x170),
892 	REG(0x150),
893 	REG(0x154),
894 	REG(0x158),
895 	REG16(0x41c),
896 	REG16(0x600),
897 	REG16(0x604),
898 	REG16(0x608),
899 	REG16(0x60c),
900 	REG16(0x610),
901 	REG16(0x614),
902 	REG16(0x618),
903 	REG16(0x61c),
904 	REG16(0x620),
905 	REG16(0x624),
906 	REG16(0x628),
907 	REG16(0x62c),
908 	REG16(0x630),
909 	REG16(0x634),
910 	REG16(0x638),
911 	REG16(0x63c),
912 	REG16(0x640),
913 	REG16(0x644),
914 	REG16(0x648),
915 	REG16(0x64c),
916 	REG16(0x650),
917 	REG16(0x654),
918 	REG16(0x658),
919 	REG16(0x65c),
920 	REG16(0x660),
921 	REG16(0x664),
922 	REG16(0x668),
923 	REG16(0x66c),
924 	REG16(0x670),
925 	REG16(0x674),
926 	REG16(0x678),
927 	REG16(0x67c),
928 	REG(0x68),
929 
930 	END(176)
931 };
932 
933 static const u8 gen11_rcs_offsets[] = {
934 	NOP(1),
935 	LRI(15, POSTED),
936 	REG16(0x244),
937 	REG(0x034),
938 	REG(0x030),
939 	REG(0x038),
940 	REG(0x03c),
941 	REG(0x168),
942 	REG(0x140),
943 	REG(0x110),
944 	REG(0x11c),
945 	REG(0x114),
946 	REG(0x118),
947 	REG(0x1c0),
948 	REG(0x1c4),
949 	REG(0x1c8),
950 	REG(0x180),
951 
952 	NOP(1),
953 	LRI(9, POSTED),
954 	REG16(0x3a8),
955 	REG16(0x28c),
956 	REG16(0x288),
957 	REG16(0x284),
958 	REG16(0x280),
959 	REG16(0x27c),
960 	REG16(0x278),
961 	REG16(0x274),
962 	REG16(0x270),
963 
964 	LRI(1, POSTED),
965 	REG(0x1b0),
966 
967 	NOP(10),
968 	LRI(1, 0),
969 	REG(0x0c8),
970 
971 	END(80)
972 };
973 
974 static const u8 gen12_rcs_offsets[] = {
975 	NOP(1),
976 	LRI(13, POSTED),
977 	REG16(0x244),
978 	REG(0x034),
979 	REG(0x030),
980 	REG(0x038),
981 	REG(0x03c),
982 	REG(0x168),
983 	REG(0x140),
984 	REG(0x110),
985 	REG(0x1c0),
986 	REG(0x1c4),
987 	REG(0x1c8),
988 	REG(0x180),
989 	REG16(0x2b4),
990 
991 	NOP(5),
992 	LRI(9, POSTED),
993 	REG16(0x3a8),
994 	REG16(0x28c),
995 	REG16(0x288),
996 	REG16(0x284),
997 	REG16(0x280),
998 	REG16(0x27c),
999 	REG16(0x278),
1000 	REG16(0x274),
1001 	REG16(0x270),
1002 
1003 	LRI(3, POSTED),
1004 	REG(0x1b0),
1005 	REG16(0x5a8),
1006 	REG16(0x5ac),
1007 
1008 	NOP(6),
1009 	LRI(1, 0),
1010 	REG(0x0c8),
1011 	NOP(3 + 9 + 1),
1012 
1013 	LRI(51, POSTED),
1014 	REG16(0x588),
1015 	REG16(0x588),
1016 	REG16(0x588),
1017 	REG16(0x588),
1018 	REG16(0x588),
1019 	REG16(0x588),
1020 	REG(0x028),
1021 	REG(0x09c),
1022 	REG(0x0c0),
1023 	REG(0x178),
1024 	REG(0x17c),
1025 	REG16(0x358),
1026 	REG(0x170),
1027 	REG(0x150),
1028 	REG(0x154),
1029 	REG(0x158),
1030 	REG16(0x41c),
1031 	REG16(0x600),
1032 	REG16(0x604),
1033 	REG16(0x608),
1034 	REG16(0x60c),
1035 	REG16(0x610),
1036 	REG16(0x614),
1037 	REG16(0x618),
1038 	REG16(0x61c),
1039 	REG16(0x620),
1040 	REG16(0x624),
1041 	REG16(0x628),
1042 	REG16(0x62c),
1043 	REG16(0x630),
1044 	REG16(0x634),
1045 	REG16(0x638),
1046 	REG16(0x63c),
1047 	REG16(0x640),
1048 	REG16(0x644),
1049 	REG16(0x648),
1050 	REG16(0x64c),
1051 	REG16(0x650),
1052 	REG16(0x654),
1053 	REG16(0x658),
1054 	REG16(0x65c),
1055 	REG16(0x660),
1056 	REG16(0x664),
1057 	REG16(0x668),
1058 	REG16(0x66c),
1059 	REG16(0x670),
1060 	REG16(0x674),
1061 	REG16(0x678),
1062 	REG16(0x67c),
1063 	REG(0x068),
1064 	REG(0x084),
1065 	NOP(1),
1066 
1067 	END(192)
1068 };
1069 
1070 #undef END
1071 #undef REG16
1072 #undef REG
1073 #undef LRI
1074 #undef NOP
1075 
1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1077 {
1078 	/*
1079 	 * The gen12+ lists only have the registers we program in the basic
1080 	 * default state. We rely on the context image using relative
1081 	 * addressing to automatically fix up the register state between the
1082 	 * physical engines for a virtual engine.
1083 	 */
1084 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1085 		   !intel_engine_has_relative_mmio(engine));
1086 
1087 	if (engine->class == RENDER_CLASS) {
1088 		if (INTEL_GEN(engine->i915) >= 12)
1089 			return gen12_rcs_offsets;
1090 		else if (INTEL_GEN(engine->i915) >= 11)
1091 			return gen11_rcs_offsets;
1092 		else if (INTEL_GEN(engine->i915) >= 9)
1093 			return gen9_rcs_offsets;
1094 		else
1095 			return gen8_rcs_offsets;
1096 	} else {
1097 		if (INTEL_GEN(engine->i915) >= 12)
1098 			return gen12_xcs_offsets;
1099 		else if (INTEL_GEN(engine->i915) >= 9)
1100 			return gen9_xcs_offsets;
1101 		else
1102 			return gen8_xcs_offsets;
1103 	}
1104 }
1105 
1106 static struct i915_request *
1107 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 {
1109 	struct i915_request *rq, *rn, *active = NULL;
1110 	struct list_head *pl;
1111 	int prio = I915_PRIORITY_INVALID;
1112 
1113 	lockdep_assert_held(&engine->active.lock);
1114 
1115 	list_for_each_entry_safe_reverse(rq, rn,
1116 					 &engine->active.requests,
1117 					 sched.link) {
1118 		if (i915_request_completed(rq))
1119 			continue; /* XXX */
1120 
1121 		__i915_request_unsubmit(rq);
1122 
1123 		/*
1124 		 * Push the request back into the queue for later resubmission.
1125 		 * If this request is not native to this physical engine (i.e.
1126 		 * it came from a virtual source), push it back onto the virtual
1127 		 * engine so that it can be moved across onto another physical
1128 		 * engine as load dictates.
1129 		 */
1130 		if (likely(rq->execution_mask == engine->mask)) {
1131 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1132 			if (rq_prio(rq) != prio) {
1133 				prio = rq_prio(rq);
1134 				pl = i915_sched_lookup_priolist(engine, prio);
1135 			}
1136 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137 
1138 			list_move(&rq->sched.link, pl);
1139 			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140 
1141 			/* Check in case we roll back so far that we wrap [size/2] */
1142 			if (intel_ring_direction(rq->ring,
1143 						 rq->tail,
1144 						 rq->ring->tail + 8) > 0)
1145 				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146 
1147 			active = rq;
1148 		} else {
1149 			struct intel_engine_cs *owner = rq->context->engine;
1150 
1151 			WRITE_ONCE(rq->engine, owner);
1152 			owner->submit_request(rq);
1153 			active = NULL;
1154 		}
1155 	}
1156 
1157 	return active;
1158 }
1159 
1160 struct i915_request *
1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1162 {
1163 	struct intel_engine_cs *engine =
1164 		container_of(execlists, typeof(*engine), execlists);
1165 
1166 	return __unwind_incomplete_requests(engine);
1167 }
1168 
1169 static inline void
1170 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1171 {
1172 	/*
1173 	 * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1174 	 * the compiler should eliminate this function as dead code.
1175 	 */
1176 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1177 		return;
1178 
1179 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1180 				   status, rq);
1181 }
1182 
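/*
 * Engine busyness tracking: stats.active counts the contexts currently
 * executing on the engine. The first schedule-in records a start timestamp
 * and the last schedule-out folds the elapsed time into stats.total, with
 * stats.lock (a seqlock) guarding readers that sample start/total
 * concurrently.
 */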
1183 static void intel_engine_context_in(struct intel_engine_cs *engine)
1184 {
1185 	unsigned long flags;
1186 
1187 	if (atomic_add_unless(&engine->stats.active, 1, 0))
1188 		return;
1189 
1190 	write_seqlock_irqsave(&engine->stats.lock, flags);
1191 	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1192 		engine->stats.start = ktime_get();
1193 		atomic_inc(&engine->stats.active);
1194 	}
1195 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1196 }
1197 
1198 static void intel_engine_context_out(struct intel_engine_cs *engine)
1199 {
1200 	unsigned long flags;
1201 
1202 	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1203 
1204 	if (atomic_add_unless(&engine->stats.active, -1, 1))
1205 		return;
1206 
1207 	write_seqlock_irqsave(&engine->stats.lock, flags);
1208 	if (atomic_dec_and_test(&engine->stats.active)) {
1209 		engine->stats.total =
1210 			ktime_add(engine->stats.total,
1211 				  ktime_sub(ktime_get(), engine->stats.start));
1212 	}
1213 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1214 }
1215 
1216 static void
1217 execlists_check_context(const struct intel_context *ce,
1218 			const struct intel_engine_cs *engine,
1219 			const char *when)
1220 {
1221 	const struct intel_ring *ring = ce->ring;
1222 	u32 *regs = ce->lrc_reg_state;
1223 	bool valid = true;
1224 	int x;
1225 
1226 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1227 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1228 		       engine->name,
1229 		       regs[CTX_RING_START],
1230 		       i915_ggtt_offset(ring->vma));
1231 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1232 		valid = false;
1233 	}
1234 
1235 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1236 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1237 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1238 		       engine->name,
1239 		       regs[CTX_RING_CTL],
1240 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1241 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1242 		valid = false;
1243 	}
1244 
1245 	x = lrc_ring_mi_mode(engine);
1246 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1247 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1248 		       engine->name, regs[x + 1]);
1249 		regs[x + 1] &= ~STOP_RING;
1250 		regs[x + 1] |= STOP_RING << 16;
1251 		valid = false;
1252 	}
1253 
1254 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1255 }
1256 
1257 static void restore_default_state(struct intel_context *ce,
1258 				  struct intel_engine_cs *engine)
1259 {
1260 	u32 *regs;
1261 
1262 	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1263 	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1264 
1265 	ce->runtime.last = intel_context_get_runtime(ce);
1266 }
1267 
1268 static void reset_active(struct i915_request *rq,
1269 			 struct intel_engine_cs *engine)
1270 {
1271 	struct intel_context * const ce = rq->context;
1272 	u32 head;
1273 
1274 	/*
1275 	 * The executing context has been cancelled. We want to prevent
1276 	 * further execution along this context and propagate the error on
1277 	 * to anything depending on its results.
1278 	 *
1279 	 * In __i915_request_submit(), we apply the -EIO and remove the
1280 	 * requests' payloads for any banned requests. But first, we must
1281 	 * rewind the context back to the start of the incomplete request so
1282 	 * that we do not jump back into the middle of the batch.
1283 	 *
1284 	 * We preserve the breadcrumbs and semaphores of the incomplete
1285 	 * requests so that inter-timeline dependencies (i.e other timelines)
1286 	 * remain correctly ordered. And we defer to __i915_request_submit()
1287 	 * so that all asynchronous waits are correctly handled.
1288 	 */
1289 	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1290 		     rq->fence.context, rq->fence.seqno);
1291 
1292 	/* On resubmission of the active request, payload will be scrubbed */
1293 	if (i915_request_completed(rq))
1294 		head = rq->tail;
1295 	else
1296 		head = active_request(ce->timeline, rq)->head;
1297 	head = intel_ring_wrap(ce->ring, head);
1298 
1299 	/* Scrub the context image to prevent replaying the previous batch */
1300 	restore_default_state(ce, engine);
1301 	__execlists_update_reg_state(ce, engine, head);
1302 
1303 	/* We've switched away, so this should be a no-op, but intent matters */
1304 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1305 }
1306 
1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1308 {
1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1310 	ce->runtime.num_underflow++;
1311 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1312 #endif
1313 }
1314 
1315 static void intel_context_update_runtime(struct intel_context *ce)
1316 {
1317 	u32 old;
1318 	s32 dt;
1319 
1320 	if (intel_context_is_barrier(ce))
1321 		return;
1322 
1323 	old = ce->runtime.last;
1324 	ce->runtime.last = intel_context_get_runtime(ce);
1325 	dt = ce->runtime.last - old;
1326 
1327 	if (unlikely(dt < 0)) {
1328 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1329 			 old, ce->runtime.last, dt);
1330 		st_update_runtime_underflow(ce, dt);
1331 		return;
1332 	}
1333 
1334 	ewma_runtime_add(&ce->runtime.avg, dt);
1335 	ce->runtime.total += dt;
1336 }
1337 
1338 static inline struct intel_engine_cs *
1339 __execlists_schedule_in(struct i915_request *rq)
1340 {
1341 	struct intel_engine_cs * const engine = rq->engine;
1342 	struct intel_context * const ce = rq->context;
1343 
1344 	intel_context_get(ce);
1345 
1346 	if (unlikely(intel_context_is_banned(ce)))
1347 		reset_active(rq, engine);
1348 
1349 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1350 		execlists_check_context(ce, engine, "before");
1351 
1352 	if (ce->tag) {
1353 		/* Use a fixed tag for OA and friends */
1354 		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1355 		ce->lrc.ccid = ce->tag;
1356 	} else {
1357 		/* We don't need a strict matching tag, just different values */
1358 		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1359 
1360 		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1361 		clear_bit(tag - 1, &engine->context_tag);
1362 		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1363 
1364 		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1365 	}
1366 
1367 	ce->lrc.ccid |= engine->execlists.ccid;
1368 
1369 	__intel_gt_pm_get(engine->gt);
1370 	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1371 		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1372 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1373 	intel_engine_context_in(engine);
1374 
1375 	return engine;
1376 }
1377 
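/*
 * ce->inflight packs the engine on which the context is currently executing
 * together with a small count of its requests in flight on the hardware,
 * kept in the low pointer bits; only the first schedule-in and the final
 * schedule-out take the slow paths (__execlists_schedule_in/out).
 */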
1378 static inline struct i915_request *
1379 execlists_schedule_in(struct i915_request *rq, int idx)
1380 {
1381 	struct intel_context * const ce = rq->context;
1382 	struct intel_engine_cs *old;
1383 
1384 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1385 	trace_i915_request_in(rq, idx);
1386 
1387 	old = READ_ONCE(ce->inflight);
1388 	do {
1389 		if (!old) {
1390 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1391 			break;
1392 		}
1393 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1394 
1395 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1396 	return i915_request_get(rq);
1397 }
1398 
1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1400 {
1401 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1402 	struct i915_request *next = READ_ONCE(ve->request);
1403 
1404 	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1405 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1406 }
1407 
1408 static inline void
1409 __execlists_schedule_out(struct i915_request *rq,
1410 			 struct intel_engine_cs * const engine,
1411 			 unsigned int ccid)
1412 {
1413 	struct intel_context * const ce = rq->context;
1414 
1415 	/*
1416 	 * NB process_csb() is not under the engine->active.lock and hence
1417 	 * schedule_out can race with schedule_in meaning that we should
1418 	 * refrain from doing non-trivial work here.
1419 	 */
1420 
1421 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1422 		execlists_check_context(ce, engine, "after");
1423 
1424 	/*
1425 	 * If we have just completed this context, the engine may now be
1426 	 * idle and we want to re-enter powersaving.
1427 	 */
1428 	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1429 	    i915_request_completed(rq))
1430 		intel_engine_add_retire(engine, ce->timeline);
1431 
1432 	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1433 	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1434 	if (ccid < BITS_PER_LONG) {
1435 		GEM_BUG_ON(ccid == 0);
1436 		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1437 		set_bit(ccid - 1, &engine->context_tag);
1438 	}
1439 
1440 	intel_context_update_runtime(ce);
1441 	intel_engine_context_out(engine);
1442 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1443 	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1444 		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1445 	intel_gt_pm_put_async(engine->gt);
1446 
1447 	/*
1448 	 * If this is part of a virtual engine, its next request may
1449 	 * have been blocked waiting for access to the active context.
1450 	 * We have to kick all the siblings again in case we need to
1451 	 * switch (e.g. the next request is not runnable on this
1452 	 * engine). Hopefully, we will already have submitted the next
1453 	 * request before the tasklet runs and do not need to rebuild
1454 	 * each virtual tree and kick everyone again.
1455 	 */
1456 	if (ce->engine != engine)
1457 		kick_siblings(rq, ce);
1458 
1459 	intel_context_put(ce);
1460 }
1461 
1462 static inline void
1463 execlists_schedule_out(struct i915_request *rq)
1464 {
1465 	struct intel_context * const ce = rq->context;
1466 	struct intel_engine_cs *cur, *old;
1467 	u32 ccid;
1468 
1469 	trace_i915_request_out(rq);
1470 
1471 	ccid = rq->context->lrc.ccid;
1472 	old = READ_ONCE(ce->inflight);
1473 	do
1474 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1475 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1476 	if (!cur)
1477 		__execlists_schedule_out(rq, old, ccid);
1478 
1479 	i915_request_put(rq);
1480 }
1481 
1482 static u64 execlists_update_context(struct i915_request *rq)
1483 {
1484 	struct intel_context *ce = rq->context;
1485 	u64 desc = ce->lrc.desc;
1486 	u32 tail, prev;
1487 
1488 	/*
1489 	 * WaIdleLiteRestore:bdw,skl
1490 	 *
1491 	 * We should never submit the context with the same RING_TAIL twice
1492 	 * just in case we submit an empty ring, which confuses the HW.
1493 	 *
1494 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1495 	 * the normal request to be able to always advance the RING_TAIL on
1496 	 * subsequent resubmissions (for lite restore). Should that fail us,
1497 	 * and we try and submit the same tail again, force the context
1498 	 * reload.
1499 	 *
1500 	 * If we need to return to a preempted context, we need to skip the
1501 	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1502 	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1503 	 * an earlier request.
1504 	 */
1505 	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1506 	prev = rq->ring->tail;
1507 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1508 	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1509 		desc |= CTX_DESC_FORCE_RESTORE;
1510 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1511 	rq->tail = rq->wa_tail;
1512 
1513 	/*
1514 	 * Make sure the context image is complete before we submit it to HW.
1515 	 *
1516 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1517 	 * an uncached write such as our mmio register access, but the empirical
1518 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1519 	 * may not be visible to the HW prior to the completion of the UC
1520 	 * register write and that we may begin execution from the context
1521 	 * before its image is complete leading to invalid PD chasing.
1522 	 */
1523 	wmb();
1524 
1525 	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1526 	return desc;
1527 }
1528 
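/*
 * With a submit-queue control register (execlists->ctrl_reg) each
 * descriptor is written to its ELSQ port pair and later latched in one go
 * by EL_CTRL_LOAD (see execlists_submit_ports()); without it, the
 * descriptor is written directly to the ELSP, upper dword first.
 */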
1529 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1530 {
1531 	if (execlists->ctrl_reg) {
1532 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1533 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1534 	} else {
1535 		writel(upper_32_bits(desc), execlists->submit_reg);
1536 		writel(lower_32_bits(desc), execlists->submit_reg);
1537 	}
1538 }
1539 
1540 static __maybe_unused char *
1541 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1542 {
1543 	if (!rq)
1544 		return "";
1545 
1546 	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1547 		 prefix,
1548 		 rq->context->lrc.ccid,
1549 		 rq->fence.context, rq->fence.seqno,
1550 		 i915_request_completed(rq) ? "!" :
1551 		 i915_request_started(rq) ? "*" :
1552 		 "",
1553 		 rq_prio(rq));
1554 
1555 	return buf;
1556 }
1557 
1558 static __maybe_unused void
1559 trace_ports(const struct intel_engine_execlists *execlists,
1560 	    const char *msg,
1561 	    struct i915_request * const *ports)
1562 {
1563 	const struct intel_engine_cs *engine =
1564 		container_of(execlists, typeof(*engine), execlists);
1565 	char __maybe_unused p0[40], p1[40];
1566 
1567 	if (!ports[0])
1568 		return;
1569 
1570 	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1571 		     dump_port(p0, sizeof(p0), "", ports[0]),
1572 		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1573 }
1574 
1575 static inline bool
1576 reset_in_progress(const struct intel_engine_execlists *execlists)
1577 {
1578 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1579 }
1580 
1581 static __maybe_unused bool
1582 assert_pending_valid(const struct intel_engine_execlists *execlists,
1583 		     const char *msg)
1584 {
1585 	struct intel_engine_cs *engine =
1586 		container_of(execlists, typeof(*engine), execlists);
1587 	struct i915_request * const *port, *rq;
1588 	struct intel_context *ce = NULL;
1589 	bool sentinel = false;
1590 	u32 ccid = -1;
1591 
1592 	trace_ports(execlists, msg, execlists->pending);
1593 
1594 	/* We may be messing around with the lists during reset, lalala */
1595 	if (reset_in_progress(execlists))
1596 		return true;
1597 
1598 	if (!execlists->pending[0]) {
1599 		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1600 			      engine->name);
1601 		return false;
1602 	}
1603 
1604 	if (execlists->pending[execlists_num_ports(execlists)]) {
1605 		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1606 			      engine->name, execlists_num_ports(execlists));
1607 		return false;
1608 	}
1609 
1610 	for (port = execlists->pending; (rq = *port); port++) {
1611 		unsigned long flags;
1612 		bool ok = true;
1613 
1614 		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1615 		GEM_BUG_ON(!i915_request_is_active(rq));
1616 
1617 		if (ce == rq->context) {
1618 			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1619 				      engine->name,
1620 				      ce->timeline->fence_context,
1621 				      port - execlists->pending);
1622 			return false;
1623 		}
1624 		ce = rq->context;
1625 
1626 		if (ccid == ce->lrc.ccid) {
1627 			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1628 				      engine->name,
1629 				      ccid, ce->timeline->fence_context,
1630 				      port - execlists->pending);
1631 			return false;
1632 		}
1633 		ccid = ce->lrc.ccid;
1634 
1635 		/*
1636 		 * Sentinels are supposed to be the last request so they flush
1637 		 * the current execution off the HW. Check that they are the only
1638 		 * request in the pending submission.
1639 		 */
1640 		if (sentinel) {
1641 			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1642 				      engine->name,
1643 				      ce->timeline->fence_context,
1644 				      port - execlists->pending);
1645 			return false;
1646 		}
1647 		sentinel = i915_request_has_sentinel(rq);
1648 
1649 		/* Hold tightly onto the lock to prevent concurrent retires! */
1650 		if (!spin_trylock_irqsave(&rq->lock, flags))
1651 			continue;
1652 
1653 		if (i915_request_completed(rq))
1654 			goto unlock;
1655 
1656 		if (i915_active_is_idle(&ce->active) &&
1657 		    !intel_context_is_barrier(ce)) {
1658 			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1659 				      engine->name,
1660 				      ce->timeline->fence_context,
1661 				      port - execlists->pending);
1662 			ok = false;
1663 			goto unlock;
1664 		}
1665 
1666 		if (!i915_vma_is_pinned(ce->state)) {
1667 			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1668 				      engine->name,
1669 				      ce->timeline->fence_context,
1670 				      port - execlists->pending);
1671 			ok = false;
1672 			goto unlock;
1673 		}
1674 
1675 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1676 			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1677 				      engine->name,
1678 				      ce->timeline->fence_context,
1679 				      port - execlists->pending);
1680 			ok = false;
1681 			goto unlock;
1682 		}
1683 
1684 unlock:
1685 		spin_unlock_irqrestore(&rq->lock, flags);
1686 		if (!ok)
1687 			return false;
1688 	}
1689 
1690 	return ce;
1691 }
1692 
1693 static void execlists_submit_ports(struct intel_engine_cs *engine)
1694 {
1695 	struct intel_engine_execlists *execlists = &engine->execlists;
1696 	unsigned int n;
1697 
1698 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1699 
1700 	/*
1701 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1702 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1703 	 * not be relinquished until the device is idle (see
1704 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1705 	 * that all ELSP are drained i.e. we have processed the CSB,
1706 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1707 	 */
1708 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1709 
1710 	/*
1711 	 * ELSQ note: the submit queue is not cleared after being submitted
1712 	 * to the HW so we need to make sure we always clean it up. This is
1713 	 * currently ensured by the fact that we always write the same number
1714 	 * of elsq entries, keep this in mind before changing the loop below.
1715 	 */
1716 	for (n = execlists_num_ports(execlists); n--; ) {
1717 		struct i915_request *rq = execlists->pending[n];
1718 
1719 		write_desc(execlists,
1720 			   rq ? execlists_update_context(rq) : 0,
1721 			   n);
1722 	}
1723 
1724 	/* we need to manually load the submit queue */
1725 	if (execlists->ctrl_reg)
1726 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1727 }
1728 
1729 static bool ctx_single_port_submission(const struct intel_context *ce)
1730 {
1731 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1732 		intel_context_force_single_submission(ce));
1733 }
1734 
1735 static bool can_merge_ctx(const struct intel_context *prev,
1736 			  const struct intel_context *next)
1737 {
1738 	if (prev != next)
1739 		return false;
1740 
1741 	if (ctx_single_port_submission(prev))
1742 		return false;
1743 
1744 	return true;
1745 }
1746 
1747 static unsigned long i915_request_flags(const struct i915_request *rq)
1748 {
1749 	return READ_ONCE(rq->fence.flags);
1750 }
1751 
1752 static bool can_merge_rq(const struct i915_request *prev,
1753 			 const struct i915_request *next)
1754 {
1755 	GEM_BUG_ON(prev == next);
1756 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1757 
1758 	/*
1759 	 * We do not submit known completed requests. Therefore if the next
1760 	 * request is already completed, we can pretend to merge it in
1761 	 * with the previous context (and we will skip updating the ELSP
1762 	 * and tracking). Thus hopefully keeping the ELSP full with active
1763 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1764 	 * us.
1765 	 */
1766 	if (i915_request_completed(next))
1767 		return true;
1768 
1769 	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1770 		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1771 		      BIT(I915_FENCE_FLAG_SENTINEL))))
1772 		return false;
1773 
1774 	if (!can_merge_ctx(prev->context, next->context))
1775 		return false;
1776 
1777 	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1778 	return true;
1779 }
1780 
1781 static void virtual_update_register_offsets(u32 *regs,
1782 					    struct intel_engine_cs *engine)
1783 {
1784 	set_offsets(regs, reg_offsets(engine), engine, false);
1785 }
1786 
1787 static bool virtual_matches(const struct virtual_engine *ve,
1788 			    const struct i915_request *rq,
1789 			    const struct intel_engine_cs *engine)
1790 {
1791 	const struct intel_engine_cs *inflight;
1792 
1793 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1794 		return false;
1795 
1796 	/*
1797 	 * We track when the HW has completed saving the context image
1798 	 * (i.e. when we have seen the final CS event switching out of
1799 	 * the context) and must not overwrite the context image before
1800 	 * then. This restricts us to only using the active engine
1801 	 * while the previous virtualized request is inflight (so
1802 	 * we reuse the register offsets). This is a very small
1803 	 * hysteresis on the greedy selection algorithm.
1804 	 */
1805 	inflight = intel_context_inflight(&ve->context);
1806 	if (inflight && inflight != engine)
1807 		return false;
1808 
1809 	return true;
1810 }
1811 
1812 static void virtual_xfer_context(struct virtual_engine *ve,
1813 				 struct intel_engine_cs *engine)
1814 {
1815 	unsigned int n;
1816 
1817 	if (likely(engine == ve->siblings[0]))
1818 		return;
1819 
1820 	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1821 	if (!intel_engine_has_relative_mmio(engine))
1822 		virtual_update_register_offsets(ve->context.lrc_reg_state,
1823 						engine);
1824 
1825 	/*
1826 	 * Move the bound engine to the top of the list for
1827 	 * future execution. We then kick this tasklet first
1828 	 * before checking others, so that we preferentially
1829 	 * reuse this set of bound registers.
1830 	 */
1831 	for (n = 1; n < ve->num_siblings; n++) {
1832 		if (ve->siblings[n] == engine) {
1833 			swap(ve->siblings[n], ve->siblings[0]);
1834 			break;
1835 		}
1836 	}
1837 }
1838 
1839 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1840 {
1841 	LIST_HEAD(list);
1842 
1843 	/*
1844 	 * We want to move the interrupted request to the back of
1845 	 * the round-robin list (i.e. its priority level), but
1846 	 * in doing so, we must then move all requests that were in
1847 	 * flight and waiting on the interrupted request so that
1848 	 * they are run after it again.
1849 	 */
1850 	do {
1851 		struct i915_dependency *p;
1852 
1853 		GEM_BUG_ON(i915_request_is_active(rq));
1854 		list_move_tail(&rq->sched.link, pl);
1855 
1856 		for_each_waiter(p, rq) {
1857 			struct i915_request *w =
1858 				container_of(p->waiter, typeof(*w), sched);
1859 
1860 			if (p->flags & I915_DEPENDENCY_WEAK)
1861 				continue;
1862 
1863 			/* Leave semaphores spinning on the other engines */
1864 			if (w->engine != rq->engine)
1865 				continue;
1866 
1867 			/* No waiter should start before its signaler */
1868 			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1869 				   i915_request_started(w) &&
1870 				   !i915_request_completed(rq));
1871 
1872 			GEM_BUG_ON(i915_request_is_active(w));
1873 			if (!i915_request_is_ready(w))
1874 				continue;
1875 
1876 			if (rq_prio(w) < rq_prio(rq))
1877 				continue;
1878 
1879 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1880 			list_move_tail(&w->sched.link, &list);
1881 		}
1882 
1883 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1884 	} while (rq);
1885 }
1886 
1887 static void defer_active(struct intel_engine_cs *engine)
1888 {
1889 	struct i915_request *rq;
1890 
1891 	rq = __unwind_incomplete_requests(engine);
1892 	if (!rq)
1893 		return;
1894 
1895 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1896 }
1897 
1898 static bool
1899 need_timeslice(const struct intel_engine_cs *engine,
1900 	       const struct i915_request *rq,
1901 	       const struct rb_node *rb)
1902 {
1903 	int hint;
1904 
1905 	if (!intel_engine_has_timeslices(engine))
1906 		return false;
1907 
1908 	hint = engine->execlists.queue_priority_hint;
1909 
1910 	if (rb) {
1911 		const struct virtual_engine *ve =
1912 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1913 		const struct intel_engine_cs *inflight =
1914 			intel_context_inflight(&ve->context);
1915 
1916 		if (!inflight || inflight == engine) {
1917 			struct i915_request *next;
1918 
1919 			rcu_read_lock();
1920 			next = READ_ONCE(ve->request);
1921 			if (next)
1922 				hint = max(hint, rq_prio(next));
1923 			rcu_read_unlock();
1924 		}
1925 	}
1926 
1927 	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1928 		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1929 
1930 	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1931 	return hint >= effective_prio(rq);
1932 }
1933 
1934 static bool
1935 timeslice_yield(const struct intel_engine_execlists *el,
1936 		const struct i915_request *rq)
1937 {
1938 	/*
1939 	 * Once bitten, forever smitten!
1940 	 *
1941 	 * If the active context ever busy-waited on a semaphore,
1942 	 * it will be treated as a hog until the end of its timeslice (i.e.
1943 	 * until it is scheduled out and replaced by a new submission,
1944 	 * possibly even its own lite-restore). The HW only sends an interrupt
1945 	 * on the first miss, and we do not know if that semaphore has been
1946 	 * signaled, or even if it is now stuck on another semaphore. Play
1947 	 * safe, yield if it might be stuck -- it will be given a fresh
1948 	 * timeslice in the near future.
1949 	 */
1950 	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1951 }
1952 
1953 static bool
1954 timeslice_expired(const struct intel_engine_execlists *el,
1955 		  const struct i915_request *rq)
1956 {
1957 	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1958 }
1959 
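/*
 * Priority of the next request on the engine after @rq, or the queue hint
 * if @rq is the last request submitted to the HW.
 */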
1960 static int
1961 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1962 {
1963 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1964 		return engine->execlists.queue_priority_hint;
1965 
1966 	return rq_prio(list_next_entry(rq, sched.link));
1967 }
1968 
1969 static inline unsigned long
1970 timeslice(const struct intel_engine_cs *engine)
1971 {
1972 	return READ_ONCE(engine->props.timeslice_duration_ms);
1973 }
1974 
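/*
 * Timeslice for the currently executing context: 0 (no timer) if nothing is
 * running, the active request has already completed, or the waiting work is
 * of lower priority than the running request.
 */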
1975 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1976 {
1977 	const struct intel_engine_execlists *execlists = &engine->execlists;
1978 	const struct i915_request *rq = *execlists->active;
1979 
1980 	if (!rq || i915_request_completed(rq))
1981 		return 0;
1982 
1983 	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1984 		return 0;
1985 
1986 	return timeslice(engine);
1987 }
1988 
1989 static void set_timeslice(struct intel_engine_cs *engine)
1990 {
1991 	unsigned long duration;
1992 
1993 	if (!intel_engine_has_timeslices(engine))
1994 		return;
1995 
1996 	duration = active_timeslice(engine);
1997 	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
1998 
1999 	set_timer_ms(&engine->execlists.timer, duration);
2000 }
2001 
2002 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2003 {
2004 	struct intel_engine_execlists *execlists = &engine->execlists;
2005 	unsigned long duration;
2006 
2007 	if (!intel_engine_has_timeslices(engine))
2008 		return;
2009 
2010 	WRITE_ONCE(execlists->switch_priority_hint, prio);
2011 	if (prio == INT_MIN)
2012 		return;
2013 
2014 	if (timer_pending(&execlists->timer))
2015 		return;
2016 
2017 	duration = timeslice(engine);
2018 	ENGINE_TRACE(engine,
2019 		     "start timeslicing, prio:%d, interval:%lu",
2020 		     prio, duration);
2021 
2022 	set_timer_ms(&execlists->timer, duration);
2023 }
2024 
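/* Count preemption events, only for the benefit of the selftests */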
2025 static void record_preemption(struct intel_engine_execlists *execlists)
2026 {
2027 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2028 }
2029 
2030 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2031 					    const struct i915_request *rq)
2032 {
2033 	if (!rq)
2034 		return 0;
2035 
2036 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2037 	if (unlikely(intel_context_is_banned(rq->context)))
2038 		return 1;
2039 
2040 	return READ_ONCE(engine->props.preempt_timeout_ms);
2041 }
2042 
2043 static void set_preempt_timeout(struct intel_engine_cs *engine,
2044 				const struct i915_request *rq)
2045 {
2046 	if (!intel_engine_has_preempt_reset(engine))
2047 		return;
2048 
2049 	set_timer_ms(&engine->execlists.preempt,
2050 		     active_preempt_timeout(engine, rq));
2051 }
2052 
2053 static inline void clear_ports(struct i915_request **ports, int count)
2054 {
2055 	memset_p((void **)ports, NULL, count);
2056 }
2057 
2058 static inline void
2059 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2060 {
2061 	/* A memcpy_p() would be very useful here! */
2062 	while (count--)
2063 		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2064 }
2065 
2066 static void execlists_dequeue(struct intel_engine_cs *engine)
2067 {
2068 	struct intel_engine_execlists * const execlists = &engine->execlists;
2069 	struct i915_request **port = execlists->pending;
2070 	struct i915_request ** const last_port = port + execlists->port_mask;
2071 	struct i915_request * const *active;
2072 	struct i915_request *last;
2073 	struct rb_node *rb;
2074 	bool submit = false;
2075 
2076 	/*
2077 	 * Hardware submission is through 2 ports. Conceptually each port
2078 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2079 	 * static for a context, and unique to each, so we only execute
2080 	 * requests belonging to a single context from each ring. RING_HEAD
2081 	 * is maintained by the CS in the context image, it marks the place
2082 	 * where it got up to last time, and through RING_TAIL we tell the CS
2083 	 * where we want to execute up to this time.
2084 	 *
2085 	 * In this list the requests are in order of execution. Consecutive
2086 	 * requests from the same context are adjacent in the ringbuffer. We
2087 	 * can combine these requests into a single RING_TAIL update:
2088 	 *
2089 	 *              RING_HEAD...req1...req2
2090 	 *                                    ^- RING_TAIL
2091 	 * since to execute req2 the CS must first execute req1.
2092 	 *
2093 	 * Our goal then is to point each port to the end of a consecutive
2094 	 * sequence of requests, as that gives the optimal (fewest wake ups
2095 	 * and context switches) submission.
2096 	 */
2097 
2098 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2099 		struct virtual_engine *ve =
2100 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2101 		struct i915_request *rq = READ_ONCE(ve->request);
2102 
2103 		if (!rq) { /* lazily cleanup after another engine handled rq */
2104 			rb_erase_cached(rb, &execlists->virtual);
2105 			RB_CLEAR_NODE(rb);
2106 			rb = rb_first_cached(&execlists->virtual);
2107 			continue;
2108 		}
2109 
2110 		if (!virtual_matches(ve, rq, engine)) {
2111 			rb = rb_next(rb);
2112 			continue;
2113 		}
2114 
2115 		break;
2116 	}
2117 
2118 	/*
2119 	 * If the queue is higher priority than the last
2120 	 * request in the currently active context, submit afresh.
2121 	 * We will resubmit again afterwards in case we need to split
2122 	 * the active context to interject the preemption request,
2123 	 * i.e. we will retrigger preemption following the ack in case
2124 	 * of trouble.
2125 	 */
2126 	active = READ_ONCE(execlists->active);
2127 
2128 	/*
2129 	 * In theory we can skip over completed contexts that have not
2130 	 * yet been processed by events (as those events are in flight):
2131 	 *
2132 	 * while ((last = *active) && i915_request_completed(last))
2133 	 *	active++;
2134 	 *
2135 	 * However, the GPU cannot handle this as it will ultimately
2136 	 * find itself trying to jump back into a context it has just
2137 	 * completed and barf.
2138 	 */
2139 
2140 	if ((last = *active)) {
2141 		if (need_preempt(engine, last, rb)) {
2142 			if (i915_request_completed(last)) {
2143 				tasklet_hi_schedule(&execlists->tasklet);
2144 				return;
2145 			}
2146 
2147 			ENGINE_TRACE(engine,
2148 				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2149 				     last->fence.context,
2150 				     last->fence.seqno,
2151 				     last->sched.attr.priority,
2152 				     execlists->queue_priority_hint);
2153 			record_preemption(execlists);
2154 
2155 			/*
2156 			 * Don't let the RING_HEAD advance past the breadcrumb
2157 			 * as we unwind (and until we resubmit) so that we do
2158 			 * not accidentally tell it to go backwards.
2159 			 */
2160 			ring_set_paused(engine, 1);
2161 
2162 			/*
2163 			 * Note that we have not stopped the GPU at this point,
2164 			 * so we are unwinding the incomplete requests as they
2165 			 * remain inflight and so by the time we do complete
2166 			 * the preemption, some of the unwound requests may
2167 			 * complete!
2168 			 */
2169 			__unwind_incomplete_requests(engine);
2170 
2171 			last = NULL;
2172 		} else if (need_timeslice(engine, last, rb) &&
2173 			   timeslice_expired(execlists, last)) {
2174 			if (i915_request_completed(last)) {
2175 				tasklet_hi_schedule(&execlists->tasklet);
2176 				return;
2177 			}
2178 
2179 			ENGINE_TRACE(engine,
2180 				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2181 				     last->fence.context,
2182 				     last->fence.seqno,
2183 				     last->sched.attr.priority,
2184 				     execlists->queue_priority_hint,
2185 				     yesno(timeslice_yield(execlists, last)));
2186 
2187 			ring_set_paused(engine, 1);
2188 			defer_active(engine);
2189 
2190 			/*
2191 			 * Unlike for preemption, if we rewind and continue
2192 			 * executing the same context as previously active,
2193 			 * the order of execution will remain the same and
2194 			 * the tail will only advance. We do not need to
2195 			 * force a full context restore, as a lite-restore
2196 			 * is sufficient to resample the monotonic TAIL.
2197 			 *
2198 			 * If we switch to any other context, similarly we
2199 			 * will not rewind TAIL of current context, and
2200 			 * normal save/restore will preserve state and allow
2201 			 * us to later continue executing the same request.
2202 			 */
2203 			last = NULL;
2204 		} else {
2205 			/*
2206 			 * Otherwise if we already have a request pending
2207 			 * for execution after the current one, we can
2208 			 * just wait until the next CS event before
2209 			 * queuing more. In either case we will force a
2210 			 * lite-restore preemption event, but if we wait
2211 			 * we hopefully coalesce several updates into a single
2212 			 * submission.
2213 			 */
2214 			if (!list_is_last(&last->sched.link,
2215 					  &engine->active.requests)) {
2216 				/*
2217 				 * Even if ELSP[1] is occupied and not worthy
2218 				 * of timeslices, our queue might be.
2219 				 */
2220 				start_timeslice(engine, queue_prio(execlists));
2221 				return;
2222 			}
2223 		}
2224 	}
2225 
2226 	while (rb) { /* XXX virtual is always taking precedence */
2227 		struct virtual_engine *ve =
2228 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2229 		struct i915_request *rq;
2230 
2231 		spin_lock(&ve->base.active.lock);
2232 
2233 		rq = ve->request;
2234 		if (unlikely(!rq)) { /* lost the race to a sibling */
2235 			spin_unlock(&ve->base.active.lock);
2236 			rb_erase_cached(rb, &execlists->virtual);
2237 			RB_CLEAR_NODE(rb);
2238 			rb = rb_first_cached(&execlists->virtual);
2239 			continue;
2240 		}
2241 
2242 		GEM_BUG_ON(rq != ve->request);
2243 		GEM_BUG_ON(rq->engine != &ve->base);
2244 		GEM_BUG_ON(rq->context != &ve->context);
2245 
2246 		if (rq_prio(rq) >= queue_prio(execlists)) {
2247 			if (!virtual_matches(ve, rq, engine)) {
2248 				spin_unlock(&ve->base.active.lock);
2249 				rb = rb_next(rb);
2250 				continue;
2251 			}
2252 
2253 			if (last && !can_merge_rq(last, rq)) {
2254 				spin_unlock(&ve->base.active.lock);
2255 				start_timeslice(engine, rq_prio(rq));
2256 				return; /* leave this for another sibling */
2257 			}
2258 
2259 			ENGINE_TRACE(engine,
2260 				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2261 				     rq->fence.context,
2262 				     rq->fence.seqno,
2263 				     i915_request_completed(rq) ? "!" :
2264 				     i915_request_started(rq) ? "*" :
2265 				     "",
2266 				     yesno(engine != ve->siblings[0]));
2267 
2268 			WRITE_ONCE(ve->request, NULL);
2269 			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2270 				   INT_MIN);
2271 			rb_erase_cached(rb, &execlists->virtual);
2272 			RB_CLEAR_NODE(rb);
2273 
2274 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2275 			WRITE_ONCE(rq->engine, engine);
2276 
2277 			if (__i915_request_submit(rq)) {
2278 				/*
2279 				 * Only after we confirm that we will submit
2280 				 * this request (i.e. it has not already
2281 				 * completed), do we want to update the context.
2282 				 *
2283 				 * This serves two purposes. It avoids
2284 				 * unnecessary work if we are resubmitting an
2285 				 * already completed request after timeslicing.
2286 				 * But more importantly, it prevents us altering
2287 				 * ve->siblings[] on an idle context, where
2288 				 * we may be using ve->siblings[] in
2289 				 * virtual_context_enter / virtual_context_exit.
2290 				 */
2291 				virtual_xfer_context(ve, engine);
2292 				GEM_BUG_ON(ve->siblings[0] != engine);
2293 
2294 				submit = true;
2295 				last = rq;
2296 			}
2297 			i915_request_put(rq);
2298 
2299 			/*
2300 			 * Hmm, we have a bunch of virtual engine requests,
2301 			 * but the first one was already completed (thanks
2302 			 * preempt-to-busy!). Keep looking at the veng queue
2303 			 * until we have no more relevant requests (i.e.
2304 			 * the normal submit queue has higher priority).
2305 			 */
2306 			if (!submit) {
2307 				spin_unlock(&ve->base.active.lock);
2308 				rb = rb_first_cached(&execlists->virtual);
2309 				continue;
2310 			}
2311 		}
2312 
2313 		spin_unlock(&ve->base.active.lock);
2314 		break;
2315 	}
2316 
2317 	while ((rb = rb_first_cached(&execlists->queue))) {
2318 		struct i915_priolist *p = to_priolist(rb);
2319 		struct i915_request *rq, *rn;
2320 		int i;
2321 
2322 		priolist_for_each_request_consume(rq, rn, p, i) {
2323 			bool merge = true;
2324 
2325 			/*
2326 			 * Can we combine this request with the current port?
2327 			 * It has to be the same context/ringbuffer and not
2328 			 * have any exceptions (e.g. GVT saying never to
2329 			 * combine contexts).
2330 			 *
2331 			 * If we can combine the requests, we can execute both
2332 			 * by updating the RING_TAIL to point to the end of the
2333 			 * second request, and so we never need to tell the
2334 			 * hardware about the first.
2335 			 */
2336 			if (last && !can_merge_rq(last, rq)) {
2337 				/*
2338 				 * If we are on the second port and cannot
2339 				 * combine this request with the last, then we
2340 				 * are done.
2341 				 */
2342 				if (port == last_port)
2343 					goto done;
2344 
2345 				/*
2346 				 * We must not populate both ELSP[] with the
2347 				 * same LRCA, i.e. we must submit 2 different
2348 				 * contexts if we submit 2 ELSP.
2349 				 */
2350 				if (last->context == rq->context)
2351 					goto done;
2352 
2353 				if (i915_request_has_sentinel(last))
2354 					goto done;
2355 
2356 				/*
2357 				 * If GVT overrides us we only ever submit
2358 				 * port[0], leaving port[1] empty. Note that we
2359 				 * also have to be careful that we don't queue
2360 				 * the same context (even though a different
2361 				 * request) to the second port.
2362 				 */
2363 				if (ctx_single_port_submission(last->context) ||
2364 				    ctx_single_port_submission(rq->context))
2365 					goto done;
2366 
2367 				merge = false;
2368 			}
2369 
2370 			if (__i915_request_submit(rq)) {
2371 				if (!merge) {
2372 					*port = execlists_schedule_in(last, port - execlists->pending);
2373 					port++;
2374 					last = NULL;
2375 				}
2376 
2377 				GEM_BUG_ON(last &&
2378 					   !can_merge_ctx(last->context,
2379 							  rq->context));
2380 				GEM_BUG_ON(last &&
2381 					   i915_seqno_passed(last->fence.seqno,
2382 							     rq->fence.seqno));
2383 
2384 				submit = true;
2385 				last = rq;
2386 			}
2387 		}
2388 
2389 		rb_erase_cached(&p->node, &execlists->queue);
2390 		i915_priolist_free(p);
2391 	}
2392 
2393 done:
2394 	/*
2395 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2396 	 *
2397 	 * We choose the priority hint such that if we add a request of greater
2398 	 * priority than this, we kick the submission tasklet to decide on
2399 	 * the right order of submitting the requests to hardware. We must
2400 	 * also be prepared to reorder requests as they are in-flight on the
2401 	 * HW. We derive the priority hint then as the first "hole" in
2402 	 * the HW submission ports and if there are no available slots,
2403 	 * the priority of the lowest executing request, i.e. last.
2404 	 *
2405 	 * When we do receive a higher priority request ready to run from the
2406 	 * user, see queue_request(), the priority hint is bumped to that
2407 	 * request triggering preemption on the next dequeue (or subsequent
2408 	 * interrupt for secondary ports).
2409 	 */
2410 	execlists->queue_priority_hint = queue_prio(execlists);
2411 
2412 	if (submit) {
2413 		*port = execlists_schedule_in(last, port - execlists->pending);
2414 		execlists->switch_priority_hint =
2415 			switch_prio(engine, *execlists->pending);
2416 
2417 		/*
2418 		 * Skip if we ended up with exactly the same set of requests,
2419 		 * e.g. trying to timeslice a pair of ordered contexts
2420 		 */
2421 		if (!memcmp(active, execlists->pending,
2422 			    (port - execlists->pending + 1) * sizeof(*port))) {
2423 			do
2424 				execlists_schedule_out(fetch_and_zero(port));
2425 			while (port-- != execlists->pending);
2426 
2427 			goto skip_submit;
2428 		}
2429 		clear_ports(port + 1, last_port - port);
2430 
2431 		WRITE_ONCE(execlists->yield, -1);
2432 		set_preempt_timeout(engine, *active);
2433 		execlists_submit_ports(engine);
2434 	} else {
2435 		start_timeslice(engine, execlists->queue_priority_hint);
2436 skip_submit:
2437 		ring_set_paused(engine, 0);
2438 	}
2439 }
2440 
2441 static void
2442 cancel_port_requests(struct intel_engine_execlists * const execlists)
2443 {
2444 	struct i915_request * const *port;
2445 
2446 	for (port = execlists->pending; *port; port++)
2447 		execlists_schedule_out(*port);
2448 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2449 
2450 	/* Mark the end of active before we overwrite *active */
2451 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2452 		execlists_schedule_out(*port);
2453 	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2454 
2455 	smp_wmb(); /* complete the seqlock for execlists_active() */
2456 	WRITE_ONCE(execlists->active, execlists->inflight);
2457 }
2458 
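/* Evict the CSB entries from the CPU caches so the next read is fresh */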
2459 static inline void
2460 invalidate_csb_entries(const u64 *first, const u64 *last)
2461 {
2462 	clflush((void *)first);
2463 	clflush((void *)last);
2464 }
2465 
2466 /*
2467  * Starting with Gen12, the status has a new format:
2468  *
2469  *     bit  0:     switched to new queue
2470  *     bit  1:     reserved
2471  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2472  *                 switch detail is set to "wait on semaphore"
2473  *     bits 3-5:   engine class
2474  *     bits 6-11:  engine instance
2475  *     bits 12-14: reserved
2476  *     bits 15-25: sw context id of the lrc the GT switched to
2477  *     bits 26-31: sw counter of the lrc the GT switched to
2478  *     bits 32-35: context switch detail
2479  *                  - 0: ctx complete
2480  *                  - 1: wait on sync flip
2481  *                  - 2: wait on vblank
2482  *                  - 3: wait on scanline
2483  *                  - 4: wait on semaphore
2484  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2485  *                       WAIT_FOR_EVENT)
2486  *     bit  36:    reserved
2487  *     bits 37-43: wait detail (for switch detail 1 to 4)
2488  *     bits 44-46: reserved
2489  *     bits 47-57: sw context id of the lrc the GT switched away from
2490  *     bits 58-63: sw counter of the lrc the GT switched away from
2491  */
2492 static inline bool gen12_csb_parse(const u64 csb)
2493 {
2494 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
2495 	bool new_queue =
2496 		lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2497 
2498 	/*
2499 	 * The context switch detail is not guaranteed to be 5 when a preemption
2500 	 * occurs, so we can't just check for that. The check below works for
2501 	 * all the cases we care about, including preemptions of WAIT
2502 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2503 	 * would require some extra handling, but we don't support that.
2504 	 */
2505 	if (!ctx_away_valid || new_queue) {
2506 		GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
2507 		return true;
2508 	}
2509 
2510 	/*
2511 	 * switch detail = 5 is covered by the case above and we do not expect a
2512 	 * context switch on an unsuccessful wait instruction since we always
2513 	 * use polling mode.
2514 	 */
2515 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
2516 	return false;
2517 }
2518 
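/* Gen8-11: promote on an idle->active transition or an explicit preemption */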
2519 static inline bool gen8_csb_parse(const u64 csb)
2520 {
2521 	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2522 }
2523 
2524 static noinline u64
2525 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2526 {
2527 	u64 entry;
2528 
2529 	/*
2530 	 * Reading from the HWSP has one particular advantage: we can detect
2531 	 * a stale entry. Since the write into HWSP is broken, we have no reason
2532 	 * to trust the HW at all; the mmio entry may equally be unordered, so
2533 	 * we prefer the path that is self-checking and, as a last resort,
2534 	 * return the mmio value.
2535 	 *
2536 	 * tgl,dg1:HSDES#22011327657
2537 	 */
2538 	preempt_disable();
2539 	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) {
2540 		int idx = csb - engine->execlists.csb_status;
2541 		int status;
2542 
2543 		status = GEN8_EXECLISTS_STATUS_BUF;
2544 		if (idx >= 6) {
2545 			status = GEN11_EXECLISTS_STATUS_BUF2;
2546 			idx -= 6;
2547 		}
2548 		status += sizeof(u64) * idx;
2549 
2550 		entry = intel_uncore_read64(engine->uncore,
2551 					    _MMIO(engine->mmio_base + status));
2552 	}
2553 	preempt_enable();
2554 
2555 	return entry;
2556 }
2557 
2558 static inline u64
2559 csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2560 {
2561 	u64 entry = READ_ONCE(*csb);
2562 
2563 	/*
2564 	 * Unfortunately, the GPU does not always serialise its write
2565 	 * of the CSB entries before its write of the CSB pointer, at least
2566 	 * from the perspective of the CPU, using what is known as a Global
2567 	 * Observation Point. We may read a new CSB tail pointer, but then
2568 	 * read the stale CSB entries, causing us to misinterpret the
2569 	 * context-switch events, and eventually declare the GPU hung.
2570 	 *
2571 	 * icl:HSDES#1806554093
2572 	 * tgl:HSDES#22011248461
2573 	 */
2574 	if (unlikely(entry == -1))
2575 		entry = wa_csb_read(engine, csb);
2576 
2577 	/* Consume this entry so that we can spot its future reuse. */
2578 	WRITE_ONCE(*csb, -1);
2579 
2580 	/* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
2581 	return entry;
2582 }
2583 
2584 static void process_csb(struct intel_engine_cs *engine)
2585 {
2586 	struct intel_engine_execlists * const execlists = &engine->execlists;
2587 	u64 * const buf = execlists->csb_status;
2588 	const u8 num_entries = execlists->csb_size;
2589 	u8 head, tail;
2590 
2591 	/*
2592 	 * As we modify our execlists state tracking we require exclusive
2593 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2594 	 * and we assume that is only inside the reset paths and so serialised.
2595 	 */
2596 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2597 		   !reset_in_progress(execlists));
2598 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2599 
2600 	/*
2601 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2602 	 * When reading from the csb_write mmio register, we have to be
2603 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2604 	 * the low 4 bits. As it happens we know the next 4 bits are always
2605 	 * zero and so we can simply mask off the low u8 of the register
2606 	 * and treat it identically to reading from the HWSP (without having
2607 	 * to use explicit shifting and masking, and probably bifurcating
2608 	 * the code to handle the legacy mmio read).
2609 	 */
2610 	head = execlists->csb_head;
2611 	tail = READ_ONCE(*execlists->csb_write);
2612 	if (unlikely(head == tail))
2613 		return;
2614 
2615 	/*
2616 	 * We will consume all events from HW, or at least pretend to.
2617 	 *
2618 	 * The sequence of events from the HW is deterministic, and derived
2619 	 * from our writes to the ELSP, with a smidgen of variability for
2620 	 * the arrival of the asynchronous requests wrt the inflight
2621 	 * execution. If the HW sends an event that does not correspond with
2622 	 * the one we are expecting, we have to abandon all hope as we lose
2623 	 * all tracking of what the engine is actually executing. We will
2624 	 * only detect we are out of sequence with the HW when we get an
2625 	 * 'impossible' event because we have already drained our own
2626 	 * preemption/promotion queue. If this occurs, we know that we likely
2627 	 * lost track of execution earlier and must unwind and restart, the
2628 	 * lost track of execution earlier and must unwind and restart; the
2629 	 * simplest way is to stop processing the event queue and force the
2630 	 */
2631 	execlists->csb_head = tail;
2632 	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2633 
2634 	/*
2635 	 * Hopefully paired with a wmb() in HW!
2636 	 *
2637 	 * We must complete the read of the write pointer before any reads
2638 	 * from the CSB, so that we do not see stale values. Without an rmb
2639 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2640 	 * we perform the READ_ONCE(*csb_write).
2641 	 */
2642 	rmb();
2643 	do {
2644 		bool promote;
2645 		u64 csb;
2646 
2647 		if (++head == num_entries)
2648 			head = 0;
2649 
2650 		/*
2651 		 * We are flying near dragons again.
2652 		 *
2653 		 * We hold a reference to the request in execlist_port[]
2654 		 * but no more than that. We are operating in softirq
2655 		 * context and so cannot hold any mutex or sleep. That
2656 		 * prevents us from stopping the requests we are processing
2657 		 * in port[] from being retired simultaneously (the
2658 		 * breadcrumb will be complete before we see the
2659 		 * context-switch). As we only hold the reference to the
2660 		 * request, any pointer chasing underneath the request
2661 		 * is subject to a potential use-after-free. Thus we
2662 		 * store all of the bookkeeping within port[] as
2663 		 * required, and avoid using unguarded pointers beneath
2664 		 * request itself. The same applies to the atomic
2665 		 * status notifier.
2666 		 */
2667 
2668 		csb = csb_read(engine, buf + head);
2669 		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2670 			     head, upper_32_bits(csb), lower_32_bits(csb));
2671 
2672 		if (INTEL_GEN(engine->i915) >= 12)
2673 			promote = gen12_csb_parse(csb);
2674 		else
2675 			promote = gen8_csb_parse(csb);
2676 		if (promote) {
2677 			struct i915_request * const *old = execlists->active;
2678 
2679 			if (GEM_WARN_ON(!*execlists->pending)) {
2680 				execlists->error_interrupt |= ERROR_CSB;
2681 				break;
2682 			}
2683 
2684 			ring_set_paused(engine, 0);
2685 
2686 			/* Point active to the new ELSP; prevent overwriting */
2687 			WRITE_ONCE(execlists->active, execlists->pending);
2688 			smp_wmb(); /* notify execlists_active() */
2689 
2690 			/* cancel old inflight, prepare for switch */
2691 			trace_ports(execlists, "preempted", old);
2692 			while (*old)
2693 				execlists_schedule_out(*old++);
2694 
2695 			/* switch pending to inflight */
2696 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2697 			copy_ports(execlists->inflight,
2698 				   execlists->pending,
2699 				   execlists_num_ports(execlists));
2700 			smp_wmb(); /* complete the seqlock */
2701 			WRITE_ONCE(execlists->active, execlists->inflight);
2702 
2703 			/* XXX Magic delay for tgl */
2704 			ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2705 
2706 			WRITE_ONCE(execlists->pending[0], NULL);
2707 		} else {
2708 			if (GEM_WARN_ON(!*execlists->active)) {
2709 				execlists->error_interrupt |= ERROR_CSB;
2710 				break;
2711 			}
2712 
2713 			/* port0 completed, advanced to port1 */
2714 			trace_ports(execlists, "completed", execlists->active);
2715 
2716 			/*
2717 			 * We rely on the hardware being strongly
2718 			 * ordered, that the breadcrumb write is
2719 			 * coherent (visible from the CPU) before the
2720 			 * user interrupt is processed. One might assume
2721 			 * that, as the breadcrumb write comes before the
2722 			 * user interrupt and the user interrupt before the
2723 			 * CS event for the context switch, the write would
2724 			 * therefore be visible before the CS event itself...
2725 			 */
2726 			if (GEM_SHOW_DEBUG() &&
2727 			    !i915_request_completed(*execlists->active)) {
2728 				struct i915_request *rq = *execlists->active;
2729 				const u32 *regs __maybe_unused =
2730 					rq->context->lrc_reg_state;
2731 
2732 				ENGINE_TRACE(engine,
2733 					     "context completed before request!\n");
2734 				ENGINE_TRACE(engine,
2735 					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2736 					     ENGINE_READ(engine, RING_START),
2737 					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2738 					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2739 					     ENGINE_READ(engine, RING_CTL),
2740 					     ENGINE_READ(engine, RING_MI_MODE));
2741 				ENGINE_TRACE(engine,
2742 					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2743 					     i915_ggtt_offset(rq->ring->vma),
2744 					     rq->head, rq->tail,
2745 					     rq->fence.context,
2746 					     lower_32_bits(rq->fence.seqno),
2747 					     hwsp_seqno(rq));
2748 				ENGINE_TRACE(engine,
2749 					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2750 					     regs[CTX_RING_START],
2751 					     regs[CTX_RING_HEAD],
2752 					     regs[CTX_RING_TAIL]);
2753 			}
2754 
2755 			execlists_schedule_out(*execlists->active++);
2756 
2757 			GEM_BUG_ON(execlists->active - execlists->inflight >
2758 				   execlists_num_ports(execlists));
2759 		}
2760 	} while (head != tail);
2761 
2762 	set_timeslice(engine);
2763 
2764 	/*
2765 	 * Gen11 has proven to fail, wrt the global observation point, to
2766 	 * order the CSB entry write before the tail update, and thus we
2767 	 * may see a stale entry in the context status buffer.
2768 	 *
2769 	 * Forcibly evict the entries before the next GPU CSB update, to
2770 	 * increase the odds that we read fresh entries even on non-working
2771 	 * hardware. The cost of doing so comes out mostly in the wash, as
2772 	 * hardware, working or not, will need to do the invalidation
2773 	 * beforehand anyway.
2774 	 */
2775 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2776 }
2777 
2778 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2779 {
2780 	lockdep_assert_held(&engine->active.lock);
2781 	if (!READ_ONCE(engine->execlists.pending[0])) {
2782 		rcu_read_lock(); /* protect peeking at execlists->active */
2783 		execlists_dequeue(engine);
2784 		rcu_read_unlock();
2785 	}
2786 }
2787 
2788 static void __execlists_hold(struct i915_request *rq)
2789 {
2790 	LIST_HEAD(list);
2791 
2792 	do {
2793 		struct i915_dependency *p;
2794 
2795 		if (i915_request_is_active(rq))
2796 			__i915_request_unsubmit(rq);
2797 
2798 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2799 		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2800 		i915_request_set_hold(rq);
2801 		RQ_TRACE(rq, "on hold\n");
2802 
2803 		for_each_waiter(p, rq) {
2804 			struct i915_request *w =
2805 				container_of(p->waiter, typeof(*w), sched);
2806 
2807 			/* Leave semaphores spinning on the other engines */
2808 			if (w->engine != rq->engine)
2809 				continue;
2810 
2811 			if (!i915_request_is_ready(w))
2812 				continue;
2813 
2814 			if (i915_request_completed(w))
2815 				continue;
2816 
2817 			if (i915_request_on_hold(w))
2818 				continue;
2819 
2820 			list_move_tail(&w->sched.link, &list);
2821 		}
2822 
2823 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2824 	} while (rq);
2825 }
2826 
2827 static bool execlists_hold(struct intel_engine_cs *engine,
2828 			   struct i915_request *rq)
2829 {
2830 	spin_lock_irq(&engine->active.lock);
2831 
2832 	if (i915_request_completed(rq)) { /* too late! */
2833 		rq = NULL;
2834 		goto unlock;
2835 	}
2836 
2837 	if (rq->engine != engine) { /* preempted virtual engine */
2838 		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2839 
2840 		/*
2841 		 * intel_context_inflight() is only protected by virtue
2842 		 * of process_csb() being called only by the tasklet (or
2843 		 * directly from inside reset while the tasklet is suspended).
2844 		 * Assert that neither of those are allowed to run while we
2845 		 * poke at the request queues.
2846 		 */
2847 		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2848 
2849 		/*
2850 		 * An unsubmitted request along a virtual engine will
2851 		 * remain on the active (this) engine until we are able
2852 		 * to process the context switch away (and so mark the
2853 		 * context as no longer in flight). That cannot have happened
2854 		 * yet, otherwise we would not be hanging!
2855 		 */
2856 		spin_lock(&ve->base.active.lock);
2857 		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2858 		GEM_BUG_ON(ve->request != rq);
2859 		ve->request = NULL;
2860 		spin_unlock(&ve->base.active.lock);
2861 		i915_request_put(rq);
2862 
2863 		rq->engine = engine;
2864 	}
2865 
2866 	/*
2867 	 * Transfer this request onto the hold queue to prevent it
2868 	 * being resubmitted to HW (and potentially completed) before we have
2869 	 * released it. Since we may have already submitted following
2870 	 * requests, we need to remove those as well.
2871 	 */
2872 	GEM_BUG_ON(i915_request_on_hold(rq));
2873 	GEM_BUG_ON(rq->engine != engine);
2874 	__execlists_hold(rq);
2875 	GEM_BUG_ON(list_empty(&engine->active.hold));
2876 
2877 unlock:
2878 	spin_unlock_irq(&engine->active.lock);
2879 	return rq;
2880 }
2881 
2882 static bool hold_request(const struct i915_request *rq)
2883 {
2884 	struct i915_dependency *p;
2885 	bool result = false;
2886 
2887 	/*
2888 	 * If one of our ancestors is on hold, we must also be on hold,
2889 	 * otherwise we will bypass it and execute before it.
2890 	 */
2891 	rcu_read_lock();
2892 	for_each_signaler(p, rq) {
2893 		const struct i915_request *s =
2894 			container_of(p->signaler, typeof(*s), sched);
2895 
2896 		if (s->engine != rq->engine)
2897 			continue;
2898 
2899 		result = i915_request_on_hold(s);
2900 		if (result)
2901 			break;
2902 	}
2903 	rcu_read_unlock();
2904 
2905 	return result;
2906 }
2907 
2908 static void __execlists_unhold(struct i915_request *rq)
2909 {
2910 	LIST_HEAD(list);
2911 
2912 	do {
2913 		struct i915_dependency *p;
2914 
2915 		RQ_TRACE(rq, "hold release\n");
2916 
2917 		GEM_BUG_ON(!i915_request_on_hold(rq));
2918 		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2919 
2920 		i915_request_clear_hold(rq);
2921 		list_move_tail(&rq->sched.link,
2922 			       i915_sched_lookup_priolist(rq->engine,
2923 							  rq_prio(rq)));
2924 		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2925 
2926 		/* Also release any children on this engine that are ready */
2927 		for_each_waiter(p, rq) {
2928 			struct i915_request *w =
2929 				container_of(p->waiter, typeof(*w), sched);
2930 
2931 			/* Propagate any change in error status */
2932 			if (rq->fence.error)
2933 				i915_request_set_error_once(w, rq->fence.error);
2934 
2935 			if (w->engine != rq->engine)
2936 				continue;
2937 
2938 			if (!i915_request_on_hold(w))
2939 				continue;
2940 
2941 			/* Check that no other parents are also on hold */
2942 			if (hold_request(w))
2943 				continue;
2944 
2945 			list_move_tail(&w->sched.link, &list);
2946 		}
2947 
2948 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2949 	} while (rq);
2950 }
2951 
2952 static void execlists_unhold(struct intel_engine_cs *engine,
2953 			     struct i915_request *rq)
2954 {
2955 	spin_lock_irq(&engine->active.lock);
2956 
2957 	/*
2958 	 * Move this request back to the priority queue, and all of its
2959 	 * children and grandchildren that were suspended along with it.
2960 	 */
2961 	__execlists_unhold(rq);
2962 
2963 	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2964 		engine->execlists.queue_priority_hint = rq_prio(rq);
2965 		tasklet_hi_schedule(&engine->execlists.tasklet);
2966 	}
2967 
2968 	spin_unlock_irq(&engine->active.lock);
2969 }
2970 
2971 struct execlists_capture {
2972 	struct work_struct work;
2973 	struct i915_request *rq;
2974 	struct i915_gpu_coredump *error;
2975 };
2976 
2977 static void execlists_capture_work(struct work_struct *work)
2978 {
2979 	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2980 	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2981 	struct intel_engine_cs *engine = cap->rq->engine;
2982 	struct intel_gt_coredump *gt = cap->error->gt;
2983 	struct intel_engine_capture_vma *vma;
2984 
2985 	/* Compress all the objects attached to the request, slow! */
2986 	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2987 	if (vma) {
2988 		struct i915_vma_compress *compress =
2989 			i915_vma_capture_prepare(gt);
2990 
2991 		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2992 		i915_vma_capture_finish(gt, compress);
2993 	}
2994 
2995 	gt->simulated = gt->engine->simulated;
2996 	cap->error->simulated = gt->simulated;
2997 
2998 	/* Publish the error state, and announce it to the world */
2999 	i915_error_state_store(cap->error);
3000 	i915_gpu_coredump_put(cap->error);
3001 
3002 	/* Return this request and all that depend upon it for signaling */
3003 	execlists_unhold(engine, cap->rq);
3004 	i915_request_put(cap->rq);
3005 
3006 	kfree(cap);
3007 }
3008 
3009 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
3010 {
3011 	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
3012 	struct execlists_capture *cap;
3013 
3014 	cap = kmalloc(sizeof(*cap), gfp);
3015 	if (!cap)
3016 		return NULL;
3017 
3018 	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
3019 	if (!cap->error)
3020 		goto err_cap;
3021 
3022 	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
3023 	if (!cap->error->gt)
3024 		goto err_gpu;
3025 
3026 	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
3027 	if (!cap->error->gt->engine)
3028 		goto err_gt;
3029 
3030 	cap->error->gt->engine->hung = true;
3031 
3032 	return cap;
3033 
3034 err_gt:
3035 	kfree(cap->error->gt);
3036 err_gpu:
3037 	kfree(cap->error);
3038 err_cap:
3039 	kfree(cap);
3040 	return NULL;
3041 }
3042 
3043 static struct i915_request *
3044 active_context(struct intel_engine_cs *engine, u32 ccid)
3045 {
3046 	const struct intel_engine_execlists * const el = &engine->execlists;
3047 	struct i915_request * const *port, *rq;
3048 
3049 	/*
3050 	 * Use the most recent result from process_csb(), but just in case
3051 	 * we trigger an error (via interrupt) before the first CS event has
3052 	 * been written, peek at the next submission.
3053 	 */
3054 
3055 	for (port = el->active; (rq = *port); port++) {
3056 		if (rq->context->lrc.ccid == ccid) {
3057 			ENGINE_TRACE(engine,
3058 				     "ccid found at active:%zd\n",
3059 				     port - el->active);
3060 			return rq;
3061 		}
3062 	}
3063 
3064 	for (port = el->pending; (rq = *port); port++) {
3065 		if (rq->context->lrc.ccid == ccid) {
3066 			ENGINE_TRACE(engine,
3067 				     "ccid found at pending:%zd\n",
3068 				     port - el->pending);
3069 			return rq;
3070 		}
3071 	}
3072 
3073 	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3074 	return NULL;
3075 }
3076 
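/* The upper dword of EXECLIST_STATUS carries the ccid of the running context */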
3077 static u32 active_ccid(struct intel_engine_cs *engine)
3078 {
3079 	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3080 }
3081 
3082 static void execlists_capture(struct intel_engine_cs *engine)
3083 {
3084 	struct execlists_capture *cap;
3085 
3086 	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3087 		return;
3088 
3089 	/*
3090 	 * We need to _quickly_ capture the engine state before we reset.
3091 	 * We are inside an atomic section (softirq) here and we are delaying
3092 	 * the forced preemption event.
3093 	 */
3094 	cap = capture_regs(engine);
3095 	if (!cap)
3096 		return;
3097 
3098 	spin_lock_irq(&engine->active.lock);
3099 	cap->rq = active_context(engine, active_ccid(engine));
3100 	if (cap->rq) {
3101 		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3102 		cap->rq = i915_request_get_rcu(cap->rq);
3103 	}
3104 	spin_unlock_irq(&engine->active.lock);
3105 	if (!cap->rq)
3106 		goto err_free;
3107 
3108 	/*
3109 	 * Remove the request from the execlists queue, and take ownership
3110 	 * of the request. We pass it to our worker who will _slowly_ compress
3111 	 * all the pages the _user_ requested for debugging their batch, after
3112 	 * which we return it to the queue for signaling.
3113 	 *
3114 	 * By removing them from the execlists queue, we also remove the
3115 	 * requests from being processed by __unwind_incomplete_requests()
3116 	 * during the intel_engine_reset(), and so they will *not* be replayed
3117 	 * afterwards.
3118 	 *
3119 	 * Note that because we have not yet reset the engine at this point,
3120 	 * it is possible that the request we have identified as being
3121 	 * guilty did in fact complete, and we will then hit an arbitration
3122 	 * point allowing the outstanding preemption to succeed.
3123 	 * of that is very low (as capturing of the engine registers should be
3124 	 * fast enough to run inside an irq-off atomic section!), so we will
3125 	 * simply hold that request accountable for being non-preemptible
3126 	 * long enough to force the reset.
3127 	 */
3128 	if (!execlists_hold(engine, cap->rq))
3129 		goto err_rq;
3130 
3131 	INIT_WORK(&cap->work, execlists_capture_work);
3132 	schedule_work(&cap->work);
3133 	return;
3134 
3135 err_rq:
3136 	i915_request_put(cap->rq);
3137 err_free:
3138 	i915_gpu_coredump_put(cap->error);
3139 	kfree(cap);
3140 }
3141 
3142 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3143 {
3144 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3145 	unsigned long *lock = &engine->gt->reset.flags;
3146 
3147 	if (!intel_has_reset_engine(engine->gt))
3148 		return;
3149 
3150 	if (test_and_set_bit(bit, lock))
3151 		return;
3152 
3153 	ENGINE_TRACE(engine, "reset for %s\n", msg);
3154 
3155 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3156 	tasklet_disable_nosync(&engine->execlists.tasklet);
3157 
3158 	ring_set_paused(engine, 1); /* Freeze the current request in place */
3159 	execlists_capture(engine);
3160 	intel_engine_reset(engine, msg);
3161 
3162 	tasklet_enable(&engine->execlists.tasklet);
3163 	clear_and_wake_up_bit(bit, lock);
3164 }
3165 
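/* Has forced preemption expired while a new submission is still pending? */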
3166 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3167 {
3168 	const struct timer_list *t = &engine->execlists.preempt;
3169 
3170 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3171 		return false;
3172 
3173 	if (!timer_expired(t))
3174 		return false;
3175 
3176 	return READ_ONCE(engine->execlists.pending[0]);
3177 }
3178 
3179 /*
3180  * Check the unread Context Status Buffers and manage the submission of new
3181  * contexts to the ELSP accordingly.
3182  */
3183 static void execlists_submission_tasklet(unsigned long data)
3184 {
3185 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3186 	bool timeout = preempt_timeout(engine);
3187 
3188 	process_csb(engine);
3189 
3190 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3191 		const char *msg;
3192 
3193 		/* Generate the error message in priority wrt the user! */
3194 		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3195 			msg = "CS error"; /* thrown by a user payload */
3196 		else if (engine->execlists.error_interrupt & ERROR_CSB)
3197 			msg = "invalid CSB event";
3198 		else
3199 			msg = "internal error";
3200 
3201 		engine->execlists.error_interrupt = 0;
3202 		execlists_reset(engine, msg);
3203 	}
3204 
3205 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3206 		unsigned long flags;
3207 
3208 		spin_lock_irqsave(&engine->active.lock, flags);
3209 		__execlists_submission_tasklet(engine);
3210 		spin_unlock_irqrestore(&engine->active.lock, flags);
3211 
3212 		/* Recheck after serialising with direct-submission */
3213 		if (unlikely(timeout && preempt_timeout(engine)))
3214 			execlists_reset(engine, "preemption time out");
3215 	}
3216 }
3217 
3218 static void __execlists_kick(struct intel_engine_execlists *execlists)
3219 {
3220 	/* Kick the tasklet for some interrupt coalescing and reset handling */
3221 	tasklet_hi_schedule(&execlists->tasklet);
3222 }
3223 
3224 #define execlists_kick(t, member) \
3225 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3226 
3227 static void execlists_timeslice(struct timer_list *timer)
3228 {
3229 	execlists_kick(timer, timer);
3230 }
3231 
3232 static void execlists_preempt(struct timer_list *timer)
3233 {
3234 	execlists_kick(timer, preempt);
3235 }
3236 
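/* Add the request to the priolist for its priority level and mark it queued */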
3237 static void queue_request(struct intel_engine_cs *engine,
3238 			  struct i915_request *rq)
3239 {
3240 	GEM_BUG_ON(!list_empty(&rq->sched.link));
3241 	list_add_tail(&rq->sched.link,
3242 		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3243 	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3244 }
3245 
3246 static void __submit_queue_imm(struct intel_engine_cs *engine)
3247 {
3248 	struct intel_engine_execlists * const execlists = &engine->execlists;
3249 
3250 	if (reset_in_progress(execlists))
3251 		return; /* defer until we restart the engine following reset */
3252 
3253 	__execlists_submission_tasklet(engine);
3254 }
3255 
3256 static void submit_queue(struct intel_engine_cs *engine,
3257 			 const struct i915_request *rq)
3258 {
3259 	struct intel_engine_execlists *execlists = &engine->execlists;
3260 
3261 	if (rq_prio(rq) <= execlists->queue_priority_hint)
3262 		return;
3263 
3264 	execlists->queue_priority_hint = rq_prio(rq);
3265 	__submit_queue_imm(engine);
3266 }
3267 
3268 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3269 			     const struct i915_request *rq)
3270 {
3271 	GEM_BUG_ON(i915_request_on_hold(rq));
3272 	return !list_empty(&engine->active.hold) && hold_request(rq);
3273 }
3274 
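/*
 * Opportunistically process any CSB events before submission, in the hope
 * of clearing execlists->pending[] so that we may submit directly.
 */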
3275 static void flush_csb(struct intel_engine_cs *engine)
3276 {
3277 	struct intel_engine_execlists *el = &engine->execlists;
3278 
3279 	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3280 		if (!reset_in_progress(el))
3281 			process_csb(engine);
3282 		tasklet_unlock(&el->tasklet);
3283 	}
3284 }
3285 
3286 static void execlists_submit_request(struct i915_request *request)
3287 {
3288 	struct intel_engine_cs *engine = request->engine;
3289 	unsigned long flags;
3290 
3291 	/* Hopefully we clear execlists->pending[] to let us through */
3292 	flush_csb(engine);
3293 
3294 	/* Will be called from irq-context when using foreign fences. */
3295 	spin_lock_irqsave(&engine->active.lock, flags);
3296 
3297 	if (unlikely(ancestor_on_hold(engine, request))) {
3298 		RQ_TRACE(request, "ancestor on hold\n");
3299 		list_add_tail(&request->sched.link, &engine->active.hold);
3300 		i915_request_set_hold(request);
3301 	} else {
3302 		queue_request(engine, request);
3303 
3304 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3305 		GEM_BUG_ON(list_empty(&request->sched.link));
3306 
3307 		submit_queue(engine, request);
3308 	}
3309 
3310 	spin_unlock_irqrestore(&engine->active.lock, flags);
3311 }
3312 
3313 static void __execlists_context_fini(struct intel_context *ce)
3314 {
3315 	intel_ring_put(ce->ring);
3316 	i915_vma_put(ce->state);
3317 }
3318 
3319 static void execlists_context_destroy(struct kref *kref)
3320 {
3321 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3322 
3323 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3324 	GEM_BUG_ON(intel_context_is_pinned(ce));
3325 
3326 	if (ce->state)
3327 		__execlists_context_fini(ce);
3328 
3329 	intel_context_fini(ce);
3330 	intel_context_free(ce);
3331 }
3332 
3333 static void
3334 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3335 {
3336 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3337 		return;
3338 
3339 	vaddr += engine->context_size;
3340 
3341 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3342 }
3343 
3344 static void
3345 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3346 {
3347 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3348 		return;
3349 
3350 	vaddr += engine->context_size;
3351 
3352 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3353 		drm_err_once(&engine->i915->drm,
3354 			     "%s context redzone overwritten!\n",
3355 			     engine->name);
3356 }
3357 
3358 static void execlists_context_unpin(struct intel_context *ce)
3359 {
3360 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3361 		      ce->engine);
3362 }
3363 
3364 static void execlists_context_post_unpin(struct intel_context *ce)
3365 {
3366 	i915_gem_object_unpin_map(ce->state->obj);
3367 }
3368 
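/*
 * Workaround: reload RING_CTX_TIMESTAMP from the value saved in the context
 * image, staged through CS GPR0.
 */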
3369 static u32 *
3370 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3371 {
3372 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3373 		MI_SRM_LRM_GLOBAL_GTT |
3374 		MI_LRI_LRM_CS_MMIO;
3375 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3376 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3377 		CTX_TIMESTAMP * sizeof(u32);
3378 	*cs++ = 0;
3379 
3380 	*cs++ = MI_LOAD_REGISTER_REG |
3381 		MI_LRR_SOURCE_CS_MMIO |
3382 		MI_LRI_LRM_CS_MMIO;
3383 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3384 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3385 
3386 	*cs++ = MI_LOAD_REGISTER_REG |
3387 		MI_LRR_SOURCE_CS_MMIO |
3388 		MI_LRI_LRM_CS_MMIO;
3389 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3390 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3391 
3392 	return cs;
3393 }
3394 
3395 static u32 *
3396 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3397 {
3398 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3399 
3400 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3401 		MI_SRM_LRM_GLOBAL_GTT |
3402 		MI_LRI_LRM_CS_MMIO;
3403 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3404 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3405 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3406 	*cs++ = 0;
3407 
3408 	return cs;
3409 }
3410 
3411 static u32 *
3412 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3413 {
3414 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3415 
3416 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3417 		MI_SRM_LRM_GLOBAL_GTT |
3418 		MI_LRI_LRM_CS_MMIO;
3419 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3420 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3421 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3422 	*cs++ = 0;
3423 
3424 	*cs++ = MI_LOAD_REGISTER_REG |
3425 		MI_LRR_SOURCE_CS_MMIO |
3426 		MI_LRI_LRM_CS_MMIO;
3427 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3428 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3429 
3430 	return cs;
3431 }
3432 
3433 static u32 *
3434 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3435 {
3436 	cs = gen12_emit_timestamp_wa(ce, cs);
3437 	cs = gen12_emit_cmd_buf_wa(ce, cs);
3438 	cs = gen12_emit_restore_scratch(ce, cs);
3439 
3440 	return cs;
3441 }
3442 
3443 static u32 *
3444 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3445 {
3446 	cs = gen12_emit_timestamp_wa(ce, cs);
3447 	cs = gen12_emit_restore_scratch(ce, cs);
3448 
3449 	return cs;
3450 }
3451 
3452 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3453 {
3454 	return PAGE_SIZE * ce->wa_bb_page;
3455 }
3456 
3457 static u32 *context_indirect_bb(const struct intel_context *ce)
3458 {
3459 	void *ptr;
3460 
3461 	GEM_BUG_ON(!ce->wa_bb_page);
3462 
3463 	ptr = ce->lrc_reg_state;
3464 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3465 	ptr += context_wa_bb_offset(ce);
3466 
3467 	return ptr;
3468 }
3469 
3470 static void
3471 setup_indirect_ctx_bb(const struct intel_context *ce,
3472 		      const struct intel_engine_cs *engine,
3473 		      u32 *(*emit)(const struct intel_context *, u32 *))
3474 {
3475 	u32 * const start = context_indirect_bb(ce);
3476 	u32 *cs;
3477 
3478 	cs = emit(ce, start);
3479 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3480 	while ((unsigned long)cs % CACHELINE_BYTES)
3481 		*cs++ = MI_NOOP;
3482 
3483 	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3484 				    i915_ggtt_offset(ce->state) +
3485 				    context_wa_bb_offset(ce),
3486 				    (cs - start) * sizeof(*cs));
3487 }
3488 
3489 static void
3490 __execlists_update_reg_state(const struct intel_context *ce,
3491 			     const struct intel_engine_cs *engine,
3492 			     u32 head)
3493 {
3494 	struct intel_ring *ring = ce->ring;
3495 	u32 *regs = ce->lrc_reg_state;
3496 
3497 	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3498 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3499 
3500 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3501 	regs[CTX_RING_HEAD] = head;
3502 	regs[CTX_RING_TAIL] = ring->tail;
3503 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3504 
3505 	/* RPCS */
3506 	if (engine->class == RENDER_CLASS) {
3507 		regs[CTX_R_PWR_CLK_STATE] =
3508 			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3509 
3510 		i915_oa_init_reg_state(ce, engine);
3511 	}
3512 
3513 	if (ce->wa_bb_page) {
3514 		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3515 
3516 		fn = gen12_emit_indirect_ctx_xcs;
3517 		if (ce->engine->class == RENDER_CLASS)
3518 			fn = gen12_emit_indirect_ctx_rcs;
3519 
3520 		/* Mutually exclusive wrt the global indirect bb */
3521 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3522 		setup_indirect_ctx_bb(ce, engine, fn);
3523 	}
3524 }
3525 
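/* Keep a CPU map of the context image; the pin step below updates it */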
3526 static int
3527 execlists_context_pre_pin(struct intel_context *ce,
3528 			  struct i915_gem_ww_ctx *ww, void **vaddr)
3529 {
3530 	GEM_BUG_ON(!ce->state);
3531 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3532 
3533 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
3534 					i915_coherent_map_type(ce->engine->i915) |
3535 					I915_MAP_OVERRIDE);
3536 
3537 	return PTR_ERR_OR_ZERO(*vaddr);
3538 }
3539 
3540 static int
3541 __execlists_context_pin(struct intel_context *ce,
3542 			struct intel_engine_cs *engine,
3543 			void *vaddr)
3544 {
3545 	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3546 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3547 	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3548 
3549 	return 0;
3550 }
3551 
3552 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3553 {
3554 	return __execlists_context_pin(ce, ce->engine, vaddr);
3555 }
3556 
3557 static int execlists_context_alloc(struct intel_context *ce)
3558 {
3559 	return __execlists_context_alloc(ce, ce->engine);
3560 }
3561 
3562 static void execlists_context_reset(struct intel_context *ce)
3563 {
3564 	CE_TRACE(ce, "reset\n");
3565 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3566 
3567 	intel_ring_reset(ce->ring, ce->ring->emit);
3568 
3569 	/* Scrub away the garbage */
3570 	execlists_init_reg_state(ce->lrc_reg_state,
3571 				 ce, ce->engine, ce->ring, true);
3572 	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3573 
3574 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3575 }
3576 
3577 static const struct intel_context_ops execlists_context_ops = {
3578 	.alloc = execlists_context_alloc,
3579 
3580 	.pre_pin = execlists_context_pre_pin,
3581 	.pin = execlists_context_pin,
3582 	.unpin = execlists_context_unpin,
3583 	.post_unpin = execlists_context_post_unpin,
3584 
3585 	.enter = intel_context_enter_engine,
3586 	.exit = intel_context_exit_engine,
3587 
3588 	.reset = execlists_context_reset,
3589 	.destroy = execlists_context_destroy,
3590 };
3591 
3592 static u32 hwsp_offset(const struct i915_request *rq)
3593 {
3594 	const struct intel_timeline_cacheline *cl;
3595 
3596 	/* Before the request is executed, the timeline/cacheline is fixed */
3597 
3598 	cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3599 	if (cl)
3600 		return cl->ggtt_offset;
3601 
3602 	return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3603 }
3604 
3605 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3606 {
3607 	u32 *cs;
3608 
3609 	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3610 	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3611 		return 0;
3612 
3613 	cs = intel_ring_begin(rq, 6);
3614 	if (IS_ERR(cs))
3615 		return PTR_ERR(cs);
3616 
3617 	/*
3618 	 * Check if we have been preempted before we even get started.
3619 	 *
3620 	 * After this point i915_request_started() reports true, even if
3621 	 * we get preempted and so are no longer running.
3622 	 */
3623 	*cs++ = MI_ARB_CHECK;
3624 	*cs++ = MI_NOOP;
3625 
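	/* The initial breadcrumb: write seqno - 1 to the timeline HWSP */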
3626 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3627 	*cs++ = hwsp_offset(rq);
3628 	*cs++ = 0;
3629 	*cs++ = rq->fence.seqno - 1;
3630 
3631 	intel_ring_advance(rq, cs);
3632 
3633 	/* Record the updated position of the request's payload */
3634 	rq->infix = intel_ring_offset(rq, cs);
3635 
3636 	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3637 
3638 	return 0;
3639 }
3640 
3641 static int emit_pdps(struct i915_request *rq)
3642 {
3643 	const struct intel_engine_cs * const engine = rq->engine;
3644 	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3645 	int err, i;
3646 	u32 *cs;
3647 
3648 	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3649 
3650 	/*
3651 	 * Beware ye of the dragons, this sequence is magic!
3652 	 *
3653 	 * Small changes to this sequence can cause anything from
3654 	 * GPU hangs to forcewake errors and machine lockups!
3655 	 */
3656 
3657 	/* Flush any residual operations from the context load */
3658 	err = engine->emit_flush(rq, EMIT_FLUSH);
3659 	if (err)
3660 		return err;
3661 
3662 	/* Magic required to prevent forcewake errors! */
3663 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3664 	if (err)
3665 		return err;
3666 
3667 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3668 	if (IS_ERR(cs))
3669 		return PTR_ERR(cs);
3670 
3671 	/* Ensure the LRI have landed before we invalidate & continue */
3672 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3673 	for (i = GEN8_3LVL_PDPES; i--; ) {
3674 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3675 		u32 base = engine->mmio_base;
3676 
3677 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3678 		*cs++ = upper_32_bits(pd_daddr);
3679 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3680 		*cs++ = lower_32_bits(pd_daddr);
3681 	}
3682 	*cs++ = MI_NOOP;
3683 
3684 	intel_ring_advance(rq, cs);
3685 
3686 	return 0;
3687 }
3688 
3689 static int execlists_request_alloc(struct i915_request *request)
3690 {
3691 	int ret;
3692 
3693 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3694 
3695 	/*
3696 	 * Flush enough space to reduce the likelihood of waiting after
3697 	 * we start building the request - in which case we will just
3698 	 * have to repeat work.
3699 	 */
3700 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3701 
3702 	/*
3703 	 * Note that after this point, we have committed to using
3704 	 * this request as it is being used to both track the
3705 	 * state of engine initialisation and liveness of the
3706 	 * golden renderstate above. Think twice before you try
3707 	 * to cancel/unwind this request now.
3708 	 */
3709 
3710 	if (!i915_vm_is_4lvl(request->context->vm)) {
3711 		ret = emit_pdps(request);
3712 		if (ret)
3713 			return ret;
3714 	}
3715 
3716 	/* Unconditionally invalidate GPU caches and TLBs. */
3717 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3718 	if (ret)
3719 		return ret;
3720 
3721 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3722 	return 0;
3723 }
3724 
3725 /*
3726  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3727  * the PIPE_CONTROL instruction. This is required for the flush to happen
3728  * correctly, but there is a slight complication: this is applied in a WA
3729  * batch where the values are only initialized once, so we cannot read the
3730  * register value at the beginning and reuse it later; hence we save its
3731  * value to memory, upload a constant value with bit 21 set and then restore
3732  * the saved value. To simplify the WA, the constant value is formed from the
3733  * default value of this register. This shouldn't be a problem because we are
3734  * only modifying it for a short period and this batch is non-preemptible. We
3735  * could of course use additional instructions that read the actual value
3736  * of the register and set our bit of interest, but that complicates the WA.
3737  *
3738  * This WA is also required for Gen9 so extracting as a function avoids
3739  * code duplication.
3740  */
3741 static u32 *
3742 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3743 {
3744 	/* NB no one else is allowed to scribble over scratch + 256! */
3745 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3746 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3747 	*batch++ = intel_gt_scratch_offset(engine->gt,
3748 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3749 	*batch++ = 0;
3750 
3751 	*batch++ = MI_LOAD_REGISTER_IMM(1);
3752 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3753 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3754 
3755 	batch = gen8_emit_pipe_control(batch,
3756 				       PIPE_CONTROL_CS_STALL |
3757 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3758 				       0);
3759 
3760 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3761 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3762 	*batch++ = intel_gt_scratch_offset(engine->gt,
3763 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3764 	*batch++ = 0;
3765 
3766 	return batch;
3767 }
3768 
3769 /*
3770  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3771  * initialized at the beginning and shared across all contexts but this field
3772  * helps us to have multiple batches at different offsets and select them
3773  * based on some criteria. At the moment this batch always starts at the
3774  * beginning of the page and we don't yet have multiple wa_ctx batch buffers.
3775  *
3776  * The number of WAs applied is not known up front; we use this field to
3777  * return the number of DWORDs written.
3778  *
3779  * Note that this batch does not contain MI_BATCH_BUFFER_END, so NOOPs are
3780  * added as padding to make it cacheline aligned.
3781  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and the two
3782  * together make a complete batch buffer.
3783  */
3784 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3785 {
3786 	/* WaDisableCtxRestoreArbitration:bdw,chv */
3787 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3788 
3789 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3790 	if (IS_BROADWELL(engine->i915))
3791 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3792 
3793 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3794 	/* Actual scratch location is at 128 bytes offset */
3795 	batch = gen8_emit_pipe_control(batch,
3796 				       PIPE_CONTROL_FLUSH_L3 |
3797 				       PIPE_CONTROL_STORE_DATA_INDEX |
3798 				       PIPE_CONTROL_CS_STALL |
3799 				       PIPE_CONTROL_QW_WRITE,
3800 				       LRC_PPHWSP_SCRATCH_ADDR);
3801 
3802 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3803 
3804 	/* Pad to end of cacheline */
3805 	while ((unsigned long)batch % CACHELINE_BYTES)
3806 		*batch++ = MI_NOOP;
3807 
3808 	/*
3809 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3810 	 * execution depends on the length specified in terms of cache lines
3811 	 * in the register CTX_RCS_INDIRECT_CTX
3812 	 */
3813 
3814 	return batch;
3815 }
3816 
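/* Register/value pair consumed by emit_lri() below */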
3817 struct lri {
3818 	i915_reg_t reg;
3819 	u32 value;
3820 };
3821 
3822 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3823 {
3824 	GEM_BUG_ON(!count || count > 63);
3825 
3826 	*batch++ = MI_LOAD_REGISTER_IMM(count);
3827 	do {
3828 		*batch++ = i915_mmio_reg_offset(lri->reg);
3829 		*batch++ = lri->value;
3830 	} while (lri++, --count);
3831 	*batch++ = MI_NOOP;
3832 
3833 	return batch;
3834 }
3835 
3836 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3837 {
3838 	static const struct lri lri[] = {
3839 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3840 		{
3841 			COMMON_SLICE_CHICKEN2,
3842 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3843 				       0),
3844 		},
3845 
3846 		/* BSpec: 11391 */
3847 		{
3848 			FF_SLICE_CHICKEN,
3849 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3850 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3851 		},
3852 
3853 		/* BSpec: 11299 */
3854 		{
3855 			_3D_CHICKEN3,
3856 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3857 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3858 		}
3859 	};
3860 
3861 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3862 
3863 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3864 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3865 
3866 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3867 	batch = gen8_emit_pipe_control(batch,
3868 				       PIPE_CONTROL_FLUSH_L3 |
3869 				       PIPE_CONTROL_STORE_DATA_INDEX |
3870 				       PIPE_CONTROL_CS_STALL |
3871 				       PIPE_CONTROL_QW_WRITE,
3872 				       LRC_PPHWSP_SCRATCH_ADDR);
3873 
3874 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3875 
3876 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3877 	if (HAS_POOLED_EU(engine->i915)) {
3878 		/*
3879 		 * EU pool configuration is set up along with the golden
3880 		 * context during context initialization. This value depends
3881 		 * on the device type (2x6 or 3x6) and needs to be updated
3882 		 * based on which subslice is disabled, especially for 2x6
3883 		 * devices. However, it is safe to load the default 3x6
3884 		 * configuration instead of masking off the corresponding
3885 		 * bits, because the HW ignores the bits of a disabled
3886 		 * subslice and drops down to the appropriate config. Please
3887 		 * see render_state_setup() in i915_gem_render_state.c for
3888 		 * the possible configurations; to avoid duplication they
3889 		 * are not shown here again.
3890 		 */
3891 		*batch++ = GEN9_MEDIA_POOL_STATE;
3892 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3893 		*batch++ = 0x00777000;
3894 		*batch++ = 0;
3895 		*batch++ = 0;
3896 		*batch++ = 0;
3897 	}
3898 
3899 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3900 
3901 	/* Pad to end of cacheline */
3902 	while ((unsigned long)batch % CACHELINE_BYTES)
3903 		*batch++ = MI_NOOP;
3904 
3905 	return batch;
3906 }
3907 
3908 static u32 *
3909 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3910 {
3911 	int i;
3912 
3913 	/*
3914 	 * WaPipeControlBefore3DStateSamplePattern: cnl
3915 	 *
3916 	 * Ensure the engine is idle prior to programming a
3917 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3918 	 */
3919 	batch = gen8_emit_pipe_control(batch,
3920 				       PIPE_CONTROL_CS_STALL,
3921 				       0);
3922 	/*
3923 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3924 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3925 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3926 	 * confusing. Since gen8_emit_pipe_control() already advances the
3927 	 * batch by 6 dwords, we advance the other 10 here, completing a
3928 	 * cacheline. It's not clear if the workaround requires this padding
3929 	 * before other commands, or if it's just the regular padding we would
3930 	 * already have for the workaround bb, so leave it here for now.
3931 	 */
3932 	for (i = 0; i < 10; i++)
3933 		*batch++ = MI_NOOP;
3934 
3935 	/* Pad to end of cacheline */
3936 	while ((unsigned long)batch % CACHELINE_BYTES)
3937 		*batch++ = MI_NOOP;
3938 
3939 	return batch;
3940 }
3941 
3942 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3943 
3944 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3945 {
3946 	struct drm_i915_gem_object *obj;
3947 	struct i915_vma *vma;
3948 	int err;
3949 
3950 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3951 	if (IS_ERR(obj))
3952 		return PTR_ERR(obj);
3953 
3954 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3955 	if (IS_ERR(vma)) {
3956 		err = PTR_ERR(vma);
3957 		goto err;
3958 	}
3959 
3960 	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3961 	if (err)
3962 		goto err;
3963 
3964 	engine->wa_ctx.vma = vma;
3965 	return 0;
3966 
3967 err:
3968 	i915_gem_object_put(obj);
3969 	return err;
3970 }
3971 
3972 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3973 {
3974 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3975 }
3976 
3977 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3978 
3979 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3980 {
3981 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3982 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3983 					    &wa_ctx->per_ctx };
3984 	wa_bb_func_t wa_bb_fn[2];
3985 	void *batch, *batch_ptr;
3986 	unsigned int i;
3987 	int ret;
3988 
3989 	if (engine->class != RENDER_CLASS)
3990 		return 0;
3991 
3992 	switch (INTEL_GEN(engine->i915)) {
3993 	case 12:
3994 	case 11:
3995 		return 0;
3996 	case 10:
3997 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3998 		wa_bb_fn[1] = NULL;
3999 		break;
4000 	case 9:
4001 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
4002 		wa_bb_fn[1] = NULL;
4003 		break;
4004 	case 8:
4005 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
4006 		wa_bb_fn[1] = NULL;
4007 		break;
4008 	default:
4009 		MISSING_CASE(INTEL_GEN(engine->i915));
4010 		return 0;
4011 	}
4012 
4013 	ret = lrc_setup_wa_ctx(engine);
4014 	if (ret) {
4015 		drm_dbg(&engine->i915->drm,
4016 			"Failed to setup context WA page: %d\n", ret);
4017 		return ret;
4018 	}
4019 
4020 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		lrc_destroy_wa_ctx(engine);
		return PTR_ERR(batch);
	}
4021 
4022 	/*
4023 	 * Emit the two workaround batch buffers, recording the offset from the
4024 	 * start of the workaround batch buffer object for each and their
4025 	 * respective sizes.
4026 	 */
4027 	batch_ptr = batch;
4028 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
4029 		wa_bb[i]->offset = batch_ptr - batch;
4030 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
4031 						  CACHELINE_BYTES))) {
4032 			ret = -EINVAL;
4033 			break;
4034 		}
4035 		if (wa_bb_fn[i])
4036 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4037 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4038 	}
4039 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4040 
4041 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4042 	__i915_gem_object_release_map(wa_ctx->vma->obj);
4043 	if (ret)
4044 		lrc_destroy_wa_ctx(engine);
4045 
4046 	return ret;
4047 }
4048 
4049 static void reset_csb_pointers(struct intel_engine_cs *engine)
4050 {
4051 	struct intel_engine_execlists * const execlists = &engine->execlists;
4052 	const unsigned int reset_value = execlists->csb_size - 1;
4053 
4054 	ring_set_paused(engine, 0);
4055 
4056 	/*
4057 	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4058 	 * Bludgeon them with a mmio update to be sure.
4059 	 * Bludgeon them with an mmio update to be sure.
4060 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4061 		     0xffff << 16 | reset_value << 8 | reset_value);
4062 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4063 
4064 	/*
4065 	 * After a reset, the HW starts writing into CSB entry [0]. We
4066 	 * therefore have to set our HEAD pointer back one entry so that
4067 	 * the *first* entry we check is entry 0. To complicate this further,
4068 	 * as we don't wait for the first interrupt after reset, we have to
4069 	 * fake the HW write to point back to the last entry so that our
4070 	 * inline comparison of our cached head position against the last HW
4071 	 * write works even before the first interrupt.
4072 	 */
4073 	execlists->csb_head = reset_value;
4074 	WRITE_ONCE(*execlists->csb_write, reset_value);
4075 	wmb(); /* Make sure this is visible to HW (paranoia?) */
4076 
4077 	/* Check that the GPU does indeed update the CSB entries! */
4078 	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4079 	invalidate_csb_entries(&execlists->csb_status[0],
4080 			       &execlists->csb_status[reset_value]);
4081 
4082 	/* Once more for luck and our trusty paranoia */
4083 	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4084 		     0xffff << 16 | reset_value << 8 | reset_value);
4085 	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4086 
4087 	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4088 }
4089 
4090 static void execlists_sanitize(struct intel_engine_cs *engine)
4091 {
4092 	GEM_BUG_ON(execlists_active(&engine->execlists));
4093 
4094 	/*
4095 	 * Poison residual state on resume, in case the suspend didn't!
4096 	 *
4097 	 * We have to assume that across suspend/resume (or other loss
4098 	 * of control) that the contents of our pinned buffers have been
4099 	 * lost, replaced by garbage. Since this doesn't always happen,
4100 	 * let's poison such state so that we more quickly spot when
4101 	 * we falsely assume it has been preserved.
4102 	 */
4103 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4104 		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4105 
4106 	reset_csb_pointers(engine);
4107 
4108 	/*
4109 	 * The kernel_context HWSP is stored in the status_page. As above,
4110 	 * that may be lost on resume/initialisation, and so we need to
4111 	 * reset the value in the HWSP.
4112 	 */
4113 	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4114 
4115 	/* And scrub the dirty cachelines for the HWSP */
4116 	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4117 }
4118 
4119 static void enable_error_interrupt(struct intel_engine_cs *engine)
4120 {
4121 	u32 status;
4122 
4123 	engine->execlists.error_interrupt = 0;
4124 	ENGINE_WRITE(engine, RING_EMR, ~0u);
4125 	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4126 
4127 	status = ENGINE_READ(engine, RING_ESR);
4128 	if (unlikely(status)) {
4129 		drm_err(&engine->i915->drm,
4130 			"engine '%s' resumed still in error: %08x\n",
4131 			engine->name, status);
4132 		__intel_gt_reset(engine->gt, engine->mask);
4133 	}
4134 
4135 	/*
4136 	 * On current gen8+, we have 2 signals to play with
4137 	 *
4138 	 * - I915_ERROR_INSTRUCTION (bit 0)
4139 	 *
4140 	 *    Generate an error if the command parser encounters an invalid
4141 	 *    instruction
4142 	 *
4143 	 *    This is a fatal error.
4144 	 *
4145 	 * - CP_PRIV (bit 2)
4146 	 *
4147 	 *    Generate an error on privilege violation (where the CP replaces
4148 	 *    the instruction with a no-op). This also fires for writes into
4149 	 *    read-only scratch pages.
4150 	 *
4151 	 *    This is a non-fatal error, parsing continues.
4152 	 *
4153 	 * - There are a few others defined for odd HW that we do not use
4154 	 *
4155 	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4156 	 * error (as the HW is validating and suppressing the mistakes), we
4157 	 * only unmask the instruction error bit.
4158 	 */
4159 	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4160 }
4161 
4162 static void enable_execlists(struct intel_engine_cs *engine)
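/* Take the engine out of legacy ringbuffer mode and enable execlists submission */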
4163 {
4164 	u32 mode;
4165 
4166 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4167 
4168 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4169 
4170 	if (INTEL_GEN(engine->i915) >= 11)
4171 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4172 	else
4173 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4174 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4175 
4176 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4177 
4178 	ENGINE_WRITE_FW(engine,
4179 			RING_HWS_PGA,
4180 			i915_ggtt_offset(engine->status_page.vma));
4181 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4182 
4183 	enable_error_interrupt(engine);
4184 
4185 	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4186 }
4187 
4188 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4189 {
4190 	bool unexpected = false;
4191 
4192 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4193 		drm_dbg(&engine->i915->drm,
4194 			"STOP_RING still set in RING_MI_MODE\n");
4195 		unexpected = true;
4196 	}
4197 
4198 	return unexpected;
4199 }
4200 
4201 static int execlists_resume(struct intel_engine_cs *engine)
4202 {
4203 	intel_mocs_init_engine(engine);
4204 
4205 	intel_breadcrumbs_reset(engine->breadcrumbs);
4206 
4207 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4208 		struct drm_printer p = drm_debug_printer(__func__);
4209 
4210 		intel_engine_dump(engine, &p, NULL);
4211 	}
4212 
4213 	enable_execlists(engine);
4214 
4215 	return 0;
4216 }
4217 
4218 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4219 {
4220 	struct intel_engine_execlists * const execlists = &engine->execlists;
4221 	unsigned long flags;
4222 
4223 	ENGINE_TRACE(engine, "depth<-%d\n",
4224 		     atomic_read(&execlists->tasklet.count));
4225 
4226 	/*
4227 	 * Prevent request submission to the hardware until we have
4228 	 * completed the reset in i915_gem_reset_finish(). If a request
4229 	 * is completed by one engine, it may then queue a request
4230 	 * to a second via its execlists->tasklet *just* as we are
4231 	 * calling engine->resume() and also writing the ELSP.
4232 	 * Turning off the execlists->tasklet until the reset is over
4233 	 * prevents the race.
4234 	 */
4235 	__tasklet_disable_sync_once(&execlists->tasklet);
4236 	GEM_BUG_ON(!reset_in_progress(execlists));
4237 
4238 	/* And flush any current direct submission. */
4239 	spin_lock_irqsave(&engine->active.lock, flags);
4240 	spin_unlock_irqrestore(&engine->active.lock, flags);
4241 
4242 	/*
4243 	 * We stop the engines, otherwise we might get a failed reset and a
4244 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer a
4245 	 * system hang if a batchbuffer is progressing when the reset is
4246 	 * issued, regardless of the READY_TO_RESET ack. Thus we assume it
4247 	 * is best to stop the engines on all gens where we have a gpu
4248 	 * reset.
4249 	 *
4250 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4251 	 *
4252 	 * FIXME: Wa for more modern gens needs to be validated
4253 	 */
4254 	ring_set_paused(engine, 1);
4255 	intel_engine_stop_cs(engine);
4256 
4257 	engine->execlists.reset_ccid = active_ccid(engine);
4258 }
4259 
4260 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
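/*
 * Clear STOP_RING in the context image copy of RING_MI_MODE, using the
 * masked-write format (mask in the high 16 bits, value in the low 16).
 */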
4261 {
4262 	int x;
4263 
4264 	x = lrc_ring_mi_mode(engine);
4265 	if (x != -1) {
4266 		regs[x + 1] &= ~STOP_RING;
4267 		regs[x + 1] |= STOP_RING << 16;
4268 	}
4269 }
4270 
4271 static void __execlists_reset_reg_state(const struct intel_context *ce,
4272 					const struct intel_engine_cs *engine)
4273 {
4274 	u32 *regs = ce->lrc_reg_state;
4275 
4276 	__reset_stop_ring(regs, engine);
4277 }
4278 
4279 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4280 {
4281 	struct intel_engine_execlists * const execlists = &engine->execlists;
4282 	struct intel_context *ce;
4283 	struct i915_request *rq;
4284 	u32 head;
4285 
4286 	mb(); /* paranoia: read the CSB pointers from after the reset */
4287 	clflush(execlists->csb_write);
4288 	mb();
4289 
4290 	process_csb(engine); /* drain preemption events */
4291 
4292 	/* Following the reset, we need to reload the CSB read/write pointers */
4293 	reset_csb_pointers(engine);
4294 
4295 	/*
4296 	 * Save the currently executing context, even if we completed
4297 	 * its request, it was still running at the time of the
4298 	 * reset and will have been clobbered.
4299 	 */
4300 	rq = active_context(engine, engine->execlists.reset_ccid);
4301 	if (!rq)
4302 		goto unwind;
4303 
4304 	ce = rq->context;
4305 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4306 
4307 	if (i915_request_completed(rq)) {
4308 		/* Idle context; tidy up the ring so we can restart afresh */
4309 		head = intel_ring_wrap(ce->ring, rq->tail);
4310 		goto out_replay;
4311 	}
4312 
4313 	/* We still have requests in-flight; the engine should be active */
4314 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4315 
4316 	/* Context has requests still in-flight; it should not be idle! */
4317 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4318 
4319 	rq = active_request(ce->timeline, rq);
4320 	head = intel_ring_wrap(ce->ring, rq->head);
4321 	GEM_BUG_ON(head == ce->ring->tail);
4322 
4323 	/*
4324 	 * If this request hasn't started yet, e.g. it is waiting on a
4325 	 * semaphore, we need to avoid skipping the request or else we
4326 	 * break the signaling chain. However, if the context is corrupt
4327 	 * the request will not restart and we will be stuck with a wedged
4328 	 * device. It is quite often the case that if we issue a reset
4329 	 * while the GPU is loading the context image, that the context
4330 	 * image becomes corrupt.
4331 	 *
4332 	 * Otherwise, if we have not started yet, the request should replay
4333 	 * perfectly and we do not need to flag the result as being erroneous.
4334 	 */
4335 	if (!i915_request_started(rq))
4336 		goto out_replay;
4337 
4338 	/*
4339 	 * If the request was innocent, we leave the request in the ELSP
4340 	 * and will try to replay it on restarting. The context image may
4341 	 * have been corrupted by the reset, in which case we may have
4342 	 * to service a new GPU hang, but more likely we can continue on
4343 	 * without impact.
4344 	 *
4345 	 * If the request was guilty, we presume the context is corrupt
4346 	 * and have to at least restore the RING register in the context
4347 	 * image back to the expected values to skip over the guilty request.
4348 	 */
4349 	__i915_request_reset(rq, stalled);
4350 
4351 	/*
4352 	 * We want a simple context + ring to execute the breadcrumb update.
4353 	 * We cannot rely on the context being intact across the GPU hang,
4354 	 * so clear it and rebuild just what we need for the breadcrumb.
4355 	 * All pending requests for this context will be zapped, and any
4356 	 * future request will be after userspace has had the opportunity
4357 	 * to recreate its own state.
4358 	 */
4359 out_replay:
4360 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4361 		     head, ce->ring->tail);
4362 	__execlists_reset_reg_state(ce, engine);
4363 	__execlists_update_reg_state(ce, engine, head);
4364 	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4365 
4366 unwind:
4367 	/* Push back any incomplete requests for replay after the reset. */
4368 	cancel_port_requests(execlists);
4369 	__unwind_incomplete_requests(engine);
4370 }
4371 
4372 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4373 {
4374 	unsigned long flags;
4375 
4376 	ENGINE_TRACE(engine, "\n");
4377 
4378 	spin_lock_irqsave(&engine->active.lock, flags);
4379 
4380 	__execlists_reset(engine, stalled);
4381 
4382 	spin_unlock_irqrestore(&engine->active.lock, flags);
4383 }
4384 
4385 static void nop_submission_tasklet(unsigned long data)
4386 {
4387 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4388 
4389 	/* The driver is wedged; don't process any more events. */
4390 	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4391 }
4392 
4393 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4394 {
4395 	struct intel_engine_execlists * const execlists = &engine->execlists;
4396 	struct i915_request *rq, *rn;
4397 	struct rb_node *rb;
4398 	unsigned long flags;
4399 
4400 	ENGINE_TRACE(engine, "\n");
4401 
4402 	/*
4403 	 * Before we call engine->cancel_requests(), we should have exclusive
4404 	 * access to the submission state. This is arranged for us by the
4405 	 * caller disabling the interrupt generation, the tasklet and other
4406 	 * threads that may then access the same state, giving us a free hand
4407 	 * to reset state. However, we still need to let lockdep be aware that
4408 	 * we know this state may be accessed in hardirq context, so we
4409 	 * disable the irq around this manipulation and we want to keep
4410 	 * the spinlock focused on its duties and not accidentally conflate
4411 	 * coverage to the submission's irq state. (Similarly, although we
4412 	 * shouldn't need to disable irq around the manipulation of the
4413 	 * submission's irq state, we also wish to remind ourselves that
4414 	 * it is irq state.)
4415 	 */
4416 	spin_lock_irqsave(&engine->active.lock, flags);
4417 
4418 	__execlists_reset(engine, true);
4419 
4420 	/* Mark all executing requests as skipped. */
4421 	list_for_each_entry(rq, &engine->active.requests, sched.link)
4422 		mark_eio(rq);
4423 	intel_engine_signal_breadcrumbs(engine);
4424 
4425 	/* Flush the queued requests to the timeline list (for retiring). */
4426 	while ((rb = rb_first_cached(&execlists->queue))) {
4427 		struct i915_priolist *p = to_priolist(rb);
4428 		int i;
4429 
4430 		priolist_for_each_request_consume(rq, rn, p, i) {
4431 			mark_eio(rq);
4432 			__i915_request_submit(rq);
4433 		}
4434 
4435 		rb_erase_cached(&p->node, &execlists->queue);
4436 		i915_priolist_free(p);
4437 	}
4438 
4439 	/* On-hold requests will be flushed to timeline upon their release */
4440 	list_for_each_entry(rq, &engine->active.hold, sched.link)
4441 		mark_eio(rq);
4442 
4443 	/* Cancel all attached virtual engines */
4444 	while ((rb = rb_first_cached(&execlists->virtual))) {
4445 		struct virtual_engine *ve =
4446 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4447 
4448 		rb_erase_cached(rb, &execlists->virtual);
4449 		RB_CLEAR_NODE(rb);
4450 
4451 		spin_lock(&ve->base.active.lock);
4452 		rq = fetch_and_zero(&ve->request);
4453 		if (rq) {
4454 			mark_eio(rq);
4455 
4456 			rq->engine = engine;
4457 			__i915_request_submit(rq);
4458 			i915_request_put(rq);
4459 
4460 			ve->base.execlists.queue_priority_hint = INT_MIN;
4461 		}
4462 		spin_unlock(&ve->base.active.lock);
4463 	}
4464 
4465 	/* Remaining _unready_ requests will be nop'ed when submitted */
4466 
4467 	execlists->queue_priority_hint = INT_MIN;
4468 	execlists->queue = RB_ROOT_CACHED;
4469 
4470 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4471 	execlists->tasklet.func = nop_submission_tasklet;
4472 
4473 	spin_unlock_irqrestore(&engine->active.lock, flags);
4474 }
4475 
4476 static void execlists_reset_finish(struct intel_engine_cs *engine)
4477 {
4478 	struct intel_engine_execlists * const execlists = &engine->execlists;
4479 
4480 	/*
4481 	 * After a GPU reset, we may have requests to replay. Do so now while
4482 	 * we still have the forcewake to be sure that the GPU is not allowed
4483 	 * to sleep before we restart and reload a context.
4484 	 */
4485 	GEM_BUG_ON(!reset_in_progress(execlists));
4486 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4487 		execlists->tasklet.func(execlists->tasklet.data);
4488 
4489 	if (__tasklet_enable(&execlists->tasklet))
4490 		/* And kick in case we missed a new request submission. */
4491 		tasklet_hi_schedule(&execlists->tasklet);
4492 	ENGINE_TRACE(engine, "depth->%d\n",
4493 		     atomic_read(&execlists->tasklet.count));
4494 }
4495 
4496 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4497 				    u64 offset, u32 len,
4498 				    const unsigned int flags)
4499 {
4500 	u32 *cs;
4501 
4502 	cs = intel_ring_begin(rq, 4);
4503 	if (IS_ERR(cs))
4504 		return PTR_ERR(cs);
4505 
4506 	/*
4507 	 * WaDisableCtxRestoreArbitration:bdw,chv
4508 	 *
4509 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4510 	 * particular all the gen that do not need the w/a at all!), if we
4511 	 * took care to make sure that on every switch into this context
4512 	 * (both ordinary and for preemption) arbitration was enabled
4513 	 * we would be fine.  However, for gen8 there is another w/a that
4514 	 * requires us to not preempt inside GPGPU execution, so we keep
4515 	 * arbitration disabled for gen8 batches. Arbitration will be
4516 	 * re-enabled before we close the request
4517 	 * (engine->emit_fini_breadcrumb).
4518 	 */
4519 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4520 
4521 	/* FIXME(BDW+): Address space and security selectors. */
4522 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4523 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4524 	*cs++ = lower_32_bits(offset);
4525 	*cs++ = upper_32_bits(offset);
4526 
4527 	intel_ring_advance(rq, cs);
4528 
4529 	return 0;
4530 }
4531 
4532 static int gen8_emit_bb_start(struct i915_request *rq,
4533 			      u64 offset, u32 len,
4534 			      const unsigned int flags)
4535 {
4536 	u32 *cs;
4537 
4538 	cs = intel_ring_begin(rq, 6);
4539 	if (IS_ERR(cs))
4540 		return PTR_ERR(cs);
4541 
4542 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4543 
4544 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4545 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4546 	*cs++ = lower_32_bits(offset);
4547 	*cs++ = upper_32_bits(offset);
4548 
4549 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4550 	*cs++ = MI_NOOP;
4551 
4552 	intel_ring_advance(rq, cs);
4553 
4554 	return 0;
4555 }
4556 
4557 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4558 {
4559 	ENGINE_WRITE(engine, RING_IMR,
4560 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4561 	ENGINE_POSTING_READ(engine, RING_IMR);
4562 }
4563 
4564 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4565 {
4566 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4567 }
4568 
4569 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4570 {
4571 	u32 cmd, *cs;
4572 
4573 	cs = intel_ring_begin(request, 4);
4574 	if (IS_ERR(cs))
4575 		return PTR_ERR(cs);
4576 
4577 	cmd = MI_FLUSH_DW + 1;
4578 
4579 	/* We always require a command barrier so that subsequent
4580 	 * commands, such as breadcrumb interrupts, are strictly ordered
4581 	 * wrt the contents of the write cache being flushed to memory
4582 	 * (and thus being coherent from the CPU).
4583 	 */
4584 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4585 
4586 	if (mode & EMIT_INVALIDATE) {
4587 		cmd |= MI_INVALIDATE_TLB;
4588 		if (request->engine->class == VIDEO_DECODE_CLASS)
4589 			cmd |= MI_INVALIDATE_BSD;
4590 	}
4591 
4592 	*cs++ = cmd;
4593 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4594 	*cs++ = 0; /* upper addr */
4595 	*cs++ = 0; /* value */
4596 	intel_ring_advance(request, cs);
4597 
4598 	return 0;
4599 }
4600 
4601 static int gen8_emit_flush_render(struct i915_request *request,
4602 				  u32 mode)
4603 {
4604 	bool vf_flush_wa = false, dc_flush_wa = false;
4605 	u32 *cs, flags = 0;
4606 	int len;
4607 
4608 	flags |= PIPE_CONTROL_CS_STALL;
4609 
4610 	if (mode & EMIT_FLUSH) {
4611 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4612 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4613 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4614 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4615 	}
4616 
4617 	if (mode & EMIT_INVALIDATE) {
4618 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4619 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4620 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4621 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4622 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4623 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4624 		flags |= PIPE_CONTROL_QW_WRITE;
4625 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4626 
4627 		/*
4628 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4629 		 * pipe control.
4630 		 */
4631 		if (IS_GEN(request->engine->i915, 9))
4632 			vf_flush_wa = true;
4633 
4634 		/* WaForGAMHang:kbl */
4635 		if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4636 			dc_flush_wa = true;
4637 	}
4638 
4639 	len = 6;
4640 
4641 	if (vf_flush_wa)
4642 		len += 6;
4643 
4644 	if (dc_flush_wa)
4645 		len += 12;
4646 
4647 	cs = intel_ring_begin(request, len);
4648 	if (IS_ERR(cs))
4649 		return PTR_ERR(cs);
4650 
4651 	if (vf_flush_wa)
4652 		cs = gen8_emit_pipe_control(cs, 0, 0);
4653 
4654 	if (dc_flush_wa)
4655 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4656 					    0);
4657 
4658 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4659 
4660 	if (dc_flush_wa)
4661 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4662 
4663 	intel_ring_advance(request, cs);
4664 
4665 	return 0;
4666 }
4667 
4668 static int gen11_emit_flush_render(struct i915_request *request,
4669 				   u32 mode)
4670 {
4671 	if (mode & EMIT_FLUSH) {
4672 		u32 *cs;
4673 		u32 flags = 0;
4674 
4675 		flags |= PIPE_CONTROL_CS_STALL;
4676 
4677 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4678 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4679 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4680 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4681 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4682 		flags |= PIPE_CONTROL_QW_WRITE;
4683 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4684 
4685 		cs = intel_ring_begin(request, 6);
4686 		if (IS_ERR(cs))
4687 			return PTR_ERR(cs);
4688 
4689 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4690 		intel_ring_advance(request, cs);
4691 	}
4692 
4693 	if (mode & EMIT_INVALIDATE) {
4694 		u32 *cs;
4695 		u32 flags = 0;
4696 
4697 		flags |= PIPE_CONTROL_CS_STALL;
4698 
4699 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4700 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4701 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4702 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4703 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4704 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4705 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4706 		flags |= PIPE_CONTROL_QW_WRITE;
4707 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4708 
4709 		cs = intel_ring_begin(request, 6);
4710 		if (IS_ERR(cs))
4711 			return PTR_ERR(cs);
4712 
4713 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4714 		intel_ring_advance(request, cs);
4715 	}
4716 
4717 	return 0;
4718 }
4719 
4720 static u32 preparser_disable(bool state)
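/*
 * Toggle the command streamer's pre-fetching of future commands (gen12+);
 * used to keep the pre-parser from racing ahead of TLB/AUX invalidations.
 */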
4721 {
4722 	return MI_ARB_CHECK | 1 << 8 | state;
4723 }
4724 
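/* Select the AUX table invalidation register for a video decode/enhance engine */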
4725 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4726 {
4727 	static const i915_reg_t vd[] = {
4728 		GEN12_VD0_AUX_NV,
4729 		GEN12_VD1_AUX_NV,
4730 		GEN12_VD2_AUX_NV,
4731 		GEN12_VD3_AUX_NV,
4732 	};
4733 
4734 	static const i915_reg_t ve[] = {
4735 		GEN12_VE0_AUX_NV,
4736 		GEN12_VE1_AUX_NV,
4737 	};
4738 
4739 	if (engine->class == VIDEO_DECODE_CLASS)
4740 		return vd[engine->instance];
4741 
4742 	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4743 		return ve[engine->instance];
4744 
4745 	GEM_BUG_ON("unknown aux_inv_reg\n");
4746 
4747 	return INVALID_MMIO_REG;
4748 }
4749 
4750 static u32 *
4751 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4752 {
4753 	*cs++ = MI_LOAD_REGISTER_IMM(1);
4754 	*cs++ = i915_mmio_reg_offset(inv_reg);
4755 	*cs++ = AUX_INV;
4756 	*cs++ = MI_NOOP;
4757 
4758 	return cs;
4759 }
4760 
4761 static int gen12_emit_flush_render(struct i915_request *request,
4762 				   u32 mode)
4763 {
4764 	if (mode & EMIT_FLUSH) {
4765 		u32 flags = 0;
4766 		u32 *cs;
4767 
4768 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4769 		flags |= PIPE_CONTROL_FLUSH_L3;
4770 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4771 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4772 		/* Wa_1409600907:tgl */
4773 		flags |= PIPE_CONTROL_DEPTH_STALL;
4774 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4775 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4776 
4777 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4778 		flags |= PIPE_CONTROL_QW_WRITE;
4779 
4780 		flags |= PIPE_CONTROL_CS_STALL;
4781 
4782 		cs = intel_ring_begin(request, 6);
4783 		if (IS_ERR(cs))
4784 			return PTR_ERR(cs);
4785 
4786 		cs = gen12_emit_pipe_control(cs,
4787 					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4788 					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4789 		intel_ring_advance(request, cs);
4790 	}
4791 
4792 	if (mode & EMIT_INVALIDATE) {
4793 		u32 flags = 0;
4794 		u32 *cs;
4795 
4796 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4797 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4798 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4799 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4800 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4801 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4802 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4803 
4804 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4805 		flags |= PIPE_CONTROL_QW_WRITE;
4806 
4807 		flags |= PIPE_CONTROL_CS_STALL;
4808 
4809 		cs = intel_ring_begin(request, 8 + 4);
4810 		if (IS_ERR(cs))
4811 			return PTR_ERR(cs);
4812 
4813 		/*
4814 		 * Prevent the pre-parser from skipping past the TLB
4815 		 * invalidate and loading a stale page for the batch
4816 		 * buffer / request payload.
4817 		 */
4818 		*cs++ = preparser_disable(true);
4819 
4820 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4821 
4822 		/* hsdes: 1809175790 */
4823 		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4824 
4825 		*cs++ = preparser_disable(false);
4826 		intel_ring_advance(request, cs);
4827 	}
4828 
4829 	return 0;
4830 }
4831 
4832 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4833 {
4834 	intel_engine_mask_t aux_inv = 0;
4835 	u32 cmd, *cs;
4836 
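	/*
	 * cmd is first used as the dword count for intel_ring_begin() and is
	 * then reused below as the MI_FLUSH_DW command word.
	 */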
4837 	cmd = 4;
4838 	if (mode & EMIT_INVALIDATE)
4839 		cmd += 2;
4840 	if (mode & EMIT_INVALIDATE)
4841 		aux_inv = request->engine->mask & ~BIT(BCS0);
4842 	if (aux_inv)
4843 		cmd += 2 * hweight8(aux_inv) + 2;
4844 
4845 	cs = intel_ring_begin(request, cmd);
4846 	if (IS_ERR(cs))
4847 		return PTR_ERR(cs);
4848 
4849 	if (mode & EMIT_INVALIDATE)
4850 		*cs++ = preparser_disable(true);
4851 
4852 	cmd = MI_FLUSH_DW + 1;
4853 
4854 	/* We always require a command barrier so that subsequent
4855 	 * commands, such as breadcrumb interrupts, are strictly ordered
4856 	 * wrt the contents of the write cache being flushed to memory
4857 	 * (and thus being coherent from the CPU).
4858 	 */
4859 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4860 
4861 	if (mode & EMIT_INVALIDATE) {
4862 		cmd |= MI_INVALIDATE_TLB;
4863 		if (request->engine->class == VIDEO_DECODE_CLASS)
4864 			cmd |= MI_INVALIDATE_BSD;
4865 	}
4866 
4867 	*cs++ = cmd;
4868 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4869 	*cs++ = 0; /* upper addr */
4870 	*cs++ = 0; /* value */
4871 
4872 	if (aux_inv) { /* hsdes: 1809175790 */
4873 		struct intel_engine_cs *engine;
4874 		unsigned int tmp;
4875 
4876 		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4877 		for_each_engine_masked(engine, request->engine->gt,
4878 				       aux_inv, tmp) {
4879 			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4880 			*cs++ = AUX_INV;
4881 		}
4882 		*cs++ = MI_NOOP;
4883 	}
4884 
4885 	if (mode & EMIT_INVALIDATE)
4886 		*cs++ = preparser_disable(false);
4887 
4888 	intel_ring_advance(request, cs);
4889 
4890 	return 0;
4891 }
4892 
4893 static void assert_request_valid(struct i915_request *rq)
4894 {
4895 	struct intel_ring *ring __maybe_unused = rq->ring;
4896 
4897 	/* Can we unwind this request without appearing to go forwards? */
4898 	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4899 }
4900 
4901 /*
4902  * Reserve space for 2 NOOPs at the end of each request to be
4903  * used as a workaround for not being allowed to do lite
4904  * restore with HEAD==TAIL (WaIdleLiteRestore).
4905  */
4906 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4907 {
4908 	/* Ensure there's always at least one preemption point per-request. */
4909 	*cs++ = MI_ARB_CHECK;
4910 	*cs++ = MI_NOOP;
4911 	request->wa_tail = intel_ring_offset(request, cs);
4912 
4913 	/* Check that entire request is less than half the ring */
4914 	assert_request_valid(request);
4915 
4916 	return cs;
4917 }
4918 
4919 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
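/*
 * Poll the preempt semaphore in the HWSP until it reads zero; while
 * ring_set_paused() holds it non-zero the CS spins here at the end of
 * the request (the preempt-to-busy hold point).
 */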
4920 {
4921 	*cs++ = MI_SEMAPHORE_WAIT |
4922 		MI_SEMAPHORE_GLOBAL_GTT |
4923 		MI_SEMAPHORE_POLL |
4924 		MI_SEMAPHORE_SAD_EQ_SDD;
4925 	*cs++ = 0;
4926 	*cs++ = intel_hws_preempt_address(request->engine);
4927 	*cs++ = 0;
4928 
4929 	return cs;
4930 }
4931 
4932 static __always_inline u32 *
4933 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4934 {
4935 	*cs++ = MI_USER_INTERRUPT;
4936 
4937 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4938 	if (intel_engine_has_semaphores(request->engine))
4939 		cs = emit_preempt_busywait(request, cs);
4940 
4941 	request->tail = intel_ring_offset(request, cs);
4942 	assert_ring_tail_valid(request->ring, request->tail);
4943 
4944 	return gen8_emit_wa_tail(request, cs);
4945 }
4946 
4947 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4948 {
4949 	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4950 }
4951 
4952 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4953 {
4954 	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4955 }
4956 
4957 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4958 {
4959 	cs = gen8_emit_pipe_control(cs,
4960 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4961 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4962 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4963 				    0);
4964 
4965 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4966 	cs = gen8_emit_ggtt_write_rcs(cs,
4967 				      request->fence.seqno,
4968 				      hwsp_offset(request),
4969 				      PIPE_CONTROL_FLUSH_ENABLE |
4970 				      PIPE_CONTROL_CS_STALL);
4971 
4972 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4973 }
4974 
4975 static u32 *
4976 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4977 {
4978 	cs = gen8_emit_ggtt_write_rcs(cs,
4979 				      request->fence.seqno,
4980 				      hwsp_offset(request),
4981 				      PIPE_CONTROL_CS_STALL |
4982 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4983 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4984 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4985 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4986 				      PIPE_CONTROL_FLUSH_ENABLE);
4987 
4988 	return gen8_emit_fini_breadcrumb_tail(request, cs);
4989 }
4990 
4991 /*
4992  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4993  * flush and will continue pre-fetching the instructions after it before the
4994  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4995  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4996  * of the next request before the memory has been flushed, we're guaranteed that
4997  * we won't access the batch itself too early.
4998  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4999  * so, if the current request is modifying an instruction in the next request on
5000  * the same intel_context, we might pre-fetch and then execute the pre-update
5001  * instruction. To avoid this, the users of self-modifying code should either
5002  * disable the parser around the code emitting the memory writes, via a new flag
5003  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
5004  * the in-kernel use-cases we've opted to use a separate context, see
5005  * reloc_gpu() as an example.
5006  * All the above applies only to the instructions themselves. Non-inline data
5007  * used by the instructions is not pre-fetched.
5008  */
5009 
5010 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
5011 {
5012 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
5013 		MI_SEMAPHORE_GLOBAL_GTT |
5014 		MI_SEMAPHORE_POLL |
5015 		MI_SEMAPHORE_SAD_EQ_SDD;
5016 	*cs++ = 0;
5017 	*cs++ = intel_hws_preempt_address(request->engine);
5018 	*cs++ = 0;
5019 	*cs++ = 0;
5020 	*cs++ = MI_NOOP;
5021 
5022 	return cs;
5023 }
5024 
5025 static __always_inline u32 *
5026 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
5027 {
5028 	*cs++ = MI_USER_INTERRUPT;
5029 
5030 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
5031 	if (intel_engine_has_semaphores(request->engine))
5032 		cs = gen12_emit_preempt_busywait(request, cs);
5033 
5034 	request->tail = intel_ring_offset(request, cs);
5035 	assert_ring_tail_valid(request->ring, request->tail);
5036 
5037 	return gen8_emit_wa_tail(request, cs);
5038 }
5039 
5040 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5041 {
5042 	/* XXX Stalling flush before seqno write; post-sync not */
5043 	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5044 	return gen12_emit_fini_breadcrumb_tail(rq, cs);
5045 }
5046 
5047 static u32 *
5048 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5049 {
5050 	cs = gen12_emit_ggtt_write_rcs(cs,
5051 				       request->fence.seqno,
5052 				       hwsp_offset(request),
5053 				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5054 				       PIPE_CONTROL_CS_STALL |
5055 				       PIPE_CONTROL_TILE_CACHE_FLUSH |
5056 				       PIPE_CONTROL_FLUSH_L3 |
5057 				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5058 				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5059 				       /* Wa_1409600907:tgl */
5060 				       PIPE_CONTROL_DEPTH_STALL |
5061 				       PIPE_CONTROL_DC_FLUSH_ENABLE |
5062 				       PIPE_CONTROL_FLUSH_ENABLE);
5063 
5064 	return gen12_emit_fini_breadcrumb_tail(request, cs);
5065 }
5066 
5067 static void execlists_park(struct intel_engine_cs *engine)
5068 {
5069 	cancel_timer(&engine->execlists.timer);
5070 	cancel_timer(&engine->execlists.preempt);
5071 }
5072 
5073 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5074 {
5075 	engine->submit_request = execlists_submit_request;
5076 	engine->schedule = i915_schedule;
5077 	engine->execlists.tasklet.func = execlists_submission_tasklet;
5078 
5079 	engine->reset.prepare = execlists_reset_prepare;
5080 	engine->reset.rewind = execlists_reset_rewind;
5081 	engine->reset.cancel = execlists_reset_cancel;
5082 	engine->reset.finish = execlists_reset_finish;
5083 
5084 	engine->park = execlists_park;
5085 	engine->unpark = NULL;
5086 
5087 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5088 	if (!intel_vgpu_active(engine->i915)) {
5089 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5090 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5091 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5092 			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5093 				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5094 		}
5095 	}
5096 
5097 	if (INTEL_GEN(engine->i915) >= 12)
5098 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5099 
5100 	if (intel_engine_has_preemption(engine))
5101 		engine->emit_bb_start = gen8_emit_bb_start;
5102 	else
5103 		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5104 }
5105 
5106 static void execlists_shutdown(struct intel_engine_cs *engine)
5107 {
5108 	/* Synchronise with residual timers and any softirq they raise */
5109 	del_timer_sync(&engine->execlists.timer);
5110 	del_timer_sync(&engine->execlists.preempt);
5111 	tasklet_kill(&engine->execlists.tasklet);
5112 }
5113 
5114 static void execlists_release(struct intel_engine_cs *engine)
5115 {
5116 	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5117 
5118 	execlists_shutdown(engine);
5119 
5120 	intel_engine_cleanup_common(engine);
5121 	lrc_destroy_wa_ctx(engine);
5122 }
5123 
5124 static void
5125 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5126 {
5127 	/* Default vfuncs which can be overriden by each engine. */
5128 	/* Default vfuncs which can be overridden by each engine. */
5129 	engine->resume = execlists_resume;
5130 
5131 	engine->cops = &execlists_context_ops;
5132 	engine->request_alloc = execlists_request_alloc;
5133 
5134 	engine->emit_flush = gen8_emit_flush;
5135 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5136 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5137 	if (INTEL_GEN(engine->i915) >= 12) {
5138 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5139 		engine->emit_flush = gen12_emit_flush;
5140 	}
5141 	engine->set_default_submission = intel_execlists_set_default_submission;
5142 
5143 	if (INTEL_GEN(engine->i915) < 11) {
5144 		engine->irq_enable = gen8_logical_ring_enable_irq;
5145 		engine->irq_disable = gen8_logical_ring_disable_irq;
5146 	} else {
5147 		/*
5148 		 * TODO: On Gen11 interrupt masks need to be clear
5149 		 * to allow C6 entry. Keep interrupts enabled
5150 		 * and take the hit of generating extra interrupts
5151 		 * until a more refined solution exists.
5152 		 */
5153 	}
5154 }
5155 
5156 static inline void
5157 logical_ring_default_irqs(struct intel_engine_cs *engine)
5158 {
5159 	unsigned int shift = 0;
5160 
5161 	if (INTEL_GEN(engine->i915) < 11) {
5162 		const u8 irq_shifts[] = {
5163 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5164 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5165 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5166 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5167 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5168 		};
5169 
5170 		shift = irq_shifts[engine->id];
5171 	}
5172 
5173 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5174 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5175 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5176 	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5177 }
5178 
5179 static void rcs_submission_override(struct intel_engine_cs *engine)
5180 {
5181 	switch (INTEL_GEN(engine->i915)) {
5182 	case 12:
5183 		engine->emit_flush = gen12_emit_flush_render;
5184 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5185 		break;
5186 	case 11:
5187 		engine->emit_flush = gen11_emit_flush_render;
5188 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5189 		break;
5190 	default:
5191 		engine->emit_flush = gen8_emit_flush_render;
5192 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5193 		break;
5194 	}
5195 }
5196 
5197 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5198 {
5199 	struct intel_engine_execlists * const execlists = &engine->execlists;
5200 	struct drm_i915_private *i915 = engine->i915;
5201 	struct intel_uncore *uncore = engine->uncore;
5202 	u32 base = engine->mmio_base;
5203 
5204 	tasklet_init(&engine->execlists.tasklet,
5205 		     execlists_submission_tasklet, (unsigned long)engine);
5206 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5207 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5208 
5209 	logical_ring_default_vfuncs(engine);
5210 	logical_ring_default_irqs(engine);
5211 
5212 	if (engine->class == RENDER_CLASS)
5213 		rcs_submission_override(engine);
5214 
5215 	if (intel_init_workaround_bb(engine))
5216 		/*
5217 		 * We continue even if we fail to initialize the WA batch
5218 		 * buffer, as we only expect rare glitches from its absence,
5219 		 * nothing critical enough to prevent us from using the GPU.
5220 		 */
5221 		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5222 
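	/*
	 * Cache the submission registers. Parts with the ExecList Submission
	 * Queue (Gen11+) take the pending context descriptors through the SQ
	 * contents register, with a write to the control register acting as
	 * the doorbell; older parts take both descriptors directly through
	 * the ELSP port (see execlists_submit_ports()).
	 */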
5223 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5224 		execlists->submit_reg = uncore->regs +
5225 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5226 		execlists->ctrl_reg = uncore->regs +
5227 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5228 	} else {
5229 		execlists->submit_reg = uncore->regs +
5230 			i915_mmio_reg_offset(RING_ELSP(base));
5231 	}
5232 
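	/*
	 * The context status buffer (CSB) lives in the engine's HWSP: the
	 * hardware appends an event for every context switch and advances
	 * the write pointer, which process_csb() compares against its own
	 * read pointer to drain the new events.
	 */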
5233 	execlists->csb_status =
5234 		(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5235 
5236 	execlists->csb_write =
5237 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5238 
5239 	if (INTEL_GEN(i915) < 11)
5240 		execlists->csb_size = GEN8_CSB_ENTRIES;
5241 	else
5242 		execlists->csb_size = GEN11_CSB_ENTRIES;
5243 
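	/*
	 * From Gen11 onwards the hardware context ID also encodes the engine
	 * class/instance; record our engine's bits here so they can be folded
	 * into each context's CCID as it is scheduled in.
	 */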
5244 	if (INTEL_GEN(engine->i915) >= 11) {
5245 		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5246 		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5247 	}
5248 
5249 	/* Finally, take ownership and responsibility for cleanup! */
5250 	engine->sanitize = execlists_sanitize;
5251 	engine->release = execlists_release;
5252 
5253 	return 0;
5254 }
5255 
5256 static void init_common_reg_state(u32 * const regs,
5257 				  const struct intel_engine_cs *engine,
5258 				  const struct intel_ring *ring,
5259 				  bool inhibit)
5260 {
5261 	u32 ctl;
5262 
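	/*
	 * RING_CONTEXT_CONTROL is a masked register: the upper 16 bits select
	 * which of the lower bits a write actually updates, hence the
	 * _MASKED_BIT_ENABLE/_MASKED_BIT_DISABLE helpers below.
	 */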
5263 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5264 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5265 	if (inhibit)
5266 		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5267 	if (INTEL_GEN(engine->i915) < 11)
5268 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5269 					   CTX_CTRL_RS_CTX_ENABLE);
5270 	regs[CTX_CONTEXT_CONTROL] = ctl;
5271 
5272 	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5273 	regs[CTX_TIMESTAMP] = 0;
5274 }
5275 
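/*
 * Point the context image at the engine's workaround batch buffers (built by
 * intel_init_workaround_bb()). The low bit of the per-context pointer acts as
 * its valid/enable flag, hence the "| 0x01" below.
 */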
5276 static void init_wa_bb_reg_state(u32 * const regs,
5277 				 const struct intel_engine_cs *engine)
5278 {
5279 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5280 
5281 	if (wa_ctx->per_ctx.size) {
5282 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5283 
5284 		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5285 		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5286 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5287 	}
5288 
5289 	if (wa_ctx->indirect_ctx.size) {
5290 		lrc_ring_setup_indirect_ctx(regs, engine,
5291 					    i915_ggtt_offset(wa_ctx->vma) +
5292 					    wa_ctx->indirect_ctx.offset,
5293 					    wa_ctx->indirect_ctx.size);
5294 	}
5295 }
5296 
5297 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5298 {
5299 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5300 		/* 64b PPGTT (48bit canonical):
5301 		 * PDP0_DESCRIPTOR contains the base address of the PML4;
5302 		 * the other PDP descriptors are ignored.
5303 		 */
5304 		ASSIGN_CTX_PML4(ppgtt, regs);
5305 	} else {
5306 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5307 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5308 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5309 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5310 	}
5311 }
5312 
5313 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5314 {
5315 	if (i915_is_ggtt(vm))
5316 		return i915_vm_to_ggtt(vm)->alias;
5317 	else
5318 		return i915_vm_to_ppgtt(vm);
5319 }
5320 
5321 static void execlists_init_reg_state(u32 *regs,
5322 				     const struct intel_context *ce,
5323 				     const struct intel_engine_cs *engine,
5324 				     const struct intel_ring *ring,
5325 				     bool inhibit)
5326 {
5327 	/*
5328 	 * A context is actually a big batch buffer with several
5329 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5330 	 * values we are setting here are only for the first context restore:
5331 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5332 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5333 	 * we are not initializing here).
5334 	 *
5335 	 * This must be kept consistent with virtual_update_register_offsets().
5336 	 */
5337 	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5338 
5339 	init_common_reg_state(regs, engine, ring, inhibit);
5340 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5341 
5342 	init_wa_bb_reg_state(regs, engine);
5343 
5344 	__reset_stop_ring(regs, engine);
5345 }
5346 
5347 static int
5348 populate_lr_context(struct intel_context *ce,
5349 		    struct drm_i915_gem_object *ctx_obj,
5350 		    struct intel_engine_cs *engine,
5351 		    struct intel_ring *ring)
5352 {
5353 	bool inhibit = true;
5354 	void *vaddr;
5355 
5356 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5357 	if (IS_ERR(vaddr)) {
5358 		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5359 		return PTR_ERR(vaddr);
5360 	}
5361 
5362 	set_redzone(vaddr, engine);
5363 
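	/*
	 * If we have recorded the engine's default context image, start from
	 * that; otherwise leave the image zeroed and keep inhibit set so that
	 * the first restore is suppressed and the HW retains its power-on
	 * defaults instead.
	 */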
5364 	if (engine->default_state) {
5365 		shmem_read(engine->default_state, 0,
5366 			   vaddr, engine->context_size);
5367 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5368 		inhibit = false;
5369 	}
5370 
5371 	/* Clear the ppHWSP (inc. per-context counters) */
5372 	memset(vaddr, 0, PAGE_SIZE);
5373 
5374 	/*
5375 	 * The second page of the context object contains some registers which
5376 	 * must be set up prior to the first execution.
5377 	 */
5378 	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5379 				 ce, engine, ring, inhibit);
5380 
5381 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5382 	i915_gem_object_unpin_map(ctx_obj);
5383 	return 0;
5384 }
5385 
5386 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5387 {
5388 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5389 
5390 	return intel_timeline_create_from_engine(ce->engine,
5391 						 page_unmask_bits(tl));
5392 }
5393 
5394 static int __execlists_context_alloc(struct intel_context *ce,
5395 				     struct intel_engine_cs *engine)
5396 {
5397 	struct drm_i915_gem_object *ctx_obj;
5398 	struct intel_ring *ring;
5399 	struct i915_vma *vma;
5400 	u32 context_size;
5401 	int ret;
5402 
5403 	GEM_BUG_ON(ce->state);
5404 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5405 
5406 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5407 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5408 
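	/*
	 * Gen12 reserves one extra page at the end of the context image for a
	 * per-context (indirect) workaround batch buffer.
	 */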
5409 	if (INTEL_GEN(engine->i915) == 12) {
5410 		ce->wa_bb_page = context_size / PAGE_SIZE;
5411 		context_size += PAGE_SIZE;
5412 	}
5413 
5414 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5415 	if (IS_ERR(ctx_obj))
5416 		return PTR_ERR(ctx_obj);
5417 
5418 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5419 	if (IS_ERR(vma)) {
5420 		ret = PTR_ERR(vma);
5421 		goto error_deref_obj;
5422 	}
5423 
5424 	if (!page_mask_bits(ce->timeline)) {
5425 		struct intel_timeline *tl;
5426 
5427 		/*
5428 		 * Use the static global HWSP for the kernel context, and
5429 		 * a dynamically allocated cacheline for everyone else.
5430 		 */
5431 		if (unlikely(ce->timeline))
5432 			tl = pinned_timeline(ce);
5433 		else
5434 			tl = intel_timeline_create(engine->gt);
5435 		if (IS_ERR(tl)) {
5436 			ret = PTR_ERR(tl);
5437 			goto error_deref_obj;
5438 		}
5439 
5440 		ce->timeline = tl;
5441 	}
5442 
5443 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5444 	if (IS_ERR(ring)) {
5445 		ret = PTR_ERR(ring);
5446 		goto error_deref_obj;
5447 	}
5448 
5449 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5450 	if (ret) {
5451 		drm_dbg(&engine->i915->drm,
5452 			"Failed to populate LRC: %d\n", ret);
5453 		goto error_ring_free;
5454 	}
5455 
5456 	ce->ring = ring;
5457 	ce->state = vma;
5458 
5459 	return 0;
5460 
5461 error_ring_free:
5462 	intel_ring_put(ring);
5463 error_deref_obj:
5464 	i915_gem_object_put(ctx_obj);
5465 	return ret;
5466 }
5467 
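/*
 * Virtual engines: a single uabi engine backed by several physical siblings.
 * Only one request is held by the virtual engine at a time (ve->request);
 * the submission tasklet advertises it to every allowed sibling by inserting
 * a ve_node into that sibling's execlists.virtual rbtree, and whichever
 * sibling dequeues it first gets to execute it.
 */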
5468 static struct list_head *virtual_queue(struct virtual_engine *ve)
5469 {
5470 	return &ve->base.execlists.default_priolist.requests[0];
5471 }
5472 
5473 static void virtual_context_destroy(struct kref *kref)
5474 {
5475 	struct virtual_engine *ve =
5476 		container_of(kref, typeof(*ve), context.ref);
5477 	unsigned int n;
5478 
5479 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5480 	GEM_BUG_ON(ve->request);
5481 	GEM_BUG_ON(ve->context.inflight);
5482 
5483 	for (n = 0; n < ve->num_siblings; n++) {
5484 		struct intel_engine_cs *sibling = ve->siblings[n];
5485 		struct rb_node *node = &ve->nodes[sibling->id].rb;
5486 		unsigned long flags;
5487 
5488 		if (RB_EMPTY_NODE(node))
5489 			continue;
5490 
5491 		spin_lock_irqsave(&sibling->active.lock, flags);
5492 
5493 		/* Detachment is lazily performed in the execlists tasklet */
5494 		if (!RB_EMPTY_NODE(node))
5495 			rb_erase_cached(node, &sibling->execlists.virtual);
5496 
5497 		spin_unlock_irqrestore(&sibling->active.lock, flags);
5498 	}
5499 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5500 
5501 	if (ve->context.state)
5502 		__execlists_context_fini(&ve->context);
5503 	intel_context_fini(&ve->context);
5504 
5505 	intel_breadcrumbs_free(ve->base.breadcrumbs);
5506 	intel_engine_free_request_pool(&ve->base);
5507 
5508 	kfree(ve->bonds);
5509 	kfree(ve);
5510 }
5511 
5512 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5513 {
5514 	int swp;
5515 
5516 	/*
5517 	 * Pick a random sibling on starting to help spread the load around.
5518 	 *
5519 	 * New contexts are typically created with exactly the same order
5520 	 * of siblings, and often started in batches. Due to the way we
5521 	 * iterate the array of siblings when submitting requests, sibling[0]
5522 	 * is prioritised for dequeuing. If we make sure that sibling[0] is
5523 	 * fairly randomised across the system, we also spread the load, as
5524 	 * the first engine we inspect differs each time.
5525 	 *
5526 	 * NB: This does not force us to execute on this engine; it will just
5527 	 * typically be the first one we inspect for submission.
5528 	 */
5529 	swp = prandom_u32_max(ve->num_siblings);
5530 	if (swp)
5531 		swap(ve->siblings[swp], ve->siblings[0]);
5532 }
5533 
5534 static int virtual_context_alloc(struct intel_context *ce)
5535 {
5536 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5537 
5538 	return __execlists_context_alloc(ce, ve->siblings[0]);
5539 }
5540 
5541 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5542 {
5543 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5544 
5545 	/* Note: we must use a real engine class for setting up reg state */
5546 	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5547 }
5548 
5549 static void virtual_context_enter(struct intel_context *ce)
5550 {
5551 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5552 	unsigned int n;
5553 
5554 	for (n = 0; n < ve->num_siblings; n++)
5555 		intel_engine_pm_get(ve->siblings[n]);
5556 
5557 	intel_timeline_enter(ce->timeline);
5558 }
5559 
5560 static void virtual_context_exit(struct intel_context *ce)
5561 {
5562 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5563 	unsigned int n;
5564 
5565 	intel_timeline_exit(ce->timeline);
5566 
5567 	for (n = 0; n < ve->num_siblings; n++)
5568 		intel_engine_pm_put(ve->siblings[n]);
5569 }
5570 
5571 static const struct intel_context_ops virtual_context_ops = {
5572 	.alloc = virtual_context_alloc,
5573 
5574 	.pre_pin = execlists_context_pre_pin,
5575 	.pin = virtual_context_pin,
5576 	.unpin = execlists_context_unpin,
5577 	.post_unpin = execlists_context_post_unpin,
5578 
5579 	.enter = virtual_context_enter,
5580 	.exit = virtual_context_exit,
5581 
5582 	.destroy = virtual_context_destroy,
5583 };
5584 
5585 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5586 {
5587 	struct i915_request *rq;
5588 	intel_engine_mask_t mask;
5589 
5590 	rq = READ_ONCE(ve->request);
5591 	if (!rq)
5592 		return 0;
5593 
5594 	/* The rq is ready for submission; rq->execution_mask is now stable. */
5595 	mask = rq->execution_mask;
5596 	if (unlikely(!mask)) {
5597 		/* Invalid selection, submit to a random engine in error */
5598 		i915_request_set_error_once(rq, -ENODEV);
5599 		mask = ve->siblings[0]->mask;
5600 	}
5601 
5602 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5603 		     rq->fence.context, rq->fence.seqno,
5604 		     mask, ve->base.execlists.queue_priority_hint);
5605 
5606 	return mask;
5607 }
5608 
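/*
 * Broadcast the pending request to the siblings: for each physical engine
 * allowed by the execution mask, (re)insert our ve_node into its virtual
 * rbtree keyed on priority, and kick that engine's submission tasklet if we
 * just became its highest priority candidate.
 */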
5609 static void virtual_submission_tasklet(unsigned long data)
5610 {
5611 	struct virtual_engine * const ve = (struct virtual_engine *)data;
5612 	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5613 	intel_engine_mask_t mask;
5614 	unsigned int n;
5615 
5616 	rcu_read_lock();
5617 	mask = virtual_submission_mask(ve);
5618 	rcu_read_unlock();
5619 	if (unlikely(!mask))
5620 		return;
5621 
5622 	local_irq_disable();
5623 	for (n = 0; n < ve->num_siblings; n++) {
5624 		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5625 		struct ve_node * const node = &ve->nodes[sibling->id];
5626 		struct rb_node **parent, *rb;
5627 		bool first;
5628 
5629 		if (!READ_ONCE(ve->request))
5630 			break; /* already handled by a sibling's tasklet */
5631 
5632 		if (unlikely(!(mask & sibling->mask))) {
5633 			if (!RB_EMPTY_NODE(&node->rb)) {
5634 				spin_lock(&sibling->active.lock);
5635 				rb_erase_cached(&node->rb,
5636 						&sibling->execlists.virtual);
5637 				RB_CLEAR_NODE(&node->rb);
5638 				spin_unlock(&sibling->active.lock);
5639 			}
5640 			continue;
5641 		}
5642 
5643 		spin_lock(&sibling->active.lock);
5644 
5645 		if (!RB_EMPTY_NODE(&node->rb)) {
5646 			/*
5647 			 * Cheat and avoid rebalancing the tree if we can
5648 			 * reuse this node in situ.
5649 			 */
5650 			first = rb_first_cached(&sibling->execlists.virtual) ==
5651 				&node->rb;
5652 			if (prio == node->prio || (prio > node->prio && first))
5653 				goto submit_engine;
5654 
5655 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5656 		}
5657 
5658 		rb = NULL;
5659 		first = true;
5660 		parent = &sibling->execlists.virtual.rb_root.rb_node;
5661 		while (*parent) {
5662 			struct ve_node *other;
5663 
5664 			rb = *parent;
5665 			other = rb_entry(rb, typeof(*other), rb);
5666 			if (prio > other->prio) {
5667 				parent = &rb->rb_left;
5668 			} else {
5669 				parent = &rb->rb_right;
5670 				first = false;
5671 			}
5672 		}
5673 
5674 		rb_link_node(&node->rb, rb, parent);
5675 		rb_insert_color_cached(&node->rb,
5676 				       &sibling->execlists.virtual,
5677 				       first);
5678 
5679 submit_engine:
5680 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5681 		node->prio = prio;
5682 		if (first && prio > sibling->execlists.queue_priority_hint)
5683 			tasklet_hi_schedule(&sibling->execlists.tasklet);
5684 
5685 		spin_unlock(&sibling->active.lock);
5686 	}
5687 	local_irq_enable();
5688 }
5689 
5690 static void virtual_submit_request(struct i915_request *rq)
5691 {
5692 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5693 	struct i915_request *old;
5694 	unsigned long flags;
5695 
5696 	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5697 		     rq->fence.context,
5698 		     rq->fence.seqno);
5699 
5700 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5701 
5702 	spin_lock_irqsave(&ve->base.active.lock, flags);
5703 
5704 	old = ve->request;
5705 	if (old) { /* background completion event from preempt-to-busy */
5706 		GEM_BUG_ON(!i915_request_completed(old));
5707 		__i915_request_submit(old);
5708 		i915_request_put(old);
5709 	}
5710 
5711 	if (i915_request_completed(rq)) {
5712 		__i915_request_submit(rq);
5713 
5714 		ve->base.execlists.queue_priority_hint = INT_MIN;
5715 		ve->request = NULL;
5716 	} else {
5717 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5718 		ve->request = i915_request_get(rq);
5719 
5720 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5721 		list_move_tail(&rq->sched.link, virtual_queue(ve));
5722 
5723 		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5724 	}
5725 
5726 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5727 }
5728 
5729 static struct ve_bond *
5730 virtual_find_bond(struct virtual_engine *ve,
5731 		  const struct intel_engine_cs *master)
5732 {
5733 	int i;
5734 
5735 	for (i = 0; i < ve->num_bonds; i++) {
5736 		if (ve->bonds[i].master == master)
5737 			return &ve->bonds[i];
5738 	}
5739 
5740 	return NULL;
5741 }
5742 
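/*
 * Bonds restrict where a bonded request may run once its master has been
 * assigned a physical engine: when the master's submit fence signals, the
 * bonded request's execution_mask is narrowed to the siblings registered
 * against that master (see intel_virtual_engine_attach_bond()).
 */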
5743 static void
5744 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5745 {
5746 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5747 	intel_engine_mask_t allowed, exec;
5748 	struct ve_bond *bond;
5749 
5750 	allowed = ~to_request(signal)->engine->mask;
5751 
5752 	bond = virtual_find_bond(ve, to_request(signal)->engine);
5753 	if (bond)
5754 		allowed &= bond->sibling_mask;
5755 
5756 	/* Restrict the bonded request to run on only the available engines */
5757 	exec = READ_ONCE(rq->execution_mask);
5758 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5759 		;
5760 
5761 	/* Prevent the master from being re-run on the bonded engines */
5762 	to_request(signal)->execution_mask &= ~allowed;
5763 }
5764 
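/*
 * Illustrative usage only (not taken from a real caller): creating a virtual
 * engine that load balances across two video decode engines, assuming both
 * instances have been probed on this gt:
 *
 *	struct intel_engine_cs *siblings[] = {
 *		gt->engine_class[VIDEO_DECODE_CLASS][0],
 *		gt->engine_class[VIDEO_DECODE_CLASS][1],
 *	};
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 */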
5765 struct intel_context *
5766 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5767 			       unsigned int count)
5768 {
5769 	struct virtual_engine *ve;
5770 	unsigned int n;
5771 	int err;
5772 
5773 	if (count == 0)
5774 		return ERR_PTR(-EINVAL);
5775 
5776 	if (count == 1)
5777 		return intel_context_create(siblings[0]);
5778 
5779 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5780 	if (!ve)
5781 		return ERR_PTR(-ENOMEM);
5782 
5783 	ve->base.i915 = siblings[0]->i915;
5784 	ve->base.gt = siblings[0]->gt;
5785 	ve->base.uncore = siblings[0]->uncore;
5786 	ve->base.id = -1;
5787 
5788 	ve->base.class = OTHER_CLASS;
5789 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5790 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5791 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5792 
5793 	/*
5794 	 * The decision on whether to submit a request using semaphores
5795 	 * depends on the saturated state of the engine. We only compute
5796 	 * this during HW submission of the request, and we need this
5797 	 * state to be globally applied to all requests being submitted
5798 	 * to this engine. Virtual engines encompass more than one physical
5799 	 * engine and so we cannot accurately tell in advance if one of those
5800 	 * engines is already saturated and so cannot afford to use a semaphore
5801 	 * and be pessimized in priority for doing so -- if we are the only
5802 	 * context using semaphores after all other clients have stopped, we
5803 	 * will be starved on the saturated system. Such a global switch for
5804 	 * semaphores is less than ideal, but alas is the current compromise.
5805 	 */
5806 	ve->base.saturated = ALL_ENGINES;
5807 
5808 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5809 
5810 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5811 	intel_engine_init_execlists(&ve->base);
5812 
5813 	ve->base.cops = &virtual_context_ops;
5814 	ve->base.request_alloc = execlists_request_alloc;
5815 
5816 	ve->base.schedule = i915_schedule;
5817 	ve->base.submit_request = virtual_submit_request;
5818 	ve->base.bond_execute = virtual_bond_execute;
5819 
5820 	INIT_LIST_HEAD(virtual_queue(ve));
5821 	ve->base.execlists.queue_priority_hint = INT_MIN;
5822 	tasklet_init(&ve->base.execlists.tasklet,
5823 		     virtual_submission_tasklet,
5824 		     (unsigned long)ve);
5825 
5826 	intel_context_init(&ve->context, &ve->base);
5827 
5828 	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5829 	if (!ve->base.breadcrumbs) {
5830 		err = -ENOMEM;
5831 		goto err_put;
5832 	}
5833 
5834 	for (n = 0; n < count; n++) {
5835 		struct intel_engine_cs *sibling = siblings[n];
5836 
5837 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5838 		if (sibling->mask & ve->base.mask) {
5839 			DRM_DEBUG("duplicate %s entry in load balancer\n",
5840 				  sibling->name);
5841 			err = -EINVAL;
5842 			goto err_put;
5843 		}
5844 
5845 		/*
5846 		 * The virtual engine implementation is tightly coupled to
5847 		 * the execlists backend -- we push requests directly
5848 		 * into a tree inside each physical engine. We could support
5849 		 * layering if we handle cloning of the requests and
5850 		 * submitting a copy into each backend.
5851 		 */
5852 		if (sibling->execlists.tasklet.func !=
5853 		    execlists_submission_tasklet) {
5854 			err = -ENODEV;
5855 			goto err_put;
5856 		}
5857 
5858 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5859 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5860 
5861 		ve->siblings[ve->num_siblings++] = sibling;
5862 		ve->base.mask |= sibling->mask;
5863 
5864 		/*
5865 		 * All physical engines must have compatible emission
5866 		 * functions (as we build the instructions during request
5867 		 * construction and do not alter them before submission
5868 		 * on the physical engine). We use the engine class as a guide
5869 		 * here, although that could be refined.
5870 		 */
5871 		if (ve->base.class != OTHER_CLASS) {
5872 			if (ve->base.class != sibling->class) {
5873 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5874 					  sibling->class, ve->base.class);
5875 				err = -EINVAL;
5876 				goto err_put;
5877 			}
5878 			continue;
5879 		}
5880 
5881 		ve->base.class = sibling->class;
5882 		ve->base.uabi_class = sibling->uabi_class;
5883 		snprintf(ve->base.name, sizeof(ve->base.name),
5884 			 "v%dx%d", ve->base.class, count);
5885 		ve->base.context_size = sibling->context_size;
5886 
5887 		ve->base.emit_bb_start = sibling->emit_bb_start;
5888 		ve->base.emit_flush = sibling->emit_flush;
5889 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5890 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5891 		ve->base.emit_fini_breadcrumb_dw =
5892 			sibling->emit_fini_breadcrumb_dw;
5893 
5894 		ve->base.flags = sibling->flags;
5895 	}
5896 
5897 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5898 
5899 	virtual_engine_initial_hint(ve);
5900 	return &ve->context;
5901 
5902 err_put:
5903 	intel_context_put(&ve->context);
5904 	return ERR_PTR(err);
5905 }
5906 
5907 struct intel_context *
5908 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5909 {
5910 	struct virtual_engine *se = to_virtual_engine(src);
5911 	struct intel_context *dst;
5912 
5913 	dst = intel_execlists_create_virtual(se->siblings,
5914 					     se->num_siblings);
5915 	if (IS_ERR(dst))
5916 		return dst;
5917 
5918 	if (se->num_bonds) {
5919 		struct virtual_engine *de = to_virtual_engine(dst->engine);
5920 
5921 		de->bonds = kmemdup(se->bonds,
5922 				    sizeof(*se->bonds) * se->num_bonds,
5923 				    GFP_KERNEL);
5924 		if (!de->bonds) {
5925 			intel_context_put(dst);
5926 			return ERR_PTR(-ENOMEM);
5927 		}
5928 
5929 		de->num_bonds = se->num_bonds;
5930 	}
5931 
5932 	return dst;
5933 }
5934 
5935 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5936 				     const struct intel_engine_cs *master,
5937 				     const struct intel_engine_cs *sibling)
5938 {
5939 	struct virtual_engine *ve = to_virtual_engine(engine);
5940 	struct ve_bond *bond;
5941 	int n;
5942 
5943 	/* Sanity check the sibling is part of the virtual engine */
5944 	for (n = 0; n < ve->num_siblings; n++)
5945 		if (sibling == ve->siblings[n])
5946 			break;
5947 	if (n == ve->num_siblings)
5948 		return -EINVAL;
5949 
5950 	bond = virtual_find_bond(ve, master);
5951 	if (bond) {
5952 		bond->sibling_mask |= sibling->mask;
5953 		return 0;
5954 	}
5955 
5956 	bond = krealloc(ve->bonds,
5957 			sizeof(*bond) * (ve->num_bonds + 1),
5958 			GFP_KERNEL);
5959 	if (!bond)
5960 		return -ENOMEM;
5961 
5962 	bond[ve->num_bonds].master = master;
5963 	bond[ve->num_bonds].sibling_mask = sibling->mask;
5964 
5965 	ve->bonds = bond;
5966 	ve->num_bonds++;
5967 
5968 	return 0;
5969 }
5970 
5971 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5972 				   struct drm_printer *m,
5973 				   void (*show_request)(struct drm_printer *m,
5974 							const struct i915_request *rq,
5975 							const char *prefix,
5976 							int indent),
5977 				   unsigned int max)
5978 {
5979 	const struct intel_engine_execlists *execlists = &engine->execlists;
5980 	struct i915_request *rq, *last;
5981 	unsigned long flags;
5982 	unsigned int count;
5983 	struct rb_node *rb;
5984 
5985 	spin_lock_irqsave(&engine->active.lock, flags);
5986 
5987 	last = NULL;
5988 	count = 0;
5989 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5990 		if (count++ < max - 1)
5991 			show_request(m, rq, "\t\t", 0);
5992 		else
5993 			last = rq;
5994 	}
5995 	if (last) {
5996 		if (count > max) {
5997 			drm_printf(m,
5998 				   "\t\t...skipping %d executing requests...\n",
5999 				   count - max);
6000 		}
6001 		show_request(m, last, "\t\t", 0);
6002 	}
6003 
6004 	if (execlists->switch_priority_hint != INT_MIN)
6005 		drm_printf(m, "\t\tSwitch priority hint: %d\n",
6006 			   READ_ONCE(execlists->switch_priority_hint));
6007 	if (execlists->queue_priority_hint != INT_MIN)
6008 		drm_printf(m, "\t\tQueue priority hint: %d\n",
6009 			   READ_ONCE(execlists->queue_priority_hint));
6010 
6011 	last = NULL;
6012 	count = 0;
6013 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6014 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6015 		int i;
6016 
6017 		priolist_for_each_request(rq, p, i) {
6018 			if (count++ < max - 1)
6019 				show_request(m, rq, "\t\t", 0);
6020 			else
6021 				last = rq;
6022 		}
6023 	}
6024 	if (last) {
6025 		if (count > max) {
6026 			drm_printf(m,
6027 				   "\t\t...skipping %d queued requests...\n",
6028 				   count - max);
6029 		}
6030 		show_request(m, last, "\t\t", 0);
6031 	}
6032 
6033 	last = NULL;
6034 	count = 0;
6035 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6036 		struct virtual_engine *ve =
6037 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6038 		struct i915_request *rq = READ_ONCE(ve->request);
6039 
6040 		if (rq) {
6041 			if (count++ < max - 1)
6042 				show_request(m, rq, "\t\t", 0);
6043 			else
6044 				last = rq;
6045 		}
6046 	}
6047 	if (last) {
6048 		if (count > max) {
6049 			drm_printf(m,
6050 				   "\t\t...skipping %d virtual requests...\n",
6051 				   count - max);
6052 		}
6053 		show_request(m, last, "\t\t", 0);
6054 	}
6055 
6056 	spin_unlock_irqrestore(&engine->active.lock, flags);
6057 }
6058 
6059 void intel_lr_context_reset(struct intel_engine_cs *engine,
6060 			    struct intel_context *ce,
6061 			    u32 head,
6062 			    bool scrub)
6063 {
6064 	GEM_BUG_ON(!intel_context_is_pinned(ce));
6065 
6066 	/*
6067 	 * We want a simple context + ring to execute the breadcrumb update.
6068 	 * We cannot rely on the context being intact across the GPU hang,
6069 	 * so clear it and rebuild just what we need for the breadcrumb.
6070 	 * All pending requests for this context will be zapped, and any
6071 	 * future request will be after userspace has had the opportunity
6072 	 * to recreate its own state.
6073 	 */
6074 	if (scrub)
6075 		restore_default_state(ce, engine);
6076 
6077 	/* Rerun the request; its payload has been neutered (if guilty). */
6078 	__execlists_update_reg_state(ce, engine, head);
6079 }
6080 
6081 bool
6082 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6083 {
6084 	return engine->set_default_submission ==
6085 	       intel_execlists_set_default_submission;
6086 }
6087 
6088 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6089 #include "selftest_lrc.c"
6090 #endif
6091