xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 4d1356ac)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong to the context (and not to the engine, like
67  * before) and that contexts are uniquely tied to a given engine (and not
68  * reusable, like before), we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use it. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first, then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
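 * As a rough sketch (pseudo-code only, not the actual dequeue path; pop(),
 * peek(), same_ctx() and elsp_submit() are illustrative helpers, not driver
 * functions), the pairing rule above amounts to::
 *
 *	port0 = pop(queue);
 *	while (peek(queue) && same_ctx(port0, peek(queue)))
 *		port0 = pop(queue);	(the later request carries the newest tail)
 *	port1 = peek(queue) ? pop(queue) : NULL;
 *	elsp_submit(port0, port1);	(port1 may be the NULL second context)
 *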
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "gem/i915_gem_context.h"
137 
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 
152 #define RING_EXECLIST_QFULL		(1 << 0x2)
153 #define RING_EXECLIST1_VALID		(1 << 0x3)
154 #define RING_EXECLIST0_VALID		(1 << 0x4)
155 #define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
156 #define RING_EXECLIST1_ACTIVE		(1 << 0x11)
157 #define RING_EXECLIST0_ACTIVE		(1 << 0x12)
158 
159 #define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
160 #define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
161 #define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
162 #define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
163 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
164 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
165 
166 #define GEN8_CTX_STATUS_COMPLETED_MASK \
167 	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
168 
169 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
170 
171 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
172 #define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
173 #define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
174 #define GEN12_IDLE_CTX_ID		0x7FF
175 #define GEN12_CSB_CTX_VALID(csb_dw) \
176 	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
177 
178 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
179 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
180 #define WA_TAIL_DWORDS 2
181 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
182 
183 struct virtual_engine {
184 	struct intel_engine_cs base;
185 	struct intel_context context;
186 
187 	/*
188 	 * We allow only a single request through the virtual engine at a time
189 	 * (each request in the timeline waits for the completion fence of
190 	 * the previous before being submitted). By restricting ourselves to
191 	 * only submitting a single request, each request is placed on to a
192 	 * physical engine to maximise load spreading (by virtue of the late greedy
193 	 * scheduling -- each real engine takes the next available request
194 	 * upon idling).
195 	 */
196 	struct i915_request *request;
197 
198 	/*
199 	 * We keep a rbtree of available virtual engines inside each physical
200 	 * engine, sorted by priority. Here we preallocate the nodes we need
201 	 * for the virtual engine, indexed by physical_engine->id.
202 	 */
203 	struct ve_node {
204 		struct rb_node rb;
205 		int prio;
206 	} nodes[I915_NUM_ENGINES];
207 
208 	/*
209 	 * Keep track of bonded pairs -- restrictions upon our selection
210 	 * of physical engines any particular request may be submitted to.
211 	 * If we receive a submit-fence from a master engine, we will only
212 	 * use one of the physical engines in sibling_mask.
213 	 */
214 	struct ve_bond {
215 		const struct intel_engine_cs *master;
216 		intel_engine_mask_t sibling_mask;
217 	} *bonds;
218 	unsigned int num_bonds;
219 
220 	/* And finally, which physical engines this virtual engine maps onto. */
221 	unsigned int num_siblings;
222 	struct intel_engine_cs *siblings[0];
223 };
224 
225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
226 {
227 	GEM_BUG_ON(!intel_engine_is_virtual(engine));
228 	return container_of(engine, struct virtual_engine, base);
229 }
230 
231 static int __execlists_context_alloc(struct intel_context *ce,
232 				     struct intel_engine_cs *engine);
233 
234 static void execlists_init_reg_state(u32 *reg_state,
235 				     const struct intel_context *ce,
236 				     const struct intel_engine_cs *engine,
237 				     const struct intel_ring *ring,
238 				     bool close);
239 static void
240 __execlists_update_reg_state(const struct intel_context *ce,
241 			     const struct intel_engine_cs *engine);
242 
243 static void mark_eio(struct i915_request *rq)
244 {
245 	if (i915_request_completed(rq))
246 		return;
247 
248 	GEM_BUG_ON(i915_request_signaled(rq));
249 
250 	dma_fence_set_error(&rq->fence, -EIO);
251 	i915_request_mark_complete(rq);
252 }
253 
254 static struct i915_request *
255 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
256 {
257 	struct i915_request *active = rq;
258 
259 	rcu_read_lock();
260 	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
261 		if (i915_request_completed(rq))
262 			break;
263 
264 		active = rq;
265 	}
266 	rcu_read_unlock();
267 
268 	return active;
269 }
270 
271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
272 {
273 	return (i915_ggtt_offset(engine->status_page.vma) +
274 		I915_GEM_HWS_PREEMPT_ADDR);
275 }
276 
277 static inline void
278 ring_set_paused(const struct intel_engine_cs *engine, int state)
279 {
280 	/*
281 	 * We inspect HWS_PREEMPT with a semaphore inside
282 	 * engine->emit_fini_breadcrumb. If the dword is true,
283 	 * the ring is paused as the semaphore will busywait
284 	 * until the dword is false.
285 	 */
286 	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
287 	if (state)
288 		wmb();
289 }
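
/*
 * For reference, the busywait that consumes HWS_PREEMPT is emitted as part
 * of the fini breadcrumb later in this file; in rough outline (illustrative
 * sketch only, see emit_preempt_busywait() for the real emission) it is:
 *
 *	cs[0] = MI_SEMAPHORE_WAIT |
 *		MI_SEMAPHORE_GLOBAL_GTT |
 *		MI_SEMAPHORE_POLL |
 *		MI_SEMAPHORE_SAD_EQ_SDD;
 *	cs[1] = 0;
 *	cs[2] = intel_hws_preempt_address(engine);
 *	cs[3] = 0;
 *
 * i.e. the CS polls the HWS_PREEMPT dword and only proceeds past the
 * breadcrumb once ring_set_paused() has written 0 back into it.
 */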
290 
291 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
292 {
293 	return rb_entry(rb, struct i915_priolist, node);
294 }
295 
296 static inline int rq_prio(const struct i915_request *rq)
297 {
298 	return rq->sched.attr.priority;
299 }
300 
301 static int effective_prio(const struct i915_request *rq)
302 {
303 	int prio = rq_prio(rq);
304 
305 	/*
306 	 * If this request is special and must not be interrupted at any
307 	 * cost, so be it. Note we are only checking the most recent request
308 	 * in the context and so may be masking an earlier vip request. It
309 	 * is hoped that under the conditions where nopreempt is used, this
310 	 * will not matter (i.e. all requests to that context will be
311 	 * nopreempt for as long as desired).
312 	 */
313 	if (i915_request_has_nopreempt(rq))
314 		prio = I915_PRIORITY_UNPREEMPTABLE;
315 
316 	/*
317 	 * On unwinding the active request, we give it a priority bump
318 	 * if it has completed waiting on any semaphore. If we know that
319 	 * the request has already started, we can prevent an unwanted
320 	 * preempt-to-idle cycle by taking that into account now.
321 	 */
322 	if (__i915_request_has_started(rq))
323 		prio |= I915_PRIORITY_NOSEMAPHORE;
324 
325 	/* Restrict mere WAIT boosts from triggering preemption */
326 	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
327 	return prio | __NO_PREEMPTION;
328 }
329 
330 static int queue_prio(const struct intel_engine_execlists *execlists)
331 {
332 	struct i915_priolist *p;
333 	struct rb_node *rb;
334 
335 	rb = rb_first_cached(&execlists->queue);
336 	if (!rb)
337 		return INT_MIN;
338 
339 	/*
340 	 * As the priolist[] sub-lists are inverted, with the highest priority in [0],
341 	 * we have to flip the index value to recover the priority.
342 	 */
343 	p = to_priolist(rb);
344 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
345 }
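
/*
 * Worked example for queue_prio() above (illustrative only): with
 * p->priority being the user level P and p->used a bitmask of the occupied
 * internal sub-lists (bit 0 == sub-list [0], the highest), the highest
 * occupied index is i = ffs(p->used) - 1 and
 *
 *	((P + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used)
 *		== (P << I915_USER_PRIORITY_SHIFT) + (COUNT - 1 - i)
 *
 * where COUNT == 1 << I915_USER_PRIORITY_SHIFT, i.e. the lowest sub-list
 * index maps back to the largest internal priority bump within level P.
 */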
346 
347 static inline bool need_preempt(const struct intel_engine_cs *engine,
348 				const struct i915_request *rq,
349 				struct rb_node *rb)
350 {
351 	int last_prio;
352 
353 	if (!intel_engine_has_semaphores(engine))
354 		return false;
355 
356 	/*
357 	 * Check if the current priority hint merits a preemption attempt.
358 	 *
359 	 * We record the highest value priority we saw during rescheduling
360 	 * prior to this dequeue, therefore we know that if it is strictly
361 	 * less than the current tail of ELSP[0], we do not need to force
362 	 * a preempt-to-idle cycle.
363 	 *
364 	 * However, the priority hint is a mere hint that we may need to
365 	 * preempt. If that hint is stale or we may be trying to preempt
366 	 * ourselves, ignore the request.
367 	 *
368 	 * More naturally we would write
369 	 *      prio >= max(0, last);
370 	 * except that we wish to prevent triggering preemption at the same
371 	 * priority level: the task that is running should remain running
372 	 * to preserve FIFO ordering of dependencies.
373 	 */
374 	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
375 	if (engine->execlists.queue_priority_hint <= last_prio)
376 		return false;
377 
378 	/*
379 	 * Check against the first request in ELSP[1]: it will, thanks to the
380 	 * power of PI, be the highest priority of that context.
381 	 */
382 	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
383 	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
384 		return true;
385 
386 	if (rb) {
387 		struct virtual_engine *ve =
388 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
389 		bool preempt = false;
390 
391 		if (engine == ve->siblings[0]) { /* only preempt one sibling */
392 			struct i915_request *next;
393 
394 			rcu_read_lock();
395 			next = READ_ONCE(ve->request);
396 			if (next)
397 				preempt = rq_prio(next) > last_prio;
398 			rcu_read_unlock();
399 		}
400 
401 		if (preempt)
402 			return preempt;
403 	}
404 
405 	/*
406 	 * If the inflight context did not trigger the preemption, then maybe
407 	 * it was the set of queued requests? Pick the highest priority in
408 	 * the queue (the first active priolist) and see if it deserves to be
409 	 * running instead of ELSP[0].
410 	 *
411 	 * The highest priority request in the queue cannot be either
412 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
413 	 * context, its priority would not exceed ELSP[0] aka last_prio.
414 	 */
415 	return queue_prio(&engine->execlists) > last_prio;
416 }
417 
418 __maybe_unused static inline bool
419 assert_priority_queue(const struct i915_request *prev,
420 		      const struct i915_request *next)
421 {
422 	/*
423 	 * Without preemption, the prev may refer to the still active element
424 	 * which we refuse to let go.
425 	 *
426 	 * Even with preemption, there are times when we think it is better not
427 	 * to preempt and leave an ostensibly lower priority request in flight.
428 	 */
429 	if (i915_request_is_active(prev))
430 		return true;
431 
432 	return rq_prio(prev) >= rq_prio(next);
433 }
434 
435 /*
436  * The context descriptor encodes various attributes of a context,
437  * including its GTT address and some flags. Because it's fairly
438  * expensive to calculate, we'll just do it once and cache the result,
439  * which remains valid until the context is unpinned.
440  *
441  * This is what a descriptor looks like, from LSB to MSB::
442  *
443  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
444  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
445  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
446  *      bits 53-54:    mbz, reserved for use by hardware
447  *      bits 55-63:    group ID, currently unused and set to 0
448  *
449  * Starting from Gen11, the upper dword of the descriptor has a new format:
450  *
451  *      bits 32-36:    reserved
452  *      bits 37-47:    SW context ID
453  *      bits 48-53:    engine instance
454  *      bit 54:        mbz, reserved for use by hardware
455  *      bits 55-60:    SW counter
456  *      bits 61-63:    engine class
457  *
458  * engine info, SW context ID and SW counter need to form a unique number
459  * (Context ID) per lrc.
460  */
461 static u64
462 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
463 {
464 	u64 desc;
465 
466 	desc = INTEL_LEGACY_32B_CONTEXT;
467 	if (i915_vm_is_4lvl(ce->vm))
468 		desc = INTEL_LEGACY_64B_CONTEXT;
469 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
470 
471 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
472 	if (IS_GEN(engine->i915, 8))
473 		desc |= GEN8_CTX_L3LLC_COHERENT;
474 
475 	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
476 	/*
477 	 * The following 32 bits are copied into the OA reports (dword 2).
478 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
479 	 * anything below.
480 	 */
481 	if (INTEL_GEN(engine->i915) >= 11) {
482 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
483 								/* bits 48-53 */
484 
485 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
486 								/* bits 61-63 */
487 	}
488 
489 	return desc;
490 }
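
/*
 * Illustrative example (assumed values, not compiled): for a 64b PPGTT
 * context whose state is pinned at GGTT offset 0x00fed000 on a gen11+
 * render engine (class RENDER_CLASS, instance 0), lrc_descriptor() above
 * produces
 *
 *	desc  = (u64)INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT;
 *	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 *	desc |= 0x00fed000;					(LRCA, bits 12-31)
 *	desc |= (u64)0 << GEN11_ENGINE_INSTANCE_SHIFT;		(bits 48-53)
 *	desc |= (u64)RENDER_CLASS << GEN11_ENGINE_CLASS_SHIFT;	(bits 61-63)
 *
 * The SW context ID (bits 37-47) and, for OA, the fixed tag are only ORed
 * in later, at submission time, by __execlists_schedule_in().
 */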
491 
492 static u32 *set_offsets(u32 *regs,
493 			const u8 *data,
494 			const struct intel_engine_cs *engine)
495 #define NOP(x) (BIT(7) | (x))
496 #define LRI(count, flags) ((flags) << 6 | (count))
497 #define POSTED BIT(0)
498 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
499 #define REG16(x) \
500 	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
501 	(((x) >> 2) & 0x7f)
502 #define END() 0
503 {
504 	const u32 base = engine->mmio_base;
505 
506 	while (*data) {
507 		u8 count, flags;
508 
509 		if (*data & BIT(7)) { /* skip */
510 			regs += *data++ & ~BIT(7);
511 			continue;
512 		}
513 
514 		count = *data & 0x3f;
515 		flags = *data >> 6;
516 		data++;
517 
518 		*regs = MI_LOAD_REGISTER_IMM(count);
519 		if (flags & POSTED)
520 			*regs |= MI_LRI_FORCE_POSTED;
521 		if (INTEL_GEN(engine->i915) >= 11)
522 			*regs |= MI_LRI_CS_MMIO;
523 		regs++;
524 
525 		GEM_BUG_ON(!count);
526 		do {
527 			u32 offset = 0;
528 			u8 v;
529 
530 			do {
531 				v = *data++;
532 				offset <<= 7;
533 				offset |= v & ~BIT(7);
534 			} while (v & BIT(7));
535 
536 			*regs = base + (offset << 2);
537 			regs += 2;
538 		} while (--count);
539 	}
540 
541 	return regs;
542 }
543 
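/*
 * A quick worked decode of the tables below (illustrative): REG16(0x244)
 * expands to the two bytes { 0x81, 0x11 }; set_offsets() accumulates the
 * low 7 bits of each byte while bit 7 is set, giving offset 0x91, and then
 * writes mmio_base + (0x91 << 2) == mmio_base + 0x244 into the image.
 * REG(0x034) is the single byte 0x0d, i.e. mmio_base + 0x34; NOP(x) skips
 * x dwords of the image; and LRI(count, flags) becomes the
 * MI_LOAD_REGISTER_IMM(count) header written ahead of the registers that
 * follow.
 */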
544 static const u8 gen8_xcs_offsets[] = {
545 	NOP(1),
546 	LRI(11, 0),
547 	REG16(0x244),
548 	REG(0x034),
549 	REG(0x030),
550 	REG(0x038),
551 	REG(0x03c),
552 	REG(0x168),
553 	REG(0x140),
554 	REG(0x110),
555 	REG(0x11c),
556 	REG(0x114),
557 	REG(0x118),
558 
559 	NOP(9),
560 	LRI(9, 0),
561 	REG16(0x3a8),
562 	REG16(0x28c),
563 	REG16(0x288),
564 	REG16(0x284),
565 	REG16(0x280),
566 	REG16(0x27c),
567 	REG16(0x278),
568 	REG16(0x274),
569 	REG16(0x270),
570 
571 	NOP(13),
572 	LRI(2, 0),
573 	REG16(0x200),
574 	REG(0x028),
575 
576 	END(),
577 };
578 
579 static const u8 gen9_xcs_offsets[] = {
580 	NOP(1),
581 	LRI(14, POSTED),
582 	REG16(0x244),
583 	REG(0x034),
584 	REG(0x030),
585 	REG(0x038),
586 	REG(0x03c),
587 	REG(0x168),
588 	REG(0x140),
589 	REG(0x110),
590 	REG(0x11c),
591 	REG(0x114),
592 	REG(0x118),
593 	REG(0x1c0),
594 	REG(0x1c4),
595 	REG(0x1c8),
596 
597 	NOP(3),
598 	LRI(9, POSTED),
599 	REG16(0x3a8),
600 	REG16(0x28c),
601 	REG16(0x288),
602 	REG16(0x284),
603 	REG16(0x280),
604 	REG16(0x27c),
605 	REG16(0x278),
606 	REG16(0x274),
607 	REG16(0x270),
608 
609 	NOP(13),
610 	LRI(1, POSTED),
611 	REG16(0x200),
612 
613 	NOP(13),
614 	LRI(44, POSTED),
615 	REG(0x028),
616 	REG(0x09c),
617 	REG(0x0c0),
618 	REG(0x178),
619 	REG(0x17c),
620 	REG16(0x358),
621 	REG(0x170),
622 	REG(0x150),
623 	REG(0x154),
624 	REG(0x158),
625 	REG16(0x41c),
626 	REG16(0x600),
627 	REG16(0x604),
628 	REG16(0x608),
629 	REG16(0x60c),
630 	REG16(0x610),
631 	REG16(0x614),
632 	REG16(0x618),
633 	REG16(0x61c),
634 	REG16(0x620),
635 	REG16(0x624),
636 	REG16(0x628),
637 	REG16(0x62c),
638 	REG16(0x630),
639 	REG16(0x634),
640 	REG16(0x638),
641 	REG16(0x63c),
642 	REG16(0x640),
643 	REG16(0x644),
644 	REG16(0x648),
645 	REG16(0x64c),
646 	REG16(0x650),
647 	REG16(0x654),
648 	REG16(0x658),
649 	REG16(0x65c),
650 	REG16(0x660),
651 	REG16(0x664),
652 	REG16(0x668),
653 	REG16(0x66c),
654 	REG16(0x670),
655 	REG16(0x674),
656 	REG16(0x678),
657 	REG16(0x67c),
658 	REG(0x068),
659 
660 	END(),
661 };
662 
663 static const u8 gen12_xcs_offsets[] = {
664 	NOP(1),
665 	LRI(13, POSTED),
666 	REG16(0x244),
667 	REG(0x034),
668 	REG(0x030),
669 	REG(0x038),
670 	REG(0x03c),
671 	REG(0x168),
672 	REG(0x140),
673 	REG(0x110),
674 	REG(0x1c0),
675 	REG(0x1c4),
676 	REG(0x1c8),
677 	REG(0x180),
678 	REG16(0x2b4),
679 
680 	NOP(5),
681 	LRI(9, POSTED),
682 	REG16(0x3a8),
683 	REG16(0x28c),
684 	REG16(0x288),
685 	REG16(0x284),
686 	REG16(0x280),
687 	REG16(0x27c),
688 	REG16(0x278),
689 	REG16(0x274),
690 	REG16(0x270),
691 
692 	END(),
693 };
694 
695 static const u8 gen8_rcs_offsets[] = {
696 	NOP(1),
697 	LRI(14, POSTED),
698 	REG16(0x244),
699 	REG(0x034),
700 	REG(0x030),
701 	REG(0x038),
702 	REG(0x03c),
703 	REG(0x168),
704 	REG(0x140),
705 	REG(0x110),
706 	REG(0x11c),
707 	REG(0x114),
708 	REG(0x118),
709 	REG(0x1c0),
710 	REG(0x1c4),
711 	REG(0x1c8),
712 
713 	NOP(3),
714 	LRI(9, POSTED),
715 	REG16(0x3a8),
716 	REG16(0x28c),
717 	REG16(0x288),
718 	REG16(0x284),
719 	REG16(0x280),
720 	REG16(0x27c),
721 	REG16(0x278),
722 	REG16(0x274),
723 	REG16(0x270),
724 
725 	NOP(13),
726 	LRI(1, 0),
727 	REG(0x0c8),
728 
729 	END(),
730 };
731 
732 static const u8 gen11_rcs_offsets[] = {
733 	NOP(1),
734 	LRI(15, POSTED),
735 	REG16(0x244),
736 	REG(0x034),
737 	REG(0x030),
738 	REG(0x038),
739 	REG(0x03c),
740 	REG(0x168),
741 	REG(0x140),
742 	REG(0x110),
743 	REG(0x11c),
744 	REG(0x114),
745 	REG(0x118),
746 	REG(0x1c0),
747 	REG(0x1c4),
748 	REG(0x1c8),
749 	REG(0x180),
750 
751 	NOP(1),
752 	LRI(9, POSTED),
753 	REG16(0x3a8),
754 	REG16(0x28c),
755 	REG16(0x288),
756 	REG16(0x284),
757 	REG16(0x280),
758 	REG16(0x27c),
759 	REG16(0x278),
760 	REG16(0x274),
761 	REG16(0x270),
762 
763 	LRI(1, POSTED),
764 	REG(0x1b0),
765 
766 	NOP(10),
767 	LRI(1, 0),
768 	REG(0x0c8),
769 
770 	END(),
771 };
772 
773 static const u8 gen12_rcs_offsets[] = {
774 	NOP(1),
775 	LRI(13, POSTED),
776 	REG16(0x244),
777 	REG(0x034),
778 	REG(0x030),
779 	REG(0x038),
780 	REG(0x03c),
781 	REG(0x168),
782 	REG(0x140),
783 	REG(0x110),
784 	REG(0x1c0),
785 	REG(0x1c4),
786 	REG(0x1c8),
787 	REG(0x180),
788 	REG16(0x2b4),
789 
790 	NOP(5),
791 	LRI(9, POSTED),
792 	REG16(0x3a8),
793 	REG16(0x28c),
794 	REG16(0x288),
795 	REG16(0x284),
796 	REG16(0x280),
797 	REG16(0x27c),
798 	REG16(0x278),
799 	REG16(0x274),
800 	REG16(0x270),
801 
802 	LRI(3, POSTED),
803 	REG(0x1b0),
804 	REG16(0x5a8),
805 	REG16(0x5ac),
806 
807 	NOP(6),
808 	LRI(1, 0),
809 	REG(0x0c8),
810 
811 	END(),
812 };
813 
814 #undef END
815 #undef REG16
816 #undef REG
817 #undef LRI
818 #undef NOP
819 
820 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
821 {
822 	/*
823 	 * The gen12+ lists only have the registers we program in the basic
824 	 * default state. We rely on the context image using relative
825 	 * addressing to automatically fix up the register state between the
826 	 * physical engines for the virtual engine.
827 	 */
828 	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
829 		   !intel_engine_has_relative_mmio(engine));
830 
831 	if (engine->class == RENDER_CLASS) {
832 		if (INTEL_GEN(engine->i915) >= 12)
833 			return gen12_rcs_offsets;
834 		else if (INTEL_GEN(engine->i915) >= 11)
835 			return gen11_rcs_offsets;
836 		else
837 			return gen8_rcs_offsets;
838 	} else {
839 		if (INTEL_GEN(engine->i915) >= 12)
840 			return gen12_xcs_offsets;
841 		else if (INTEL_GEN(engine->i915) >= 9)
842 			return gen9_xcs_offsets;
843 		else
844 			return gen8_xcs_offsets;
845 	}
846 }
847 
848 static struct i915_request *
849 __unwind_incomplete_requests(struct intel_engine_cs *engine)
850 {
851 	struct i915_request *rq, *rn, *active = NULL;
852 	struct list_head *uninitialized_var(pl);
853 	int prio = I915_PRIORITY_INVALID;
854 
855 	lockdep_assert_held(&engine->active.lock);
856 
857 	list_for_each_entry_safe_reverse(rq, rn,
858 					 &engine->active.requests,
859 					 sched.link) {
860 		if (i915_request_completed(rq))
861 			continue; /* XXX */
862 
863 		__i915_request_unsubmit(rq);
864 
865 		/*
866 		 * Push the request back into the queue for later resubmission.
867 		 * If this request is not native to this physical engine (i.e.
868 		 * it came from a virtual source), push it back onto the virtual
869 		 * engine so that it can be moved across onto another physical
870 		 * engine as load dictates.
871 		 */
872 		if (likely(rq->execution_mask == engine->mask)) {
873 			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
874 			if (rq_prio(rq) != prio) {
875 				prio = rq_prio(rq);
876 				pl = i915_sched_lookup_priolist(engine, prio);
877 			}
878 			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
879 
880 			list_move(&rq->sched.link, pl);
881 			active = rq;
882 		} else {
883 			struct intel_engine_cs *owner = rq->hw_context->engine;
884 
885 			/*
886 			 * Decouple the virtual breadcrumb before moving it
887 			 * back to the virtual engine -- we don't want the
888 			 * request to complete in the background and try to
889 			 * cancel the breadcrumb on the virtual engine
890 			 * (instead of the old engine where it is linked)!
891 			 */
892 			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
893 				     &rq->fence.flags)) {
894 				spin_lock_nested(&rq->lock,
895 						 SINGLE_DEPTH_NESTING);
896 				i915_request_cancel_breadcrumb(rq);
897 				spin_unlock(&rq->lock);
898 			}
899 			rq->engine = owner;
900 			owner->submit_request(rq);
901 			active = NULL;
902 		}
903 	}
904 
905 	return active;
906 }
907 
908 struct i915_request *
909 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
910 {
911 	struct intel_engine_cs *engine =
912 		container_of(execlists, typeof(*engine), execlists);
913 
914 	return __unwind_incomplete_requests(engine);
915 }
916 
917 static inline void
918 execlists_context_status_change(struct i915_request *rq, unsigned long status)
919 {
920 	/*
921 	 * This is currently only used when GVT-g is enabled. When GVT-g is
922 	 * disabled, the compiler should eliminate this function as dead code.
923 	 */
924 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
925 		return;
926 
927 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
928 				   status, rq);
929 }
930 
931 static void intel_engine_context_in(struct intel_engine_cs *engine)
932 {
933 	unsigned long flags;
934 
935 	if (READ_ONCE(engine->stats.enabled) == 0)
936 		return;
937 
938 	write_seqlock_irqsave(&engine->stats.lock, flags);
939 
940 	if (engine->stats.enabled > 0) {
941 		if (engine->stats.active++ == 0)
942 			engine->stats.start = ktime_get();
943 		GEM_BUG_ON(engine->stats.active == 0);
944 	}
945 
946 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
947 }
948 
949 static void intel_engine_context_out(struct intel_engine_cs *engine)
950 {
951 	unsigned long flags;
952 
953 	if (READ_ONCE(engine->stats.enabled) == 0)
954 		return;
955 
956 	write_seqlock_irqsave(&engine->stats.lock, flags);
957 
958 	if (engine->stats.enabled > 0) {
959 		ktime_t last;
960 
961 		if (engine->stats.active && --engine->stats.active == 0) {
962 			/*
963 			 * The active context count has hit zero: the GPU is now
964 			 * idle, so add the elapsed time to the running total.
965 			 */
966 			last = ktime_sub(ktime_get(), engine->stats.start);
967 
968 			engine->stats.total = ktime_add(engine->stats.total,
969 							last);
970 		} else if (engine->stats.active == 0) {
971 			/*
972 			 * After turning on engine stats, context out might be
973 			 * the first event, in which case we account from the
974 			 * time stats gathering was turned on.
975 			 */
976 			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
977 
978 			engine->stats.total = ktime_add(engine->stats.total,
979 							last);
980 		}
981 	}
982 
983 	write_sequnlock_irqrestore(&engine->stats.lock, flags);
984 }
985 
986 static void restore_default_state(struct intel_context *ce,
987 				  struct intel_engine_cs *engine)
988 {
989 	u32 *regs = ce->lrc_reg_state;
990 
991 	if (engine->pinned_default_state)
992 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
993 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
994 		       engine->context_size - PAGE_SIZE);
995 
996 	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
997 }
998 
999 static void reset_active(struct i915_request *rq,
1000 			 struct intel_engine_cs *engine)
1001 {
1002 	struct intel_context * const ce = rq->hw_context;
1003 	u32 head;
1004 
1005 	/*
1006 	 * The executing context has been cancelled. We want to prevent
1007 	 * further execution along this context and propagate the error on
1008 	 * to anything depending on its results.
1009 	 *
1010 	 * In __i915_request_submit(), we apply the -EIO and remove the
1011 	 * requests' payloads for any banned requests. But first, we must
1012 	 * rewind the context back to the start of the incomplete request so
1013 	 * that we do not jump back into the middle of the batch.
1014 	 *
1015 	 * We preserve the breadcrumbs and semaphores of the incomplete
1016 	 * requests so that inter-timeline dependencies (i.e. other timelines)
1017 	 * remain correctly ordered. And we defer to __i915_request_submit()
1018 	 * so that all asynchronous waits are correctly handled.
1019 	 */
1020 	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
1021 		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
1022 
1023 	/* On resubmission of the active request, payload will be scrubbed */
1024 	if (i915_request_completed(rq))
1025 		head = rq->tail;
1026 	else
1027 		head = active_request(ce->timeline, rq)->head;
1028 	ce->ring->head = intel_ring_wrap(ce->ring, head);
1029 	intel_ring_update_space(ce->ring);
1030 
1031 	/* Scrub the context image to prevent replaying the previous batch */
1032 	restore_default_state(ce, engine);
1033 	__execlists_update_reg_state(ce, engine);
1034 
1035 	/* We've switched away, so this should be a no-op, but intent matters */
1036 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1037 }
1038 
1039 static inline struct intel_engine_cs *
1040 __execlists_schedule_in(struct i915_request *rq)
1041 {
1042 	struct intel_engine_cs * const engine = rq->engine;
1043 	struct intel_context * const ce = rq->hw_context;
1044 
1045 	intel_context_get(ce);
1046 
1047 	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
1048 		reset_active(rq, engine);
1049 
1050 	if (ce->tag) {
1051 		/* Use a fixed tag for OA and friends */
1052 		ce->lrc_desc |= (u64)ce->tag << 32;
1053 	} else {
1054 		/* We don't need a strict matching tag, just different values */
1055 		ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1056 		ce->lrc_desc |=
1057 			(u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
1058 			GEN11_SW_CTX_ID_SHIFT;
1059 		BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1060 	}
1061 
1062 	intel_gt_pm_get(engine->gt);
1063 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1064 	intel_engine_context_in(engine);
1065 
1066 	return engine;
1067 }
1068 
1069 static inline struct i915_request *
1070 execlists_schedule_in(struct i915_request *rq, int idx)
1071 {
1072 	struct intel_context * const ce = rq->hw_context;
1073 	struct intel_engine_cs *old;
1074 
1075 	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1076 	trace_i915_request_in(rq, idx);
1077 
1078 	old = READ_ONCE(ce->inflight);
1079 	do {
1080 		if (!old) {
1081 			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1082 			break;
1083 		}
1084 	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1085 
1086 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1087 	return i915_request_get(rq);
1088 }
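
/*
 * Note on ce->inflight, shared by execlists_schedule_in() above and
 * execlists_schedule_out() below: it holds the engine on which the context
 * is currently being executed, with the number of additional occurrences of
 * that context in the ELSP packed into the low (pointer-alignment) bits.
 * The first schedule-in stores the bare engine pointer, each further
 * submission of the same context does a ptr_inc(), and each schedule-out
 * does a ptr_dec() until the count drains to zero, at which point the last
 * reference runs __execlists_schedule_out() and the pointer is cleared.
 */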
1089 
1090 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1091 {
1092 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1093 	struct i915_request *next = READ_ONCE(ve->request);
1094 
1095 	if (next && next->execution_mask & ~rq->execution_mask)
1096 		tasklet_schedule(&ve->base.execlists.tasklet);
1097 }
1098 
1099 static inline void
1100 __execlists_schedule_out(struct i915_request *rq,
1101 			 struct intel_engine_cs * const engine)
1102 {
1103 	struct intel_context * const ce = rq->hw_context;
1104 
1105 	/*
1106 	 * NB process_csb() is not under the engine->active.lock and hence
1107 	 * schedule_out can race with schedule_in, meaning that we should
1108 	 * refrain from doing non-trivial work here.
1109 	 */
1110 
1111 	/*
1112 	 * If we have just completed this context, the engine may now be
1113 	 * idle and we want to re-enter powersaving.
1114 	 */
1115 	if (list_is_last(&rq->link, &ce->timeline->requests) &&
1116 	    i915_request_completed(rq))
1117 		intel_engine_add_retire(engine, ce->timeline);
1118 
1119 	intel_engine_context_out(engine);
1120 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1121 	intel_gt_pm_put_async(engine->gt);
1122 
1123 	/*
1124 	 * If this is part of a virtual engine, its next request may
1125 	 * have been blocked waiting for access to the active context.
1126 	 * We have to kick all the siblings again in case we need to
1127 	 * switch (e.g. the next request is not runnable on this
1128 	 * engine). Hopefully, we will already have submitted the next
1129 	 * request before the tasklet runs and do not need to rebuild
1130 	 * each virtual tree and kick everyone again.
1131 	 */
1132 	if (ce->engine != engine)
1133 		kick_siblings(rq, ce);
1134 
1135 	intel_context_put(ce);
1136 }
1137 
1138 static inline void
1139 execlists_schedule_out(struct i915_request *rq)
1140 {
1141 	struct intel_context * const ce = rq->hw_context;
1142 	struct intel_engine_cs *cur, *old;
1143 
1144 	trace_i915_request_out(rq);
1145 
1146 	old = READ_ONCE(ce->inflight);
1147 	do
1148 		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1149 	while (!try_cmpxchg(&ce->inflight, &old, cur));
1150 	if (!cur)
1151 		__execlists_schedule_out(rq, old);
1152 
1153 	i915_request_put(rq);
1154 }
1155 
1156 static u64 execlists_update_context(struct i915_request *rq)
1157 {
1158 	struct intel_context *ce = rq->hw_context;
1159 	u64 desc = ce->lrc_desc;
1160 	u32 tail;
1161 
1162 	/*
1163 	 * WaIdleLiteRestore:bdw,skl
1164 	 *
1165 	 * We should never submit the context with the same RING_TAIL twice
1166 	 * just in case we submit an empty ring, which confuses the HW.
1167 	 *
1168 	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1169 	 * the normal request to be able to always advance the RING_TAIL on
1170 	 * subsequent resubmissions (for lite restore). Should that fail us,
1171 	 * and we try and submit the same tail again, force the context
1172 	 * reload.
1173 	 */
1174 	tail = intel_ring_set_tail(rq->ring, rq->tail);
1175 	if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail))
1176 		desc |= CTX_DESC_FORCE_RESTORE;
1177 	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1178 	rq->tail = rq->wa_tail;
1179 
1180 	/*
1181 	 * Make sure the context image is complete before we submit it to HW.
1182 	 *
1183 	 * Ostensibly, writes (including the WCB) should be flushed prior to
1184 	 * an uncached write such as our mmio register access, but the empirical
1185 	 * evidence (esp. on Braswell) suggests that the WC write into memory
1186 	 * may not be visible to the HW prior to the completion of the UC
1187 	 * register write and that we may begin execution from the context
1188 	 * before its image is complete leading to invalid PD chasing.
1189 	 *
1190 	 * Furthermore, Braswell, at least, wants a full mb to be sure that
1191 	 * the writes are coherent in memory (visible to the GPU) prior to
1192 	 * execution, and not just visible to other CPUs (as is the result of
1193 	 * wmb).
1194 	 */
1195 	mb();
1196 
1197 	/* Wa_1607138340:tgl */
1198 	if (IS_TGL_REVID(rq->i915, TGL_REVID_A0, TGL_REVID_A0))
1199 		desc |= CTX_DESC_FORCE_RESTORE;
1200 
1201 	ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1202 	return desc;
1203 }
1204 
1205 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1206 {
1207 	if (execlists->ctrl_reg) {
1208 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1209 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1210 	} else {
1211 		writel(upper_32_bits(desc), execlists->submit_reg);
1212 		writel(lower_32_bits(desc), execlists->submit_reg);
1213 	}
1214 }
1215 
1216 static __maybe_unused void
1217 trace_ports(const struct intel_engine_execlists *execlists,
1218 	    const char *msg,
1219 	    struct i915_request * const *ports)
1220 {
1221 	const struct intel_engine_cs *engine =
1222 		container_of(execlists, typeof(*engine), execlists);
1223 
1224 	if (!ports[0])
1225 		return;
1226 
1227 	GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
1228 		  engine->name, msg,
1229 		  ports[0]->fence.context,
1230 		  ports[0]->fence.seqno,
1231 		  i915_request_completed(ports[0]) ? "!" :
1232 		  i915_request_started(ports[0]) ? "*" :
1233 		  "",
1234 		  ports[1] ? ports[1]->fence.context : 0,
1235 		  ports[1] ? ports[1]->fence.seqno : 0);
1236 }
1237 
1238 static __maybe_unused bool
1239 assert_pending_valid(const struct intel_engine_execlists *execlists,
1240 		     const char *msg)
1241 {
1242 	struct i915_request * const *port, *rq;
1243 	struct intel_context *ce = NULL;
1244 
1245 	trace_ports(execlists, msg, execlists->pending);
1246 
1247 	if (!execlists->pending[0]) {
1248 		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1249 		return false;
1250 	}
1251 
1252 	if (execlists->pending[execlists_num_ports(execlists)]) {
1253 		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1254 			      execlists_num_ports(execlists));
1255 		return false;
1256 	}
1257 
1258 	for (port = execlists->pending; (rq = *port); port++) {
1259 		if (ce == rq->hw_context) {
1260 			GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
1261 				      port - execlists->pending);
1262 			return false;
1263 		}
1264 
1265 		ce = rq->hw_context;
1266 		if (i915_request_completed(rq))
1267 			continue;
1268 
1269 		if (i915_active_is_idle(&ce->active)) {
1270 			GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
1271 				      port - execlists->pending);
1272 			return false;
1273 		}
1274 
1275 		if (!i915_vma_is_pinned(ce->state)) {
1276 			GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
1277 				      port - execlists->pending);
1278 			return false;
1279 		}
1280 
1281 		if (!i915_vma_is_pinned(ce->ring->vma)) {
1282 			GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
1283 				      port - execlists->pending);
1284 			return false;
1285 		}
1286 	}
1287 
1288 	return ce;
1289 }
1290 
1291 static void execlists_submit_ports(struct intel_engine_cs *engine)
1292 {
1293 	struct intel_engine_execlists *execlists = &engine->execlists;
1294 	unsigned int n;
1295 
1296 	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1297 
1298 	/*
1299 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1300 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1301 	 * not be relinquished until the device is idle (see
1302 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1303 	 * that all ELSP are drained i.e. we have processed the CSB,
1304 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1305 	 */
1306 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1307 
1308 	/*
1309 	 * ELSQ note: the submit queue is not cleared after being submitted
1310 	 * to the HW so we need to make sure we always clean it up. This is
1311 	 * currently ensured by the fact that we always write the same number
1312 	 * of elsq entries, keep this in mind before changing the loop below.
1313 	 */
1314 	for (n = execlists_num_ports(execlists); n--; ) {
1315 		struct i915_request *rq = execlists->pending[n];
1316 
1317 		write_desc(execlists,
1318 			   rq ? execlists_update_context(rq) : 0,
1319 			   n);
1320 	}
1321 
1322 	/* we need to manually load the submit queue */
1323 	if (execlists->ctrl_reg)
1324 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1325 }
1326 
1327 static bool ctx_single_port_submission(const struct intel_context *ce)
1328 {
1329 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1330 		i915_gem_context_force_single_submission(ce->gem_context));
1331 }
1332 
1333 static bool can_merge_ctx(const struct intel_context *prev,
1334 			  const struct intel_context *next)
1335 {
1336 	if (prev != next)
1337 		return false;
1338 
1339 	if (ctx_single_port_submission(prev))
1340 		return false;
1341 
1342 	return true;
1343 }
1344 
1345 static bool can_merge_rq(const struct i915_request *prev,
1346 			 const struct i915_request *next)
1347 {
1348 	GEM_BUG_ON(prev == next);
1349 	GEM_BUG_ON(!assert_priority_queue(prev, next));
1350 
1351 	/*
1352 	 * We do not submit known completed requests. Therefore if the next
1353 	 * request is already completed, we can pretend to merge it in
1354 	 * with the previous context (and we will skip updating the ELSP
1355 	 * and tracking). Thus hopefully keeping the ELSP full with active
1356 	 * contexts, despite the best efforts of preempt-to-busy to confuse
1357 	 * us.
1358 	 */
1359 	if (i915_request_completed(next))
1360 		return true;
1361 
1362 	if (unlikely((prev->flags ^ next->flags) &
1363 		     (I915_REQUEST_NOPREEMPT | I915_REQUEST_SENTINEL)))
1364 		return false;
1365 
1366 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
1367 		return false;
1368 
1369 	return true;
1370 }
1371 
1372 static void virtual_update_register_offsets(u32 *regs,
1373 					    struct intel_engine_cs *engine)
1374 {
1375 	set_offsets(regs, reg_offsets(engine), engine);
1376 }
1377 
1378 static bool virtual_matches(const struct virtual_engine *ve,
1379 			    const struct i915_request *rq,
1380 			    const struct intel_engine_cs *engine)
1381 {
1382 	const struct intel_engine_cs *inflight;
1383 
1384 	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1385 		return false;
1386 
1387 	/*
1388 	 * We track when the HW has completed saving the context image
1389 	 * (i.e. when we have seen the final CS event switching out of
1390 	 * the context) and must not overwrite the context image before
1391 	 * then. This restricts us to only using the active engine
1392 	 * while the previous virtualized request is inflight (so
1393 	 * we reuse the register offsets). This is a very small
1394 	 * hysteresis on the greedy selection algorithm.
1395 	 */
1396 	inflight = intel_context_inflight(&ve->context);
1397 	if (inflight && inflight != engine)
1398 		return false;
1399 
1400 	return true;
1401 }
1402 
1403 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1404 				     struct intel_engine_cs *engine)
1405 {
1406 	struct intel_engine_cs *old = ve->siblings[0];
1407 
1408 	/* All unattached (rq->engine == old) must already be completed */
1409 
1410 	spin_lock(&old->breadcrumbs.irq_lock);
1411 	if (!list_empty(&ve->context.signal_link)) {
1412 		list_move_tail(&ve->context.signal_link,
1413 			       &engine->breadcrumbs.signalers);
1414 		intel_engine_queue_breadcrumbs(engine);
1415 	}
1416 	spin_unlock(&old->breadcrumbs.irq_lock);
1417 }
1418 
1419 static struct i915_request *
1420 last_active(const struct intel_engine_execlists *execlists)
1421 {
1422 	struct i915_request * const *last = READ_ONCE(execlists->active);
1423 
1424 	while (*last && i915_request_completed(*last))
1425 		last++;
1426 
1427 	return *last;
1428 }
1429 
1430 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1431 {
1432 	LIST_HEAD(list);
1433 
1434 	/*
1435 	 * We want to move the interrupted request to the back of
1436 	 * the round-robin list (i.e. its priority level), but
1437 	 * in doing so, we must then move all requests that were in
1438 	 * flight and were waiting for the interrupted request so that
1439 	 * they are run after it again.
1440 	 */
1441 	do {
1442 		struct i915_dependency *p;
1443 
1444 		GEM_BUG_ON(i915_request_is_active(rq));
1445 		list_move_tail(&rq->sched.link, pl);
1446 
1447 		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1448 			struct i915_request *w =
1449 				container_of(p->waiter, typeof(*w), sched);
1450 
1451 			/* Leave semaphores spinning on the other engines */
1452 			if (w->engine != rq->engine)
1453 				continue;
1454 
1455 			/* No waiter should start before its signaler */
1456 			GEM_BUG_ON(i915_request_started(w) &&
1457 				   !i915_request_completed(rq));
1458 
1459 			GEM_BUG_ON(i915_request_is_active(w));
1460 			if (list_empty(&w->sched.link))
1461 				continue; /* Not yet submitted; unready */
1462 
1463 			if (rq_prio(w) < rq_prio(rq))
1464 				continue;
1465 
1466 			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1467 			list_move_tail(&w->sched.link, &list);
1468 		}
1469 
1470 		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1471 	} while (rq);
1472 }
1473 
1474 static void defer_active(struct intel_engine_cs *engine)
1475 {
1476 	struct i915_request *rq;
1477 
1478 	rq = __unwind_incomplete_requests(engine);
1479 	if (!rq)
1480 		return;
1481 
1482 	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1483 }
1484 
1485 static bool
1486 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1487 {
1488 	int hint;
1489 
1490 	if (!intel_engine_has_timeslices(engine))
1491 		return false;
1492 
1493 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1494 		return false;
1495 
1496 	hint = max(rq_prio(list_next_entry(rq, sched.link)),
1497 		   engine->execlists.queue_priority_hint);
1498 
1499 	return hint >= effective_prio(rq);
1500 }
1501 
1502 static int
1503 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1504 {
1505 	if (list_is_last(&rq->sched.link, &engine->active.requests))
1506 		return INT_MIN;
1507 
1508 	return rq_prio(list_next_entry(rq, sched.link));
1509 }
1510 
1511 static inline unsigned long
1512 timeslice(const struct intel_engine_cs *engine)
1513 {
1514 	return READ_ONCE(engine->props.timeslice_duration_ms);
1515 }
1516 
1517 static unsigned long
1518 active_timeslice(const struct intel_engine_cs *engine)
1519 {
1520 	const struct i915_request *rq = *engine->execlists.active;
1521 
1522 	if (i915_request_completed(rq))
1523 		return 0;
1524 
1525 	if (engine->execlists.switch_priority_hint < effective_prio(rq))
1526 		return 0;
1527 
1528 	return timeslice(engine);
1529 }
1530 
1531 static void set_timeslice(struct intel_engine_cs *engine)
1532 {
1533 	if (!intel_engine_has_timeslices(engine))
1534 		return;
1535 
1536 	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1537 }
1538 
1539 static void record_preemption(struct intel_engine_execlists *execlists)
1540 {
1541 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1542 }
1543 
1544 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine)
1545 {
1546 	struct i915_request *rq;
1547 
1548 	rq = last_active(&engine->execlists);
1549 	if (!rq)
1550 		return 0;
1551 
1552 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1553 	if (unlikely(i915_gem_context_is_banned(rq->gem_context)))
1554 		return 1;
1555 
1556 	return READ_ONCE(engine->props.preempt_timeout_ms);
1557 }
1558 
1559 static void set_preempt_timeout(struct intel_engine_cs *engine)
1560 {
1561 	if (!intel_engine_has_preempt_reset(engine))
1562 		return;
1563 
1564 	set_timer_ms(&engine->execlists.preempt,
1565 		     active_preempt_timeout(engine));
1566 }
1567 
1568 static void execlists_dequeue(struct intel_engine_cs *engine)
1569 {
1570 	struct intel_engine_execlists * const execlists = &engine->execlists;
1571 	struct i915_request **port = execlists->pending;
1572 	struct i915_request ** const last_port = port + execlists->port_mask;
1573 	struct i915_request *last;
1574 	struct rb_node *rb;
1575 	bool submit = false;
1576 
1577 	/*
1578 	 * Hardware submission is through 2 ports. Conceptually each port
1579 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1580 	 * static for a context, and unique to each, so we only execute
1581 	 * requests belonging to a single context from each ring. RING_HEAD
1582 	 * is maintained by the CS in the context image, it marks the place
1583 	 * where it got up to last time, and through RING_TAIL we tell the CS
1584 	 * where we want to execute up to this time.
1585 	 *
1586 	 * In this list the requests are in order of execution. Consecutive
1587 	 * requests from the same context are adjacent in the ringbuffer. We
1588 	 * can combine these requests into a single RING_TAIL update:
1589 	 *
1590 	 *              RING_HEAD...req1...req2
1591 	 *                                    ^- RING_TAIL
1592 	 * since to execute req2 the CS must first execute req1.
1593 	 *
1594 	 * Our goal then is to point each port to the end of a consecutive
1595 	 * sequence of requests as being the most optimal (fewest wake ups
1596 	 * and context switches) submission.
1597 	 */
1598 
1599 	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1600 		struct virtual_engine *ve =
1601 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1602 		struct i915_request *rq = READ_ONCE(ve->request);
1603 
1604 		if (!rq) { /* lazily cleanup after another engine handled rq */
1605 			rb_erase_cached(rb, &execlists->virtual);
1606 			RB_CLEAR_NODE(rb);
1607 			rb = rb_first_cached(&execlists->virtual);
1608 			continue;
1609 		}
1610 
1611 		if (!virtual_matches(ve, rq, engine)) {
1612 			rb = rb_next(rb);
1613 			continue;
1614 		}
1615 
1616 		break;
1617 	}
1618 
1619 	/*
1620 	 * If the queue is higher priority than the last
1621 	 * request in the currently active context, submit afresh.
1622 	 * We will resubmit again afterwards in case we need to split
1623 	 * the active context to interject the preemption request,
1624 	 * i.e. we will retrigger preemption following the ack in case
1625 	 * of trouble.
1626 	 */
1627 	last = last_active(execlists);
1628 	if (last) {
1629 		if (need_preempt(engine, last, rb)) {
1630 			GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1631 				  engine->name,
1632 				  last->fence.context,
1633 				  last->fence.seqno,
1634 				  last->sched.attr.priority,
1635 				  execlists->queue_priority_hint);
1636 			record_preemption(execlists);
1637 
1638 			/*
1639 			 * Don't let the RING_HEAD advance past the breadcrumb
1640 			 * as we unwind (and until we resubmit) so that we do
1641 			 * not accidentally tell it to go backwards.
1642 			 */
1643 			ring_set_paused(engine, 1);
1644 
1645 			/*
1646 			 * Note that we have not stopped the GPU at this point,
1647 			 * so we are unwinding the incomplete requests as they
1648 			 * remain inflight and so by the time we do complete
1649 			 * the preemption, some of the unwound requests may
1650 			 * complete!
1651 			 */
1652 			__unwind_incomplete_requests(engine);
1653 
1654 			/*
1655 			 * If we need to return to the preempted context, we
1656 			 * need to skip the lite-restore and force it to
1657 			 * reload the RING_TAIL. Otherwise, the HW has a
1658 			 * tendency to ignore us rewinding the TAIL to the
1659 			 * end of an earlier request.
1660 			 */
1661 			last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1662 			last = NULL;
1663 		} else if (need_timeslice(engine, last) &&
1664 			   timer_expired(&engine->execlists.timer)) {
1665 			GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1666 				  engine->name,
1667 				  last->fence.context,
1668 				  last->fence.seqno,
1669 				  last->sched.attr.priority,
1670 				  execlists->queue_priority_hint);
1671 
1672 			ring_set_paused(engine, 1);
1673 			defer_active(engine);
1674 
1675 			/*
1676 			 * Unlike for preemption, if we rewind and continue
1677 			 * executing the same context as previously active,
1678 			 * the order of execution will remain the same and
1679 			 * the tail will only advance. We do not need to
1680 			 * force a full context restore, as a lite-restore
1681 			 * is sufficient to resample the monotonic TAIL.
1682 			 *
1683 			 * If we switch to any other context, similarly we
1684 			 * will not rewind the TAIL of the current context, and
1685 			 * normal save/restore will preserve state and allow
1686 			 * us to later continue executing the same request.
1687 			 */
1688 			last = NULL;
1689 		} else {
1690 			/*
1691 			 * Otherwise if we already have a request pending
1692 			 * for execution after the current one, we can
1693 			 * just wait until the next CS event before
1694 			 * queuing more. In either case we will force a
1695 			 * lite-restore preemption event, but if we wait
1696 			 * we hopefully coalesce several updates into a single
1697 			 * submission.
1698 			 */
1699 			if (!list_is_last(&last->sched.link,
1700 					  &engine->active.requests)) {
1701 				/*
1702 				 * Even if ELSP[1] is occupied and not worthy
1703 				 * of timeslices, our queue might be.
1704 				 */
1705 				if (!execlists->timer.expires &&
1706 				    need_timeslice(engine, last))
1707 					set_timer_ms(&execlists->timer,
1708 						     timeslice(engine));
1709 
1710 				return;
1711 			}
1712 		}
1713 	}
1714 
1715 	while (rb) { /* XXX virtual is always taking precedence */
1716 		struct virtual_engine *ve =
1717 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1718 		struct i915_request *rq;
1719 
1720 		spin_lock(&ve->base.active.lock);
1721 
1722 		rq = ve->request;
1723 		if (unlikely(!rq)) { /* lost the race to a sibling */
1724 			spin_unlock(&ve->base.active.lock);
1725 			rb_erase_cached(rb, &execlists->virtual);
1726 			RB_CLEAR_NODE(rb);
1727 			rb = rb_first_cached(&execlists->virtual);
1728 			continue;
1729 		}
1730 
1731 		GEM_BUG_ON(rq != ve->request);
1732 		GEM_BUG_ON(rq->engine != &ve->base);
1733 		GEM_BUG_ON(rq->hw_context != &ve->context);
1734 
1735 		if (rq_prio(rq) >= queue_prio(execlists)) {
1736 			if (!virtual_matches(ve, rq, engine)) {
1737 				spin_unlock(&ve->base.active.lock);
1738 				rb = rb_next(rb);
1739 				continue;
1740 			}
1741 
1742 			if (last && !can_merge_rq(last, rq)) {
1743 				spin_unlock(&ve->base.active.lock);
1744 				return; /* leave this for another */
1745 			}
1746 
1747 			GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1748 				  engine->name,
1749 				  rq->fence.context,
1750 				  rq->fence.seqno,
1751 				  i915_request_completed(rq) ? "!" :
1752 				  i915_request_started(rq) ? "*" :
1753 				  "",
1754 				  yesno(engine != ve->siblings[0]));
1755 
1756 			ve->request = NULL;
1757 			ve->base.execlists.queue_priority_hint = INT_MIN;
1758 			rb_erase_cached(rb, &execlists->virtual);
1759 			RB_CLEAR_NODE(rb);
1760 
1761 			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1762 			rq->engine = engine;
1763 
1764 			if (engine != ve->siblings[0]) {
1765 				u32 *regs = ve->context.lrc_reg_state;
1766 				unsigned int n;
1767 
1768 				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1769 
1770 				if (!intel_engine_has_relative_mmio(engine))
1771 					virtual_update_register_offsets(regs,
1772 									engine);
1773 
1774 				if (!list_empty(&ve->context.signals))
1775 					virtual_xfer_breadcrumbs(ve, engine);
1776 
1777 				/*
1778 				 * Move the bound engine to the top of the list
1779 				 * for future execution. We then kick this
1780 				 * tasklet first before checking others, so that
1781 				 * we preferentially reuse this set of bound
1782 				 * registers.
1783 				 */
1784 				for (n = 1; n < ve->num_siblings; n++) {
1785 					if (ve->siblings[n] == engine) {
1786 						swap(ve->siblings[n],
1787 						     ve->siblings[0]);
1788 						break;
1789 					}
1790 				}
1791 
1792 				GEM_BUG_ON(ve->siblings[0] != engine);
1793 			}
1794 
1795 			if (__i915_request_submit(rq)) {
1796 				submit = true;
1797 				last = rq;
1798 			}
1799 			i915_request_put(rq);
1800 
1801 			/*
1802 			 * Hmm, we have a bunch of virtual engine requests,
1803 			 * but the first one was already completed (thanks
1804 			 * preempt-to-busy!). Keep looking at the veng queue
1805 			 * until we have no more relevant requests (i.e.
1806 			 * the normal submit queue has higher priority).
1807 			 */
1808 			if (!submit) {
1809 				spin_unlock(&ve->base.active.lock);
1810 				rb = rb_first_cached(&execlists->virtual);
1811 				continue;
1812 			}
1813 		}
1814 
1815 		spin_unlock(&ve->base.active.lock);
1816 		break;
1817 	}
1818 
1819 	while ((rb = rb_first_cached(&execlists->queue))) {
1820 		struct i915_priolist *p = to_priolist(rb);
1821 		struct i915_request *rq, *rn;
1822 		int i;
1823 
1824 		priolist_for_each_request_consume(rq, rn, p, i) {
1825 			bool merge = true;
1826 
1827 			/*
1828 			 * Can we combine this request with the current port?
1829 			 * It has to be the same context/ringbuffer and not
1830 			 * have any exceptions (e.g. GVT saying never to
1831 			 * combine contexts).
1832 			 *
1833 			 * If we can combine the requests, we can execute both
1834 			 * by updating the RING_TAIL to point to the end of the
1835 			 * second request, and so we never need to tell the
1836 			 * hardware about the first.
1837 			 */
1838 			if (last && !can_merge_rq(last, rq)) {
1839 				/*
1840 				 * If we are on the second port and cannot
1841 				 * combine this request with the last, then we
1842 				 * are done.
1843 				 */
1844 				if (port == last_port)
1845 					goto done;
1846 
1847 				/*
1848 				 * We must not populate both ELSP[] with the
1849 				 * same LRCA, i.e. we must submit 2 different
1850 				 * contexts if we submit 2 ELSP.
1851 				 */
1852 				if (last->hw_context == rq->hw_context)
1853 					goto done;
1854 
1855 				if (i915_request_has_sentinel(last))
1856 					goto done;
1857 
1858 				/*
1859 				 * If GVT overrides us we only ever submit
1860 				 * port[0], leaving port[1] empty. Note that we
1861 				 * also have to be careful that we don't queue
1862 				 * the same context (even though a different
1863 				 * request) to the second port.
1864 				 */
1865 				if (ctx_single_port_submission(last->hw_context) ||
1866 				    ctx_single_port_submission(rq->hw_context))
1867 					goto done;
1868 
1869 				merge = false;
1870 			}
1871 
1872 			if (__i915_request_submit(rq)) {
1873 				if (!merge) {
1874 					*port = execlists_schedule_in(last, port - execlists->pending);
1875 					port++;
1876 					last = NULL;
1877 				}
1878 
1879 				GEM_BUG_ON(last &&
1880 					   !can_merge_ctx(last->hw_context,
1881 							  rq->hw_context));
1882 
1883 				submit = true;
1884 				last = rq;
1885 			}
1886 		}
1887 
1888 		rb_erase_cached(&p->node, &execlists->queue);
1889 		i915_priolist_free(p);
1890 	}
1891 
1892 done:
1893 	/*
1894 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1895 	 *
1896 	 * We choose the priority hint such that if we add a request of greater
1897 	 * priority than this, we kick the submission tasklet to decide on
1898 	 * the right order of submitting the requests to hardware. We must
1899 	 * also be prepared to reorder requests as they are in-flight on the
1900 	 * HW. We then derive the priority hint as the first "hole" in
1901 	 * the HW submission ports and if there are no available slots,
1902 	 * the priority of the lowest executing request, i.e. last.
1903 	 *
1904 	 * When we do receive a higher priority request ready to run from the
1905 	 * user, see queue_request(), the priority hint is bumped to that
1906 	 * request triggering preemption on the next dequeue (or subsequent
1907 	 * interrupt for secondary ports).
1908 	 */
1909 	execlists->queue_priority_hint = queue_prio(execlists);
1910 	GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1911 		  engine->name, execlists->queue_priority_hint,
1912 		  yesno(submit));
1913 
1914 	if (submit) {
1915 		*port = execlists_schedule_in(last, port - execlists->pending);
1916 		execlists->switch_priority_hint =
1917 			switch_prio(engine, *execlists->pending);
1918 
1919 		/*
1920 		 * Skip if we ended up with exactly the same set of requests,
1921 		 * e.g. trying to timeslice a pair of ordered contexts
1922 		 */
1923 		if (!memcmp(execlists->active, execlists->pending,
1924 			    (port - execlists->pending + 1) * sizeof(*port))) {
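			/*
			 * Undo the execlists_schedule_in() performed on each
			 * pending port, walking back from the most recent
			 * port to the start of pending[].
			 */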
1925 			do
1926 				execlists_schedule_out(fetch_and_zero(port));
1927 			while (port-- != execlists->pending);
1928 
1929 			goto skip_submit;
1930 		}
1931 
1932 		memset(port + 1, 0, (last_port - port) * sizeof(*port));
1933 		execlists_submit_ports(engine);
1934 
1935 		set_preempt_timeout(engine);
1936 	} else {
1937 skip_submit:
1938 		ring_set_paused(engine, 0);
1939 	}
1940 }
1941 
1942 static void
1943 cancel_port_requests(struct intel_engine_execlists * const execlists)
1944 {
1945 	struct i915_request * const *port;
1946 
1947 	for (port = execlists->pending; *port; port++)
1948 		execlists_schedule_out(*port);
1949 	memset(execlists->pending, 0, sizeof(execlists->pending));
1950 
1951 	/* Mark the end of active before we overwrite *active */
1952 	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
1953 		execlists_schedule_out(*port);
1954 	WRITE_ONCE(execlists->active,
1955 		   memset(execlists->inflight, 0, sizeof(execlists->inflight)));
1956 }
1957 
1958 static inline void
1959 invalidate_csb_entries(const u32 *first, const u32 *last)
1960 {
1961 	clflush((void *)first);
1962 	clflush((void *)last);
1963 }
1964 
1965 static inline bool
1966 reset_in_progress(const struct intel_engine_execlists *execlists)
1967 {
1968 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1969 }
1970 
1971 /*
1972  * Starting with Gen12, the status has a new format:
1973  *
1974  *     bit  0:     switched to new queue
1975  *     bit  1:     reserved
1976  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1977  *                 switch detail is set to "wait on semaphore"
1978  *     bits 3-5:   engine class
1979  *     bits 6-11:  engine instance
1980  *     bits 12-14: reserved
1981  *     bits 15-25: sw context id of the lrc the GT switched to
1982  *     bits 26-31: sw counter of the lrc the GT switched to
1983  *     bits 32-35: context switch detail
1984  *                  - 0: ctx complete
1985  *                  - 1: wait on sync flip
1986  *                  - 2: wait on vblank
1987  *                  - 3: wait on scanline
1988  *                  - 4: wait on semaphore
1989  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1990  *                       WAIT_FOR_EVENT)
1991  *     bit  36:    reserved
1992  *     bits 37-43: wait detail (for switch detail 1 to 4)
1993  *     bits 44-46: reserved
1994  *     bits 47-57: sw context id of the lrc the GT switched away from
1995  *     bits 58-63: sw counter of the lrc the GT switched away from
1996  */
1997 static inline bool
1998 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1999 {
2000 	u32 lower_dw = csb[0];
2001 	u32 upper_dw = csb[1];
2002 	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2003 	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2004 	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2005 
2006 	/*
2007 	 * The context switch detail is not guaranteed to be 5 when a preemption
2008 	 * occurs, so we can't just check for that. The check below works for
2009 	 * all the cases we care about, including preemptions of WAIT
2010 	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2011 	 * would require some extra handling, but we don't support that.
2012 	 */
2013 	if (!ctx_away_valid || new_queue) {
2014 		GEM_BUG_ON(!ctx_to_valid);
2015 		return true;
2016 	}
2017 
2018 	/*
2019 	 * switch detail = 5 is covered by the case above and we do not expect a
2020 	 * context switch on an unsuccessful wait instruction since we always
2021 	 * use polling mode.
2022 	 */
2023 	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2024 	return false;
2025 }
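
/*
 * Purely illustrative sketch (not part of the driver): pulling a few of the
 * Gen12 CSB fields documented above out of the two dwords with plain shifts
 * and masks. The helper name and the locals are hypothetical; the real code
 * only needs the tests performed in gen12_csb_parse().
 */
static inline void gen12_csb_decode_example(const u32 *csb)
{
	u32 lower_dw = csb[0];
	u32 upper_dw = csb[1];

	bool new_queue    = lower_dw & BIT(0);		/* bit  0 */
	u32 engine_class  = (lower_dw >> 3) & 0x7;	/* bits 3-5 */
	u32 engine_inst   = (lower_dw >> 6) & 0x3f;	/* bits 6-11 */
	u32 ctx_to_id     = (lower_dw >> 15) & 0x7ff;	/* bits 15-25 */
	u32 switch_detail = upper_dw & 0xf;		/* bits 32-35 */
	u32 ctx_away_id   = (upper_dw >> 15) & 0x7ff;	/* bits 47-57 */

	/* Unused: this exists only to show the layout above in code. */
	(void)new_queue; (void)engine_class; (void)engine_inst;
	(void)ctx_to_id; (void)switch_detail; (void)ctx_away_id;
}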
2026 
2027 static inline bool
2028 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2029 {
2030 	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2031 }
2032 
2033 static void process_csb(struct intel_engine_cs *engine)
2034 {
2035 	struct intel_engine_execlists * const execlists = &engine->execlists;
2036 	const u32 * const buf = execlists->csb_status;
2037 	const u8 num_entries = execlists->csb_size;
2038 	u8 head, tail;
2039 
2040 	/*
2041 	 * As we modify our execlists state tracking we require exclusive
2042 	 * access. Either we are inside the tasklet, or the tasklet is disabled
2043 	 * and we assume that is only inside the reset paths and so serialised.
2044 	 */
2045 	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2046 		   !reset_in_progress(execlists));
2047 	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2048 
2049 	/*
2050 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2051 	 * When reading from the csb_write mmio register, we have to be
2052 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2053 	 * the low 4 bits. As it happens we know the next 4 bits are always
2054 	 * zero and so we can simply mask off the low u8 of the register
2055 	 * and treat it identically to reading from the HWSP (without having
2056 	 * to use explicit shifting and masking, and probably bifurcating
2057 	 * the code to handle the legacy mmio read).
2058 	 */
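	/*
	 * Illustrative only (an assumption about what the explicit form
	 * would look like, not code the driver needs): masking the write
	 * pointer by hand would read
	 *
	 *	tail = READ_ONCE(*execlists->csb_write) & 0xf;
	 *
	 * keeping just the low 4 bits; since the next 4 bits read as zero,
	 * truncating to a u8 below is equivalent.
	 */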
2059 	head = execlists->csb_head;
2060 	tail = READ_ONCE(*execlists->csb_write);
2061 	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
2062 	if (unlikely(head == tail))
2063 		return;
2064 
2065 	/*
2066 	 * Hopefully paired with a wmb() in HW!
2067 	 *
2068 	 * We must complete the read of the write pointer before any reads
2069 	 * from the CSB, so that we do not see stale values. Without an rmb
2070 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2071 	 * we perform the READ_ONCE(*csb_write).
2072 	 */
2073 	rmb();
2074 
2075 	do {
2076 		bool promote;
2077 
2078 		if (++head == num_entries)
2079 			head = 0;
2080 
2081 		/*
2082 		 * We are flying near dragons again.
2083 		 *
2084 		 * We hold a reference to the request in execlist_port[]
2085 		 * but no more than that. We are operating in softirq
2086 		 * context and so cannot hold any mutex or sleep. That
2087 		 * prevents us stopping the requests we are processing
2088 		 * in port[] from being retired simultaneously (the
2089 		 * breadcrumb will be complete before we see the
2090 		 * context-switch). As we only hold the reference to the
2091 		 * request, any pointer chasing underneath the request
2092 		 * is subject to a potential use-after-free. Thus we
2093 		 * store all of the bookkeeping within port[] as
2094 		 * required, and avoid using unguarded pointers beneath
2095 		 * request itself. The same applies to the atomic
2096 		 * status notifier.
2097 		 */
2098 
2099 		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
2100 			  engine->name, head,
2101 			  buf[2 * head + 0], buf[2 * head + 1]);
2102 
2103 		if (INTEL_GEN(engine->i915) >= 12)
2104 			promote = gen12_csb_parse(execlists, buf + 2 * head);
2105 		else
2106 			promote = gen8_csb_parse(execlists, buf + 2 * head);
2107 		if (promote) {
2108 			struct i915_request * const *old = execlists->active;
2109 
2110 			/* Point active to the new ELSP; prevent overwriting */
2111 			WRITE_ONCE(execlists->active, execlists->pending);
2112 			set_timeslice(engine);
2113 
2114 			if (!inject_preempt_hang(execlists))
2115 				ring_set_paused(engine, 0);
2116 
2117 			/* cancel old inflight, prepare for switch */
2118 			trace_ports(execlists, "preempted", old);
2119 			while (*old)
2120 				execlists_schedule_out(*old++);
2121 
2122 			/* switch pending to inflight */
2123 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2124 			WRITE_ONCE(execlists->active,
2125 				   memcpy(execlists->inflight,
2126 					  execlists->pending,
2127 					  execlists_num_ports(execlists) *
2128 					  sizeof(*execlists->pending)));
2129 
2130 			WRITE_ONCE(execlists->pending[0], NULL);
2131 		} else {
2132 			GEM_BUG_ON(!*execlists->active);
2133 
2134 			/* port0 completed, advanced to port1 */
2135 			trace_ports(execlists, "completed", execlists->active);
2136 
2137 			/*
2138 			 * We rely on the hardware being strongly
2139 			 * ordered, that the breadcrumb write is
2140 			 * coherent (visible from the CPU) before the
2141 			 * user interrupt and CSB is processed.
2142 			 */
2143 			GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2144 				   !reset_in_progress(execlists));
2145 			execlists_schedule_out(*execlists->active++);
2146 
2147 			GEM_BUG_ON(execlists->active - execlists->inflight >
2148 				   execlists_num_ports(execlists));
2149 		}
2150 	} while (head != tail);
2151 
2152 	execlists->csb_head = head;
2153 
2154 	/*
2155 	 * Gen11 has proven to fail wrt global observation point between
2156 	 * entry and tail update, failing on the ordering and thus
2157 	 * we see an old entry in the context status buffer.
2158 	 *
2159 	 * Forcibly evict out entries for the next gpu csb update,
2160 	 * to increase the odds that we get fresh entries with non-working
2161 	 * hardware. The cost for doing so comes out mostly in the wash, as
2162 	 * hardware, working or not, will need to do the
2163 	 * invalidation before.
2164 	 */
2165 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2166 }
2167 
2168 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2169 {
2170 	lockdep_assert_held(&engine->active.lock);
2171 	if (!engine->execlists.pending[0]) {
2172 		rcu_read_lock(); /* protect peeking at execlists->active */
2173 		execlists_dequeue(engine);
2174 		rcu_read_unlock();
2175 	}
2176 }
2177 
2178 static noinline void preempt_reset(struct intel_engine_cs *engine)
2179 {
2180 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2181 	unsigned long *lock = &engine->gt->reset.flags;
2182 
2183 	if (i915_modparams.reset < 3)
2184 		return;
2185 
2186 	if (test_and_set_bit(bit, lock))
2187 		return;
2188 
2189 	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2190 	tasklet_disable_nosync(&engine->execlists.tasklet);
2191 
2192 	GEM_TRACE("%s: preempt timeout %lu+%ums\n",
2193 		  engine->name,
2194 		  READ_ONCE(engine->props.preempt_timeout_ms),
2195 		  jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2196 	intel_engine_reset(engine, "preemption time out");
2197 
2198 	tasklet_enable(&engine->execlists.tasklet);
2199 	clear_and_wake_up_bit(bit, lock);
2200 }
2201 
2202 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2203 {
2204 	const struct timer_list *t = &engine->execlists.preempt;
2205 
2206 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2207 		return false;
2208 
2209 	if (!timer_expired(t))
2210 		return false;
2211 
2212 	return READ_ONCE(engine->execlists.pending[0]);
2213 }
2214 
2215 /*
2216  * Check the unread Context Status Buffers and manage the submission of new
2217  * contexts to the ELSP accordingly.
2218  */
2219 static void execlists_submission_tasklet(unsigned long data)
2220 {
2221 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2222 	bool timeout = preempt_timeout(engine);
2223 
2224 	process_csb(engine);
2225 	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2226 		unsigned long flags;
2227 
2228 		spin_lock_irqsave(&engine->active.lock, flags);
2229 		__execlists_submission_tasklet(engine);
2230 		spin_unlock_irqrestore(&engine->active.lock, flags);
2231 
2232 		/* Recheck after serialising with direct-submission */
2233 		if (timeout && preempt_timeout(engine))
2234 			preempt_reset(engine);
2235 	}
2236 }
2237 
2238 static void __execlists_kick(struct intel_engine_execlists *execlists)
2239 {
2240 	/* Kick the tasklet for some interrupt coalescing and reset handling */
2241 	tasklet_hi_schedule(&execlists->tasklet);
2242 }
2243 
2244 #define execlists_kick(t, member) \
2245 	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2246 
2247 static void execlists_timeslice(struct timer_list *timer)
2248 {
2249 	execlists_kick(timer, timer);
2250 }
2251 
2252 static void execlists_preempt(struct timer_list *timer)
2253 {
2254 	execlists_kick(timer, preempt);
2255 }
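
/*
 * Illustrative expansion of the helper macro above: for the preempt timer,
 * execlists_kick(timer, preempt) becomes
 *
 *	__execlists_kick(container_of(timer, struct intel_engine_execlists,
 *				      preempt));
 *
 * i.e. the timer_list pointer handed to the callback is converted back to
 * its containing execlists state before the tasklet is kicked.
 */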
2256 
2257 static void queue_request(struct intel_engine_cs *engine,
2258 			  struct i915_sched_node *node,
2259 			  int prio)
2260 {
2261 	GEM_BUG_ON(!list_empty(&node->link));
2262 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
2263 }
2264 
2265 static void __submit_queue_imm(struct intel_engine_cs *engine)
2266 {
2267 	struct intel_engine_execlists * const execlists = &engine->execlists;
2268 
2269 	if (reset_in_progress(execlists))
2270 		return; /* defer until we restart the engine following reset */
2271 
2272 	if (execlists->tasklet.func == execlists_submission_tasklet)
2273 		__execlists_submission_tasklet(engine);
2274 	else
2275 		tasklet_hi_schedule(&execlists->tasklet);
2276 }
2277 
2278 static void submit_queue(struct intel_engine_cs *engine,
2279 			 const struct i915_request *rq)
2280 {
2281 	struct intel_engine_execlists *execlists = &engine->execlists;
2282 
2283 	if (rq_prio(rq) <= execlists->queue_priority_hint)
2284 		return;
2285 
2286 	execlists->queue_priority_hint = rq_prio(rq);
2287 	__submit_queue_imm(engine);
2288 }
2289 
2290 static void execlists_submit_request(struct i915_request *request)
2291 {
2292 	struct intel_engine_cs *engine = request->engine;
2293 	unsigned long flags;
2294 
2295 	/* Will be called from irq-context when using foreign fences. */
2296 	spin_lock_irqsave(&engine->active.lock, flags);
2297 
2298 	queue_request(engine, &request->sched, rq_prio(request));
2299 
2300 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2301 	GEM_BUG_ON(list_empty(&request->sched.link));
2302 
2303 	submit_queue(engine, request);
2304 
2305 	spin_unlock_irqrestore(&engine->active.lock, flags);
2306 }
2307 
2308 static void __execlists_context_fini(struct intel_context *ce)
2309 {
2310 	intel_ring_put(ce->ring);
2311 	i915_vma_put(ce->state);
2312 }
2313 
2314 static void execlists_context_destroy(struct kref *kref)
2315 {
2316 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2317 
2318 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2319 	GEM_BUG_ON(intel_context_is_pinned(ce));
2320 
2321 	if (ce->state)
2322 		__execlists_context_fini(ce);
2323 
2324 	intel_context_fini(ce);
2325 	intel_context_free(ce);
2326 }
2327 
2328 static void
2329 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2330 {
2331 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2332 		return;
2333 
2334 	vaddr += engine->context_size;
2335 
2336 	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
2337 }
2338 
2339 static void
2340 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2341 {
2342 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2343 		return;
2344 
2345 	vaddr += engine->context_size;
2346 
2347 	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
2348 		dev_err_once(engine->i915->drm.dev,
2349 			     "%s context redzone overwritten!\n",
2350 			     engine->name);
2351 }
2352 
2353 static void execlists_context_unpin(struct intel_context *ce)
2354 {
2355 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2356 		      ce->engine);
2357 
2358 	i915_gem_object_unpin_map(ce->state->obj);
2359 	intel_ring_reset(ce->ring, ce->ring->tail);
2360 }
2361 
2362 static void
2363 __execlists_update_reg_state(const struct intel_context *ce,
2364 			     const struct intel_engine_cs *engine)
2365 {
2366 	struct intel_ring *ring = ce->ring;
2367 	u32 *regs = ce->lrc_reg_state;
2368 
2369 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2370 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2371 
2372 	regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma);
2373 	regs[CTX_RING_HEAD] = ring->head;
2374 	regs[CTX_RING_TAIL] = ring->tail;
2375 
2376 	/* RPCS */
2377 	if (engine->class == RENDER_CLASS) {
2378 		regs[CTX_R_PWR_CLK_STATE] =
2379 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2380 
2381 		i915_oa_init_reg_state(ce, engine);
2382 	}
2383 }
2384 
2385 static int
2386 __execlists_context_pin(struct intel_context *ce,
2387 			struct intel_engine_cs *engine)
2388 {
2389 	void *vaddr;
2390 	int ret;
2391 
2392 	GEM_BUG_ON(!ce->state);
2393 
2394 	ret = intel_context_active_acquire(ce);
2395 	if (ret)
2396 		goto err;
2397 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2398 
2399 	vaddr = i915_gem_object_pin_map(ce->state->obj,
2400 					i915_coherent_map_type(engine->i915) |
2401 					I915_MAP_OVERRIDE);
2402 	if (IS_ERR(vaddr)) {
2403 		ret = PTR_ERR(vaddr);
2404 		goto unpin_active;
2405 	}
2406 
2407 	ce->lrc_desc = lrc_descriptor(ce, engine);
2408 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2409 	__execlists_update_reg_state(ce, engine);
2410 
2411 	return 0;
2412 
2413 unpin_active:
2414 	intel_context_active_release(ce);
2415 err:
2416 	return ret;
2417 }
2418 
2419 static int execlists_context_pin(struct intel_context *ce)
2420 {
2421 	return __execlists_context_pin(ce, ce->engine);
2422 }
2423 
2424 static int execlists_context_alloc(struct intel_context *ce)
2425 {
2426 	return __execlists_context_alloc(ce, ce->engine);
2427 }
2428 
2429 static void execlists_context_reset(struct intel_context *ce)
2430 {
2431 	/*
2432 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
2433 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
2434 	 * that stored in context. As we only write new commands from
2435 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
2436 	 * starts reading from its RING_HEAD from the context, it may try to
2437 	 * execute that junk and die.
2438 	 *
2439 	 * The contexts that are still pinned on resume belong to the
2440 	 * kernel, and are local to each engine. All other contexts will
2441 	 * have their head/tail sanitized upon pinning before use, so they
2442 	 * will never see garbage.
2443 	 *
2444 	 * So to avoid that we reset the context images upon resume. For
2445 	 * simplicity, we just zero everything out.
2446 	 */
2447 	intel_ring_reset(ce->ring, 0);
2448 	__execlists_update_reg_state(ce, ce->engine);
2449 }
2450 
2451 static const struct intel_context_ops execlists_context_ops = {
2452 	.alloc = execlists_context_alloc,
2453 
2454 	.pin = execlists_context_pin,
2455 	.unpin = execlists_context_unpin,
2456 
2457 	.enter = intel_context_enter_engine,
2458 	.exit = intel_context_exit_engine,
2459 
2460 	.reset = execlists_context_reset,
2461 	.destroy = execlists_context_destroy,
2462 };
2463 
2464 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2465 {
2466 	u32 *cs;
2467 
2468 	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2469 
2470 	cs = intel_ring_begin(rq, 6);
2471 	if (IS_ERR(cs))
2472 		return PTR_ERR(cs);
2473 
2474 	/*
2475 	 * Check if we have been preempted before we even get started.
2476 	 *
2477 	 * After this point i915_request_started() reports true, even if
2478 	 * we get preempted and so are no longer running.
2479 	 */
2480 	*cs++ = MI_ARB_CHECK;
2481 	*cs++ = MI_NOOP;
2482 
2483 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2484 	*cs++ = i915_request_timeline(rq)->hwsp_offset;
2485 	*cs++ = 0;
2486 	*cs++ = rq->fence.seqno - 1;
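	/*
	 * Note (sketch of the intent, not new behaviour): writing seqno - 1
	 * into the HWSP is what flips i915_request_started() to true, as
	 * that helper checks whether the timeline seqno has passed the value
	 * immediately preceding this request's own.
	 */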
2487 
2488 	intel_ring_advance(rq, cs);
2489 
2490 	/* Record the updated position of the request's payload */
2491 	rq->infix = intel_ring_offset(rq, cs);
2492 
2493 	return 0;
2494 }
2495 
2496 static int execlists_request_alloc(struct i915_request *request)
2497 {
2498 	int ret;
2499 
2500 	GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
2501 
2502 	/*
2503 	 * Flush enough space to reduce the likelihood of waiting after
2504 	 * we start building the request - in which case we will just
2505 	 * have to repeat work.
2506 	 */
2507 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2508 
2509 	/*
2510 	 * Note that after this point, we have committed to using
2511 	 * this request as it is being used to both track the
2512 	 * state of engine initialisation and liveness of the
2513 	 * golden renderstate above. Think twice before you try
2514 	 * to cancel/unwind this request now.
2515 	 */
2516 
2517 	/* Unconditionally invalidate GPU caches and TLBs. */
2518 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2519 	if (ret)
2520 		return ret;
2521 
2522 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2523 	return 0;
2524 }
2525 
2526 /*
2527  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
2528  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
2529  * but there is a slight complication as this is applied in WA batch where the
2530  * values are only initialized once so we cannot take register value at the
2531  * beginning and reuse it further; hence we save its value to memory, upload a
2532  * constant value with bit21 set and then we restore it back with the saved value.
2533  * To simplify the WA, a constant value is formed by using the default value
2534  * of this register. This shouldn't be a problem because we are only modifying
2535 	 * it for a short period and this batch is non-preemptible. We can of course
2536  * use additional instructions that read the actual value of the register
2537  * at that time and set our bit of interest but it makes the WA complicated.
2538  *
2539  * This WA is also required for Gen9 so extracting as a function avoids
2540  * code duplication.
2541  */
2542 static u32 *
2543 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2544 {
2545 	/* NB no one else is allowed to scribble over scratch + 256! */
2546 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2547 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2548 	*batch++ = intel_gt_scratch_offset(engine->gt,
2549 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2550 	*batch++ = 0;
2551 
2552 	*batch++ = MI_LOAD_REGISTER_IMM(1);
2553 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2554 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2555 
2556 	batch = gen8_emit_pipe_control(batch,
2557 				       PIPE_CONTROL_CS_STALL |
2558 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
2559 				       0);
2560 
2561 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2562 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2563 	*batch++ = intel_gt_scratch_offset(engine->gt,
2564 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2565 	*batch++ = 0;
2566 
2567 	return batch;
2568 }
2569 
2570 /*
2571  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
2572  * initialized at the beginning and shared across all contexts but this field
2573  * helps us to have multiple batches at different offsets and select them based
2574 	 * on a criterion. At the moment this batch always starts at the beginning of the page
2575  * and at this point we don't have multiple wa_ctx batch buffers.
2576  *
2577 	 * The number of WAs applied is not known at the beginning; we use this field
2578 	 * to return the number of DWORDS written.
2579  *
2580  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
2581  * so it adds NOOPs as padding to make it cacheline aligned.
2582 	 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
2583 	 * together make a complete batch buffer.
2584  */
2585 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2586 {
2587 	/* WaDisableCtxRestoreArbitration:bdw,chv */
2588 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2589 
2590 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2591 	if (IS_BROADWELL(engine->i915))
2592 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2593 
2594 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2595 	/* Actual scratch location is at 128 bytes offset */
2596 	batch = gen8_emit_pipe_control(batch,
2597 				       PIPE_CONTROL_FLUSH_L3 |
2598 				       PIPE_CONTROL_STORE_DATA_INDEX |
2599 				       PIPE_CONTROL_CS_STALL |
2600 				       PIPE_CONTROL_QW_WRITE,
2601 				       LRC_PPHWSP_SCRATCH_ADDR);
2602 
2603 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2604 
2605 	/* Pad to end of cacheline */
2606 	while ((unsigned long)batch % CACHELINE_BYTES)
2607 		*batch++ = MI_NOOP;
2608 
2609 	/*
2610 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2611 	 * execution depends on the length specified in terms of cache lines
2612 	 * in the register CTX_RCS_INDIRECT_CTX
2613 	 */
2614 
2615 	return batch;
2616 }
2617 
2618 struct lri {
2619 	i915_reg_t reg;
2620 	u32 value;
2621 };
2622 
2623 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2624 {
2625 	GEM_BUG_ON(!count || count > 63);
2626 
2627 	*batch++ = MI_LOAD_REGISTER_IMM(count);
2628 	do {
2629 		*batch++ = i915_mmio_reg_offset(lri->reg);
2630 		*batch++ = lri->value;
2631 	} while (lri++, --count);
2632 	*batch++ = MI_NOOP;
2633 
2634 	return batch;
2635 }
2636 
2637 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2638 {
2639 	static const struct lri lri[] = {
2640 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2641 		{
2642 			COMMON_SLICE_CHICKEN2,
2643 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2644 				       0),
2645 		},
2646 
2647 		/* BSpec: 11391 */
2648 		{
2649 			FF_SLICE_CHICKEN,
2650 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2651 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2652 		},
2653 
2654 		/* BSpec: 11299 */
2655 		{
2656 			_3D_CHICKEN3,
2657 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2658 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2659 		}
2660 	};
2661 
2662 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2663 
2664 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2665 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2666 
2667 	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
2668 	batch = gen8_emit_pipe_control(batch,
2669 				       PIPE_CONTROL_FLUSH_L3 |
2670 				       PIPE_CONTROL_STORE_DATA_INDEX |
2671 				       PIPE_CONTROL_CS_STALL |
2672 				       PIPE_CONTROL_QW_WRITE,
2673 				       LRC_PPHWSP_SCRATCH_ADDR);
2674 
2675 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2676 
2677 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
2678 	if (HAS_POOLED_EU(engine->i915)) {
2679 		/*
2680 		 * EU pool configuration is setup along with golden context
2681 		 * during context initialization. This value depends on
2682 		 * device type (2x6 or 3x6) and needs to be updated based
2683 		 * on which subslice is disabled, especially for 2x6
2684 		 * devices; however, it is safe to load the default
2685 		 * configuration of a 3x6 device instead of masking off
2686 		 * corresponding bits because HW ignores bits of a disabled
2687 		 * subslice and drops down to appropriate config. Please
2688 		 * see render_state_setup() in i915_gem_render_state.c for
2689 		 * possible configurations, to avoid duplication they are
2690 		 * not shown here again.
2691 		 */
2692 		*batch++ = GEN9_MEDIA_POOL_STATE;
2693 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
2694 		*batch++ = 0x00777000;
2695 		*batch++ = 0;
2696 		*batch++ = 0;
2697 		*batch++ = 0;
2698 	}
2699 
2700 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2701 
2702 	/* Pad to end of cacheline */
2703 	while ((unsigned long)batch % CACHELINE_BYTES)
2704 		*batch++ = MI_NOOP;
2705 
2706 	return batch;
2707 }
2708 
2709 static u32 *
2710 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2711 {
2712 	int i;
2713 
2714 	/*
2715 	 * WaPipeControlBefore3DStateSamplePattern: cnl
2716 	 *
2717 	 * Ensure the engine is idle prior to programming a
2718 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2719 	 */
2720 	batch = gen8_emit_pipe_control(batch,
2721 				       PIPE_CONTROL_CS_STALL,
2722 				       0);
2723 	/*
2724 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2725 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2726 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2727 	 * confusing. Since gen8_emit_pipe_control() already advances the
2728 	 * batch by 6 dwords, we advance the other 10 here, completing a
2729 	 * cacheline. It's not clear if the workaround requires this padding
2730 	 * before other commands, or if it's just the regular padding we would
2731 	 * already have for the workaround bb, so leave it here for now.
2732 	 */
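	/*
	 * Worked out (assuming CACHELINE_BYTES == 64): the PIPE_CONTROL
	 * above emits 6 dwords, the loop below emits 10 more, and
	 * 16 dwords * 4 bytes = 64 bytes, i.e. exactly one cacheline.
	 */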
2733 	for (i = 0; i < 10; i++)
2734 		*batch++ = MI_NOOP;
2735 
2736 	/* Pad to end of cacheline */
2737 	while ((unsigned long)batch % CACHELINE_BYTES)
2738 		*batch++ = MI_NOOP;
2739 
2740 	return batch;
2741 }
2742 
2743 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2744 
2745 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2746 {
2747 	struct drm_i915_gem_object *obj;
2748 	struct i915_vma *vma;
2749 	int err;
2750 
2751 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2752 	if (IS_ERR(obj))
2753 		return PTR_ERR(obj);
2754 
2755 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2756 	if (IS_ERR(vma)) {
2757 		err = PTR_ERR(vma);
2758 		goto err;
2759 	}
2760 
2761 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2762 	if (err)
2763 		goto err;
2764 
2765 	engine->wa_ctx.vma = vma;
2766 	return 0;
2767 
2768 err:
2769 	i915_gem_object_put(obj);
2770 	return err;
2771 }
2772 
2773 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2774 {
2775 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2776 }
2777 
2778 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2779 
2780 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2781 {
2782 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2783 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2784 					    &wa_ctx->per_ctx };
2785 	wa_bb_func_t wa_bb_fn[2];
2786 	struct page *page;
2787 	void *batch, *batch_ptr;
2788 	unsigned int i;
2789 	int ret;
2790 
2791 	if (engine->class != RENDER_CLASS)
2792 		return 0;
2793 
2794 	switch (INTEL_GEN(engine->i915)) {
2795 	case 12:
2796 	case 11:
2797 		return 0;
2798 	case 10:
2799 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
2800 		wa_bb_fn[1] = NULL;
2801 		break;
2802 	case 9:
2803 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
2804 		wa_bb_fn[1] = NULL;
2805 		break;
2806 	case 8:
2807 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
2808 		wa_bb_fn[1] = NULL;
2809 		break;
2810 	default:
2811 		MISSING_CASE(INTEL_GEN(engine->i915));
2812 		return 0;
2813 	}
2814 
2815 	ret = lrc_setup_wa_ctx(engine);
2816 	if (ret) {
2817 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2818 		return ret;
2819 	}
2820 
2821 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2822 	batch = batch_ptr = kmap_atomic(page);
2823 
2824 	/*
2825 	 * Emit the two workaround batch buffers, recording the offset from the
2826 	 * start of the workaround batch buffer object for each and their
2827 	 * respective sizes.
2828 	 */
2829 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2830 		wa_bb[i]->offset = batch_ptr - batch;
2831 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2832 						  CACHELINE_BYTES))) {
2833 			ret = -EINVAL;
2834 			break;
2835 		}
2836 		if (wa_bb_fn[i])
2837 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2838 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2839 	}
2840 
2841 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2842 
2843 	kunmap_atomic(batch);
2844 	if (ret)
2845 		lrc_destroy_wa_ctx(engine);
2846 
2847 	return ret;
2848 }
2849 
2850 static void enable_execlists(struct intel_engine_cs *engine)
2851 {
2852 	u32 mode;
2853 
2854 	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2855 
2856 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2857 
2858 	if (INTEL_GEN(engine->i915) >= 11)
2859 		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2860 	else
2861 		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2862 	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2863 
2864 	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2865 
2866 	ENGINE_WRITE_FW(engine,
2867 			RING_HWS_PGA,
2868 			i915_ggtt_offset(engine->status_page.vma));
2869 	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2870 }
2871 
2872 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2873 {
2874 	bool unexpected = false;
2875 
2876 	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2877 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2878 		unexpected = true;
2879 	}
2880 
2881 	return unexpected;
2882 }
2883 
2884 static int execlists_resume(struct intel_engine_cs *engine)
2885 {
2886 	intel_engine_apply_workarounds(engine);
2887 	intel_engine_apply_whitelist(engine);
2888 
2889 	intel_mocs_init_engine(engine);
2890 
2891 	intel_engine_reset_breadcrumbs(engine);
2892 
2893 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2894 		struct drm_printer p = drm_debug_printer(__func__);
2895 
2896 		intel_engine_dump(engine, &p, NULL);
2897 	}
2898 
2899 	enable_execlists(engine);
2900 
2901 	return 0;
2902 }
2903 
2904 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2905 {
2906 	struct intel_engine_execlists * const execlists = &engine->execlists;
2907 	unsigned long flags;
2908 
2909 	GEM_TRACE("%s: depth<-%d\n", engine->name,
2910 		  atomic_read(&execlists->tasklet.count));
2911 
2912 	/*
2913 	 * Prevent request submission to the hardware until we have
2914 	 * completed the reset in i915_gem_reset_finish(). If a request
2915 	 * is completed by one engine, it may then queue a request
2916 	 * to a second via its execlists->tasklet *just* as we are
2917 	 * calling engine->resume() and also writing the ELSP.
2918 	 * Turning off the execlists->tasklet until the reset is over
2919 	 * prevents the race.
2920 	 */
2921 	__tasklet_disable_sync_once(&execlists->tasklet);
2922 	GEM_BUG_ON(!reset_in_progress(execlists));
2923 
2924 	/* And flush any current direct submission. */
2925 	spin_lock_irqsave(&engine->active.lock, flags);
2926 	spin_unlock_irqrestore(&engine->active.lock, flags);
2927 
2928 	/*
2929 	 * We stop engines, otherwise we might get a failed reset and a
2930 	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer
2931 	 * from a system hang if a batchbuffer is progressing when
2932 	 * the reset is issued, regardless of READY_TO_RESET ack.
2933 	 * Thus assume it is best to stop engines on all gens
2934 	 * where we have a gpu reset.
2935 	 *
2936 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2937 	 *
2938 	 * FIXME: Wa for more modern gens needs to be validated
2939 	 */
2940 	intel_engine_stop_cs(engine);
2941 }
2942 
2943 static void reset_csb_pointers(struct intel_engine_cs *engine)
2944 {
2945 	struct intel_engine_execlists * const execlists = &engine->execlists;
2946 	const unsigned int reset_value = execlists->csb_size - 1;
2947 
2948 	ring_set_paused(engine, 0);
2949 
2950 	/*
2951 	 * After a reset, the HW starts writing into CSB entry [0]. We
2952 	 * therefore have to set our HEAD pointer back one entry so that
2953 	 * the *first* entry we check is entry 0. To complicate this further,
2954 	 * as we don't wait for the first interrupt after reset, we have to
2955 	 * fake the HW write to point back to the last entry so that our
2956 	 * inline comparison of our cached head position against the last HW
2957 	 * write works even before the first interrupt.
2958 	 */
2959 	execlists->csb_head = reset_value;
2960 	WRITE_ONCE(*execlists->csb_write, reset_value);
2961 	wmb(); /* Make sure this is visible to HW (paranoia?) */
2962 
2963 	invalidate_csb_entries(&execlists->csb_status[0],
2964 			       &execlists->csb_status[reset_value]);
2965 }
2966 
2967 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
2968 {
2969 	if (INTEL_GEN(engine->i915) >= 12)
2970 		return 0x60;
2971 	else if (INTEL_GEN(engine->i915) >= 9)
2972 		return 0x54;
2973 	else if (engine->class == RENDER_CLASS)
2974 		return 0x58;
2975 	else
2976 		return -1;
2977 }
2978 
2979 static void __execlists_reset_reg_state(const struct intel_context *ce,
2980 					const struct intel_engine_cs *engine)
2981 {
2982 	u32 *regs = ce->lrc_reg_state;
2983 	int x;
2984 
2985 	x = lrc_ring_mi_mode(engine);
2986 	if (x != -1) {
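		/*
		 * RING_MI_MODE is a masked register: the high 16 bits select
		 * which of the low 16 bits to update. Setting STOP_RING << 16
		 * while clearing STOP_RING in the low half asks the HW to
		 * clear STOP_RING when the context is restored.
		 */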
2987 		regs[x + 1] &= ~STOP_RING;
2988 		regs[x + 1] |= STOP_RING << 16;
2989 	}
2990 }
2991 
2992 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2993 {
2994 	struct intel_engine_execlists * const execlists = &engine->execlists;
2995 	struct intel_context *ce;
2996 	struct i915_request *rq;
2997 
2998 	mb(); /* paranoia: read the CSB pointers from after the reset */
2999 	clflush(execlists->csb_write);
3000 	mb();
3001 
3002 	process_csb(engine); /* drain preemption events */
3003 
3004 	/* Following the reset, we need to reload the CSB read/write pointers */
3005 	reset_csb_pointers(engine);
3006 
3007 	/*
3008 	 * Save the currently executing context, even if we completed
3009 	 * its request, it was still running at the time of the
3010 	 * reset and will have been clobbered.
3011 	 */
3012 	rq = execlists_active(execlists);
3013 	if (!rq)
3014 		goto unwind;
3015 
3016 	/* We still have requests in-flight; the engine should be active */
3017 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3018 
3019 	ce = rq->hw_context;
3020 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3021 
3022 	if (i915_request_completed(rq)) {
3023 		/* Idle context; tidy up the ring so we can restart afresh */
3024 		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
3025 		goto out_replay;
3026 	}
3027 
3028 	/* Context has requests still in-flight; it should not be idle! */
3029 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3030 	rq = active_request(ce->timeline, rq);
3031 	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
3032 	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
3033 
3034 	/*
3035 	 * If this request hasn't started yet, e.g. it is waiting on a
3036 	 * semaphore, we need to avoid skipping the request or else we
3037 	 * break the signaling chain. However, if the context is corrupt
3038 	 * the request will not restart and we will be stuck with a wedged
3039 	 * device. It is quite often the case that if we issue a reset
3040 	 * while the GPU is loading the context image, the context
3041 	 * image becomes corrupt.
3042 	 *
3043 	 * Otherwise, if we have not started yet, the request should replay
3044 	 * perfectly and we do not need to flag the result as being erroneous.
3045 	 */
3046 	if (!i915_request_started(rq))
3047 		goto out_replay;
3048 
3049 	/*
3050 	 * If the request was innocent, we leave the request in the ELSP
3051 	 * and will try to replay it on restarting. The context image may
3052 	 * have been corrupted by the reset, in which case we may have
3053 	 * to service a new GPU hang, but more likely we can continue on
3054 	 * without impact.
3055 	 *
3056 	 * If the request was guilty, we presume the context is corrupt
3057 	 * and have to at least restore the RING register in the context
3058 	 * image back to the expected values to skip over the guilty request.
3059 	 */
3060 	__i915_request_reset(rq, stalled);
3061 	if (!stalled)
3062 		goto out_replay;
3063 
3064 	/*
3065 	 * We want a simple context + ring to execute the breadcrumb update.
3066 	 * We cannot rely on the context being intact across the GPU hang,
3067 	 * so clear it and rebuild just what we need for the breadcrumb.
3068 	 * All pending requests for this context will be zapped, and any
3069 	 * future request will be after userspace has had the opportunity
3070 	 * to recreate its own state.
3071 	 */
3072 	GEM_BUG_ON(!intel_context_is_pinned(ce));
3073 	restore_default_state(ce, engine);
3074 
3075 out_replay:
3076 	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
3077 		  engine->name, ce->ring->head, ce->ring->tail);
3078 	intel_ring_update_space(ce->ring);
3079 	__execlists_reset_reg_state(ce, engine);
3080 	__execlists_update_reg_state(ce, engine);
3081 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3082 
3083 unwind:
3084 	/* Push back any incomplete requests for replay after the reset. */
3085 	cancel_port_requests(execlists);
3086 	__unwind_incomplete_requests(engine);
3087 }
3088 
3089 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
3090 {
3091 	unsigned long flags;
3092 
3093 	GEM_TRACE("%s\n", engine->name);
3094 
3095 	spin_lock_irqsave(&engine->active.lock, flags);
3096 
3097 	__execlists_reset(engine, stalled);
3098 
3099 	spin_unlock_irqrestore(&engine->active.lock, flags);
3100 }
3101 
3102 static void nop_submission_tasklet(unsigned long data)
3103 {
3104 	/* The driver is wedged; don't process any more events. */
3105 }
3106 
3107 static void execlists_cancel_requests(struct intel_engine_cs *engine)
3108 {
3109 	struct intel_engine_execlists * const execlists = &engine->execlists;
3110 	struct i915_request *rq, *rn;
3111 	struct rb_node *rb;
3112 	unsigned long flags;
3113 
3114 	GEM_TRACE("%s\n", engine->name);
3115 
3116 	/*
3117 	 * Before we call engine->cancel_requests(), we should have exclusive
3118 	 * access to the submission state. This is arranged for us by the
3119 	 * caller disabling the interrupt generation, the tasklet and other
3120 	 * threads that may then access the same state, giving us a free hand
3121 	 * to reset state. However, we still need to let lockdep be aware that
3122 	 * we know this state may be accessed in hardirq context, so we
3123 	 * disable the irq around this manipulation and we want to keep
3124 	 * the spinlock focused on its duties and not accidentally conflate
3125 	 * coverage to the submission's irq state. (Similarly, although we
3126 	 * shouldn't need to disable irq around the manipulation of the
3127 	 * submission's irq state, we also wish to remind ourselves that
3128 	 * it is irq state.)
3129 	 */
3130 	spin_lock_irqsave(&engine->active.lock, flags);
3131 
3132 	__execlists_reset(engine, true);
3133 
3134 	/* Mark all executing requests as skipped. */
3135 	list_for_each_entry(rq, &engine->active.requests, sched.link)
3136 		mark_eio(rq);
3137 
3138 	/* Flush the queued requests to the timeline list (for retiring). */
3139 	while ((rb = rb_first_cached(&execlists->queue))) {
3140 		struct i915_priolist *p = to_priolist(rb);
3141 		int i;
3142 
3143 		priolist_for_each_request_consume(rq, rn, p, i) {
3144 			mark_eio(rq);
3145 			__i915_request_submit(rq);
3146 		}
3147 
3148 		rb_erase_cached(&p->node, &execlists->queue);
3149 		i915_priolist_free(p);
3150 	}
3151 
3152 	/* Cancel all attached virtual engines */
3153 	while ((rb = rb_first_cached(&execlists->virtual))) {
3154 		struct virtual_engine *ve =
3155 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3156 
3157 		rb_erase_cached(rb, &execlists->virtual);
3158 		RB_CLEAR_NODE(rb);
3159 
3160 		spin_lock(&ve->base.active.lock);
3161 		rq = fetch_and_zero(&ve->request);
3162 		if (rq) {
3163 			mark_eio(rq);
3164 
3165 			rq->engine = engine;
3166 			__i915_request_submit(rq);
3167 			i915_request_put(rq);
3168 
3169 			ve->base.execlists.queue_priority_hint = INT_MIN;
3170 		}
3171 		spin_unlock(&ve->base.active.lock);
3172 	}
3173 
3174 	/* Remaining _unready_ requests will be nop'ed when submitted */
3175 
3176 	execlists->queue_priority_hint = INT_MIN;
3177 	execlists->queue = RB_ROOT_CACHED;
3178 
3179 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3180 	execlists->tasklet.func = nop_submission_tasklet;
3181 
3182 	spin_unlock_irqrestore(&engine->active.lock, flags);
3183 }
3184 
3185 static void execlists_reset_finish(struct intel_engine_cs *engine)
3186 {
3187 	struct intel_engine_execlists * const execlists = &engine->execlists;
3188 
3189 	/*
3190 	 * After a GPU reset, we may have requests to replay. Do so now while
3191 	 * we still have the forcewake to be sure that the GPU is not allowed
3192 	 * to sleep before we restart and reload a context.
3193 	 */
3194 	GEM_BUG_ON(!reset_in_progress(execlists));
3195 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3196 		execlists->tasklet.func(execlists->tasklet.data);
3197 
3198 	if (__tasklet_enable(&execlists->tasklet))
3199 		/* And kick in case we missed a new request submission. */
3200 		tasklet_hi_schedule(&execlists->tasklet);
3201 	GEM_TRACE("%s: depth->%d\n", engine->name,
3202 		  atomic_read(&execlists->tasklet.count));
3203 }
3204 
3205 static int gen8_emit_bb_start(struct i915_request *rq,
3206 			      u64 offset, u32 len,
3207 			      const unsigned int flags)
3208 {
3209 	u32 *cs;
3210 
3211 	cs = intel_ring_begin(rq, 4);
3212 	if (IS_ERR(cs))
3213 		return PTR_ERR(cs);
3214 
3215 	/*
3216 	 * WaDisableCtxRestoreArbitration:bdw,chv
3217 	 *
3218 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
3219 	 * particular all the gen that do not need the w/a at all!), if we
3220 	 * took care to make sure that on every switch into this context
3221 	 * (both ordinary and for preemption) arbitration was enabled,
3222 	 * we would be fine.  However, for gen8 there is another w/a that
3223 	 * requires us to not preempt inside GPGPU execution, so we keep
3224 	 * arbitration disabled for gen8 batches. Arbitration will be
3225 	 * re-enabled before we close the request
3226 	 * (engine->emit_fini_breadcrumb).
3227 	 */
3228 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3229 
3230 	/* FIXME(BDW+): Address space and security selectors. */
3231 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3232 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3233 	*cs++ = lower_32_bits(offset);
3234 	*cs++ = upper_32_bits(offset);
3235 
3236 	intel_ring_advance(rq, cs);
3237 
3238 	return 0;
3239 }
3240 
3241 static int gen9_emit_bb_start(struct i915_request *rq,
3242 			      u64 offset, u32 len,
3243 			      const unsigned int flags)
3244 {
3245 	u32 *cs;
3246 
3247 	cs = intel_ring_begin(rq, 6);
3248 	if (IS_ERR(cs))
3249 		return PTR_ERR(cs);
3250 
3251 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3252 
3253 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3254 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3255 	*cs++ = lower_32_bits(offset);
3256 	*cs++ = upper_32_bits(offset);
3257 
3258 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3259 	*cs++ = MI_NOOP;
3260 
3261 	intel_ring_advance(rq, cs);
3262 
3263 	return 0;
3264 }
3265 
3266 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3267 {
3268 	ENGINE_WRITE(engine, RING_IMR,
3269 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
3270 	ENGINE_POSTING_READ(engine, RING_IMR);
3271 }
3272 
3273 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3274 {
3275 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3276 }
3277 
3278 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3279 {
3280 	u32 cmd, *cs;
3281 
3282 	cs = intel_ring_begin(request, 4);
3283 	if (IS_ERR(cs))
3284 		return PTR_ERR(cs);
3285 
3286 	cmd = MI_FLUSH_DW + 1;
3287 
3288 	/* We always require a command barrier so that subsequent
3289 	 * commands, such as breadcrumb interrupts, are strictly ordered
3290 	 * wrt the contents of the write cache being flushed to memory
3291 	 * (and thus being coherent from the CPU).
3292 	 */
3293 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3294 
3295 	if (mode & EMIT_INVALIDATE) {
3296 		cmd |= MI_INVALIDATE_TLB;
3297 		if (request->engine->class == VIDEO_DECODE_CLASS)
3298 			cmd |= MI_INVALIDATE_BSD;
3299 	}
3300 
3301 	*cs++ = cmd;
3302 	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3303 	*cs++ = 0; /* upper addr */
3304 	*cs++ = 0; /* value */
3305 	intel_ring_advance(request, cs);
3306 
3307 	return 0;
3308 }
3309 
3310 static int gen8_emit_flush_render(struct i915_request *request,
3311 				  u32 mode)
3312 {
3313 	bool vf_flush_wa = false, dc_flush_wa = false;
3314 	u32 *cs, flags = 0;
3315 	int len;
3316 
3317 	flags |= PIPE_CONTROL_CS_STALL;
3318 
3319 	if (mode & EMIT_FLUSH) {
3320 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3321 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3322 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3323 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3324 	}
3325 
3326 	if (mode & EMIT_INVALIDATE) {
3327 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3328 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3329 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3330 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3331 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3332 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3333 		flags |= PIPE_CONTROL_QW_WRITE;
3334 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3335 
3336 		/*
3337 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3338 		 * pipe control.
3339 		 */
3340 		if (IS_GEN(request->i915, 9))
3341 			vf_flush_wa = true;
3342 
3343 		/* WaForGAMHang:kbl */
3344 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3345 			dc_flush_wa = true;
3346 	}
3347 
3348 	len = 6;
3349 
3350 	if (vf_flush_wa)
3351 		len += 6;
3352 
3353 	if (dc_flush_wa)
3354 		len += 12;
3355 
3356 	cs = intel_ring_begin(request, len);
3357 	if (IS_ERR(cs))
3358 		return PTR_ERR(cs);
3359 
3360 	if (vf_flush_wa)
3361 		cs = gen8_emit_pipe_control(cs, 0, 0);
3362 
3363 	if (dc_flush_wa)
3364 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3365 					    0);
3366 
3367 	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3368 
3369 	if (dc_flush_wa)
3370 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3371 
3372 	intel_ring_advance(request, cs);
3373 
3374 	return 0;
3375 }
3376 
3377 static int gen11_emit_flush_render(struct i915_request *request,
3378 				   u32 mode)
3379 {
3380 	if (mode & EMIT_FLUSH) {
3381 		u32 *cs;
3382 		u32 flags = 0;
3383 
3384 		flags |= PIPE_CONTROL_CS_STALL;
3385 
3386 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3387 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3388 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3389 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3390 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3391 		flags |= PIPE_CONTROL_QW_WRITE;
3392 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3393 
3394 		cs = intel_ring_begin(request, 6);
3395 		if (IS_ERR(cs))
3396 			return PTR_ERR(cs);
3397 
3398 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3399 		intel_ring_advance(request, cs);
3400 	}
3401 
3402 	if (mode & EMIT_INVALIDATE) {
3403 		u32 *cs;
3404 		u32 flags = 0;
3405 
3406 		flags |= PIPE_CONTROL_CS_STALL;
3407 
3408 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3409 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3410 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3411 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3412 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3413 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3414 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3415 		flags |= PIPE_CONTROL_QW_WRITE;
3416 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3417 
3418 		cs = intel_ring_begin(request, 6);
3419 		if (IS_ERR(cs))
3420 			return PTR_ERR(cs);
3421 
3422 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3423 		intel_ring_advance(request, cs);
3424 	}
3425 
3426 	return 0;
3427 }
3428 
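/*
 * On Gen12 the MI_ARB_CHECK encoding doubles as the pre-parser control:
 * bit 8 appears to act as the write-enable for the disable value carried
 * in bit 0 (an assumption following the usual masked-bit convention; the
 * bspec encoding is authoritative).
 */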
3429 static u32 preparser_disable(bool state)
3430 {
3431 	return MI_ARB_CHECK | 1 << 8 | state;
3432 }
3433 
3434 static int gen12_emit_flush_render(struct i915_request *request,
3435 				   u32 mode)
3436 {
3437 	if (mode & EMIT_FLUSH) {
3438 		u32 flags = 0;
3439 		u32 *cs;
3440 
3441 		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3442 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3443 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3444 		/* Wa_1409600907:tgl */
3445 		flags |= PIPE_CONTROL_DEPTH_STALL;
3446 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3447 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
3448 		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3449 
3450 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3451 		flags |= PIPE_CONTROL_QW_WRITE;
3452 
3453 		flags |= PIPE_CONTROL_CS_STALL;
3454 
3455 		cs = intel_ring_begin(request, 6);
3456 		if (IS_ERR(cs))
3457 			return PTR_ERR(cs);
3458 
3459 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3460 		intel_ring_advance(request, cs);
3461 	}
3462 
3463 	if (mode & EMIT_INVALIDATE) {
3464 		u32 flags = 0;
3465 		u32 *cs;
3466 
3467 		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3468 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
3469 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3470 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3471 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3472 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3473 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3474 		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3475 
3476 		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3477 		flags |= PIPE_CONTROL_QW_WRITE;
3478 
3479 		flags |= PIPE_CONTROL_CS_STALL;
3480 
3481 		cs = intel_ring_begin(request, 8);
3482 		if (IS_ERR(cs))
3483 			return PTR_ERR(cs);
3484 
3485 		/*
3486 		 * Prevent the pre-parser from skipping past the TLB
3487 		 * invalidate and loading a stale page for the batch
3488 		 * buffer / request payload.
3489 		 */
3490 		*cs++ = preparser_disable(true);
3491 
3492 		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3493 
3494 		*cs++ = preparser_disable(false);
3495 		intel_ring_advance(request, cs);
3496 
3497 		/*
3498 		 * Wa_1604544889:tgl
3499 		 */
3500 		if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
3501 			flags = 0;
3502 			flags |= PIPE_CONTROL_CS_STALL;
3503 			flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3504 
3505 			flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3506 			flags |= PIPE_CONTROL_QW_WRITE;
3507 
3508 			cs = intel_ring_begin(request, 6);
3509 			if (IS_ERR(cs))
3510 				return PTR_ERR(cs);
3511 
3512 			cs = gen8_emit_pipe_control(cs, flags,
3513 						    LRC_PPHWSP_SCRATCH_ADDR);
3514 			intel_ring_advance(request, cs);
3515 		}
3516 	}
3517 
3518 	return 0;
3519 }
3520 
3521 /*
3522  * Reserve space for 2 NOOPs at the end of each request to be
3523  * used as a workaround for not being allowed to do lite
3524  * restore with HEAD==TAIL (WaIdleLiteRestore).
3525  */
3526 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3527 {
3528 	/* Ensure there's always at least one preemption point per-request. */
3529 	*cs++ = MI_ARB_CHECK;
3530 	*cs++ = MI_NOOP;
3531 	request->wa_tail = intel_ring_offset(request, cs);
3532 
3533 	return cs;
3534 }
3535 
3536 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3537 {
3538 	*cs++ = MI_SEMAPHORE_WAIT |
3539 		MI_SEMAPHORE_GLOBAL_GTT |
3540 		MI_SEMAPHORE_POLL |
3541 		MI_SEMAPHORE_SAD_EQ_SDD;
3542 	*cs++ = 0; /* data dword: wait until the preempt semaphore reads 0 */
3543 	*cs++ = intel_hws_preempt_address(request->engine);
3544 	*cs++ = 0; /* upper address dword */
3545 
3546 	return cs;
3547 }
3548 
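/*
 * Common tail of every fini breadcrumb: the caller has already emitted the
 * seqno write, so raise the user interrupt, re-enable arbitration and, where
 * semaphores are available, emit the preempt-to-busy busywait before closing
 * with the WaIdleLiteRestore tail.
 */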
3549 static __always_inline u32*
3550 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3551 				 u32 *cs)
3552 {
3553 	*cs++ = MI_USER_INTERRUPT;
3554 
3555 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3556 	if (intel_engine_has_semaphores(request->engine))
3557 		cs = emit_preempt_busywait(request, cs);
3558 
3559 	request->tail = intel_ring_offset(request, cs);
3560 	assert_ring_tail_valid(request->ring, request->tail);
3561 
3562 	return gen8_emit_wa_tail(request, cs);
3563 }
3564 
3565 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3566 {
3567 	cs = gen8_emit_ggtt_write(cs,
3568 				  request->fence.seqno,
3569 				  i915_request_active_timeline(request)->hwsp_offset,
3570 				  0);
3571 
3572 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3573 }
3574 
3575 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3576 {
3577 	cs = gen8_emit_pipe_control(cs,
3578 				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3579 				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3580 				    PIPE_CONTROL_DC_FLUSH_ENABLE,
3581 				    0);
3582 
3583 	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3584 	cs = gen8_emit_ggtt_write_rcs(cs,
3585 				      request->fence.seqno,
3586 				      i915_request_active_timeline(request)->hwsp_offset,
3587 				      PIPE_CONTROL_FLUSH_ENABLE |
3588 				      PIPE_CONTROL_CS_STALL);
3589 
3590 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3591 }
3592 
3593 static u32 *
3594 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3595 {
3596 	cs = gen8_emit_ggtt_write_rcs(cs,
3597 				      request->fence.seqno,
3598 				      i915_request_active_timeline(request)->hwsp_offset,
3599 				      PIPE_CONTROL_CS_STALL |
3600 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3601 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3602 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3603 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3604 				      PIPE_CONTROL_FLUSH_ENABLE);
3605 
3606 	return gen8_emit_fini_breadcrumb_footer(request, cs);
3607 }
3608 
3609 /*
3610  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3611  * flush and will continue pre-fetching the instructions after it before the
3612  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3613  * BB_START/END instructions, so, even though we might pre-fetch the preamble
3614  * of the next request before the memory has been flushed, we're guaranteed that
3615  * we won't access the batch itself too early.
3616  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3617  * so, if the current request is modifying an instruction in the next request on
3618  * the same intel_context, we might pre-fetch and then execute the pre-update
3619  * instruction. To avoid this, the users of self-modifying code should either
3620  * disable the parser around the code emitting the memory writes, via a new flag
3621  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3622  * the in-kernel use-cases we've opted to use a separate context, see
3623  * reloc_gpu() as an example.
3624  * All the above applies only to the instructions themselves. Non-inline data
3625  * used by the instructions is not pre-fetched.
3626  */
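
/*
 * A minimal sketch of the MI_ARB_CHECK option, mirroring the invalidate path
 * in gen12_emit_flush_render() above ("len" is only illustrative here and
 * must cover whatever payload is emitted):
 *
 *	cs = intel_ring_begin(rq, len);
 *	*cs++ = preparser_disable(true);
 *	... emit the memory writes that patch the next request ...
 *	*cs++ = preparser_disable(false);
 *	intel_ring_advance(rq, cs);
 */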
3627 
3628 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3629 {
3630 	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3631 		MI_SEMAPHORE_GLOBAL_GTT |
3632 		MI_SEMAPHORE_POLL |
3633 		MI_SEMAPHORE_SAD_EQ_SDD;
3634 	*cs++ = 0; /* data dword: wait until the preempt semaphore reads 0 */
3635 	*cs++ = intel_hws_preempt_address(request->engine);
3636 	*cs++ = 0; /* upper address dword */
3637 	*cs++ = 0;
3638 	*cs++ = MI_NOOP;
3639 
3640 	return cs;
3641 }
3642 
3643 static __always_inline u32*
3644 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3645 {
3646 	*cs++ = MI_USER_INTERRUPT;
3647 
3648 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3649 	if (intel_engine_has_semaphores(request->engine))
3650 		cs = gen12_emit_preempt_busywait(request, cs);
3651 
3652 	request->tail = intel_ring_offset(request, cs);
3653 	assert_ring_tail_valid(request->ring, request->tail);
3654 
3655 	return gen8_emit_wa_tail(request, cs);
3656 }
3657 
3658 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3659 {
3660 	cs = gen8_emit_ggtt_write(cs,
3661 				  request->fence.seqno,
3662 				  i915_request_active_timeline(request)->hwsp_offset,
3663 				  0);
3664 
3665 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3666 }
3667 
3668 static u32 *
3669 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3670 {
3671 	cs = gen8_emit_ggtt_write_rcs(cs,
3672 				      request->fence.seqno,
3673 				      i915_request_active_timeline(request)->hwsp_offset,
3674 				      PIPE_CONTROL_CS_STALL |
3675 				      PIPE_CONTROL_TILE_CACHE_FLUSH |
3676 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3677 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3678 				      /* Wa_1409600907:tgl */
3679 				      PIPE_CONTROL_DEPTH_STALL |
3680 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
3681 				      PIPE_CONTROL_FLUSH_ENABLE |
3682 				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
3683 
3684 	return gen12_emit_fini_breadcrumb_footer(request, cs);
3685 }
3686 
3687 static void execlists_park(struct intel_engine_cs *engine)
3688 {
3689 	cancel_timer(&engine->execlists.timer);
3690 	cancel_timer(&engine->execlists.preempt);
3691 }
3692 
3693 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3694 {
3695 	engine->submit_request = execlists_submit_request;
3696 	engine->cancel_requests = execlists_cancel_requests;
3697 	engine->schedule = i915_schedule;
3698 	engine->execlists.tasklet.func = execlists_submission_tasklet;
3699 
3700 	engine->reset.prepare = execlists_reset_prepare;
3701 	engine->reset.reset = execlists_reset;
3702 	engine->reset.finish = execlists_reset_finish;
3703 
3704 	engine->park = execlists_park;
3705 	engine->unpark = NULL;
3706 
3707 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3708 	if (!intel_vgpu_active(engine->i915)) {
3709 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3710 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3711 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3712 	}
3713 
3714 	if (INTEL_GEN(engine->i915) >= 12)
3715 		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3716 }
3717 
3718 static void execlists_destroy(struct intel_engine_cs *engine)
3719 {
3720 	intel_engine_cleanup_common(engine);
3721 	lrc_destroy_wa_ctx(engine);
3722 	kfree(engine);
3723 }
3724 
3725 static void
3726 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3727 {
3728 	/* Default vfuncs which can be overridden by each engine. */
3729 
3730 	engine->destroy = execlists_destroy;
3731 	engine->resume = execlists_resume;
3732 
3733 	engine->reset.prepare = execlists_reset_prepare;
3734 	engine->reset.reset = execlists_reset;
3735 	engine->reset.finish = execlists_reset_finish;
3736 
3737 	engine->cops = &execlists_context_ops;
3738 	engine->request_alloc = execlists_request_alloc;
3739 
3740 	engine->emit_flush = gen8_emit_flush;
3741 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3742 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3743 	if (INTEL_GEN(engine->i915) >= 12)
3744 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3745 
3746 	engine->set_default_submission = intel_execlists_set_default_submission;
3747 
3748 	if (INTEL_GEN(engine->i915) < 11) {
3749 		engine->irq_enable = gen8_logical_ring_enable_irq;
3750 		engine->irq_disable = gen8_logical_ring_disable_irq;
3751 	} else {
3752 		/*
3753 		 * TODO: On Gen11 interrupt masks need to be clear
3754 		 * to allow C6 entry. Keep interrupts enabled at all times
3755 		 * and take the hit of generating extra interrupts
3756 		 * until a more refined solution exists.
3757 		 */
3758 	}
3759 	if (IS_GEN(engine->i915, 8))
3760 		engine->emit_bb_start = gen8_emit_bb_start;
3761 	else
3762 		engine->emit_bb_start = gen9_emit_bb_start;
3763 }
3764 
3765 static inline void
3766 logical_ring_default_irqs(struct intel_engine_cs *engine)
3767 {
3768 	unsigned int shift = 0;
3769 
3770 	if (INTEL_GEN(engine->i915) < 11) {
3771 		const u8 irq_shifts[] = {
3772 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
3773 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
3774 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3775 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3776 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
3777 		};
3778 
3779 		shift = irq_shifts[engine->id];
3780 	}
3781 
3782 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3783 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3784 }
3785 
3786 static void rcs_submission_override(struct intel_engine_cs *engine)
3787 {
3788 	switch (INTEL_GEN(engine->i915)) {
3789 	case 12:
3790 		engine->emit_flush = gen12_emit_flush_render;
3791 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3792 		break;
3793 	case 11:
3794 		engine->emit_flush = gen11_emit_flush_render;
3795 		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3796 		break;
3797 	default:
3798 		engine->emit_flush = gen8_emit_flush_render;
3799 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3800 		break;
3801 	}
3802 }
3803 
3804 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3805 {
3806 	tasklet_init(&engine->execlists.tasklet,
3807 		     execlists_submission_tasklet, (unsigned long)engine);
3808 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
3809 	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
3810 
3811 	logical_ring_default_vfuncs(engine);
3812 	logical_ring_default_irqs(engine);
3813 
3814 	if (engine->class == RENDER_CLASS)
3815 		rcs_submission_override(engine);
3816 
3817 	return 0;
3818 }
3819 
3820 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3821 {
3822 	struct intel_engine_execlists * const execlists = &engine->execlists;
3823 	struct drm_i915_private *i915 = engine->i915;
3824 	struct intel_uncore *uncore = engine->uncore;
3825 	u32 base = engine->mmio_base;
3826 	int ret;
3827 
3828 	ret = intel_engine_init_common(engine);
3829 	if (ret)
3830 		return ret;
3831 
3832 	if (intel_init_workaround_bb(engine))
3833 		/*
3834 		 * We continue even if we fail to initialize the WA batch
3835 		 * because we only expect rare glitches, nothing critical
3836 		 * enough to prevent us from using the GPU.
3837 		 */
3838 		DRM_ERROR("WA batch buffer initialization failed\n");
3839 
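	/*
	 * Parts with an ExecList Submission Queue load submissions through the
	 * SQ contents plus a separate control register, while older parts are
	 * driven by writing the context descriptors directly to the ELSP port.
	 */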
3840 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3841 		execlists->submit_reg = uncore->regs +
3842 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3843 		execlists->ctrl_reg = uncore->regs +
3844 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3845 	} else {
3846 		execlists->submit_reg = uncore->regs +
3847 			i915_mmio_reg_offset(RING_ELSP(base));
3848 	}
3849 
3850 	execlists->csb_status =
3851 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3852 
3853 	execlists->csb_write =
3854 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
3855 
3856 	if (INTEL_GEN(i915) < 11)
3857 		execlists->csb_size = GEN8_CSB_ENTRIES;
3858 	else
3859 		execlists->csb_size = GEN11_CSB_ENTRIES;
3860 
3861 	reset_csb_pointers(engine);
3862 
3863 	return 0;
3864 }
3865 
3866 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
3867 {
3868 	u32 indirect_ctx_offset;
3869 
3870 	switch (INTEL_GEN(engine->i915)) {
3871 	default:
3872 		MISSING_CASE(INTEL_GEN(engine->i915));
3873 		/* fall through */
3874 	case 12:
3875 		indirect_ctx_offset =
3876 			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3877 		break;
3878 	case 11:
3879 		indirect_ctx_offset =
3880 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3881 		break;
3882 	case 10:
3883 		indirect_ctx_offset =
3884 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3885 		break;
3886 	case 9:
3887 		indirect_ctx_offset =
3888 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3889 		break;
3890 	case 8:
3891 		indirect_ctx_offset =
3892 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3893 		break;
3894 	}
3895 
3896 	return indirect_ctx_offset;
3897 }
3898 
3899 
3900 static void init_common_reg_state(u32 * const regs,
3901 				  const struct intel_engine_cs *engine,
3902 				  const struct intel_ring *ring)
3903 {
3904 	regs[CTX_CONTEXT_CONTROL] =
3905 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3906 		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
3907 	if (INTEL_GEN(engine->i915) < 11)
3908 		regs[CTX_CONTEXT_CONTROL] |=
3909 			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3910 					    CTX_CTRL_RS_CTX_ENABLE);
3911 
3912 	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3913 	regs[CTX_BB_STATE] = RING_BB_PPGTT;
3914 }
3915 
3916 static void init_wa_bb_reg_state(u32 * const regs,
3917 				 const struct intel_engine_cs *engine,
3918 				 u32 pos_bb_per_ctx)
3919 {
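	/*
	 * The context image interleaves each register offset with its value,
	 * so consecutive values sit two dwords apart: pos_bb_per_ctx holds
	 * BB_PER_CTX_PTR, +2 the INDIRECT_CTX pointer/size and +4 the
	 * INDIRECT_CTX_OFFSET. This describes the layout written by
	 * set_offsets(), not an authoritative register map.
	 */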
3920 	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
3921 
3922 	if (wa_ctx->per_ctx.size) {
3923 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3924 
3925 		regs[pos_bb_per_ctx] =
3926 			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3927 	}
3928 
3929 	if (wa_ctx->indirect_ctx.size) {
3930 		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3931 
3932 		regs[pos_bb_per_ctx + 2] =
3933 			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
3934 			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3935 
3936 		regs[pos_bb_per_ctx + 4] =
3937 			intel_lr_indirect_ctx_offset(engine) << 6;
3938 	}
3939 }
3940 
3941 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
3942 {
3943 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
3944 		/* 64b PPGTT (48-bit canonical):
3945 		 * PDP0_DESCRIPTOR contains the base address of the PML4;
3946 		 * the other PDP descriptors are ignored.
3947 		 */
3948 		ASSIGN_CTX_PML4(ppgtt, regs);
3949 	} else {
3950 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
3951 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
3952 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
3953 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
3954 	}
3955 }
3956 
3957 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
3958 {
3959 	if (i915_is_ggtt(vm))
3960 		return i915_vm_to_ggtt(vm)->alias;
3961 	else
3962 		return i915_vm_to_ppgtt(vm);
3963 }
3964 
3965 static void execlists_init_reg_state(u32 *regs,
3966 				     const struct intel_context *ce,
3967 				     const struct intel_engine_cs *engine,
3968 				     const struct intel_ring *ring,
3969 				     bool close)
3970 {
3971 	/*
3972 	 * A context is actually a big batch buffer with several
3973 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3974 	 * values we are setting here are only for the first context restore:
3975 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
3976 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3977 	 * we are not initializing here).
3978 	 *
3979 	 * Must keep consistent with virtual_update_register_offsets().
3980 	 */
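	/*
	 * Schematically, the image built here looks like (a sketch, not the
	 * exact encoding):
	 *
	 *	MI_LOAD_REGISTER_IMM(N)
	 *	    <reg offset> <value>
	 *	    <reg offset> <value>
	 *	    ...
	 *	MI_LOAD_REGISTER_IMM(M)
	 *	    ...
	 *	MI_BATCH_BUFFER_END	(only when close is requested)
	 */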
3981 	u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);
3982 
3983 	if (close) { /* Close the batch; used mainly by live_lrc_layout() */
3984 		*bbe = MI_BATCH_BUFFER_END;
3985 		if (INTEL_GEN(engine->i915) >= 10)
3986 			*bbe |= BIT(0);
3987 	}
3988 
3989 	init_common_reg_state(regs, engine, ring);
3990 	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
3991 
3992 	init_wa_bb_reg_state(regs, engine,
3993 			     INTEL_GEN(engine->i915) >= 12 ?
3994 			     GEN12_CTX_BB_PER_CTX_PTR :
3995 			     CTX_BB_PER_CTX_PTR);
3996 }
3997 
3998 static int
3999 populate_lr_context(struct intel_context *ce,
4000 		    struct drm_i915_gem_object *ctx_obj,
4001 		    struct intel_engine_cs *engine,
4002 		    struct intel_ring *ring)
4003 {
4004 	bool inhibit = true;
4005 	void *vaddr;
4006 	u32 *regs;
4007 	int ret;
4008 
4009 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4010 	if (IS_ERR(vaddr)) {
4011 		ret = PTR_ERR(vaddr);
4012 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4013 		return ret;
4014 	}
4015 
4016 	set_redzone(vaddr, engine);
4017 
4018 	if (engine->default_state) {
4019 		void *defaults;
4020 
4021 		defaults = i915_gem_object_pin_map(engine->default_state,
4022 						   I915_MAP_WB);
4023 		if (IS_ERR(defaults)) {
4024 			ret = PTR_ERR(defaults);
4025 			goto err_unpin_ctx;
4026 		}
4027 
4028 		memcpy(vaddr, defaults, engine->context_size);
4029 		i915_gem_object_unpin_map(engine->default_state);
4030 		inhibit = false;
4031 	}
4032 
4033 	/* The second page of the context object contains some fields which must
4034 	 * be set up prior to the first execution. */
4035 	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
4036 	execlists_init_reg_state(regs, ce, engine, ring, inhibit);
4037 	if (inhibit)
4038 		regs[CTX_CONTEXT_CONTROL] |=
4039 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4040 
4041 	ret = 0;
4042 err_unpin_ctx:
4043 	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4044 	i915_gem_object_unpin_map(ctx_obj);
4045 	return ret;
4046 }
4047 
4048 static int __execlists_context_alloc(struct intel_context *ce,
4049 				     struct intel_engine_cs *engine)
4050 {
4051 	struct drm_i915_gem_object *ctx_obj;
4052 	struct intel_ring *ring;
4053 	struct i915_vma *vma;
4054 	u32 context_size;
4055 	int ret;
4056 
4057 	GEM_BUG_ON(ce->state);
4058 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4059 
4060 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4061 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4062 
4063 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4064 	if (IS_ERR(ctx_obj))
4065 		return PTR_ERR(ctx_obj);
4066 
4067 	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4068 	if (IS_ERR(vma)) {
4069 		ret = PTR_ERR(vma);
4070 		goto error_deref_obj;
4071 	}
4072 
4073 	if (!ce->timeline) {
4074 		struct intel_timeline *tl;
4075 
4076 		tl = intel_timeline_create(engine->gt, NULL);
4077 		if (IS_ERR(tl)) {
4078 			ret = PTR_ERR(tl);
4079 			goto error_deref_obj;
4080 		}
4081 
4082 		ce->timeline = tl;
4083 	}
4084 
4085 	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4086 	if (IS_ERR(ring)) {
4087 		ret = PTR_ERR(ring);
4088 		goto error_deref_obj;
4089 	}
4090 
4091 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4092 	if (ret) {
4093 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4094 		goto error_ring_free;
4095 	}
4096 
4097 	ce->ring = ring;
4098 	ce->state = vma;
4099 
4100 	return 0;
4101 
4102 error_ring_free:
4103 	intel_ring_put(ring);
4104 error_deref_obj:
4105 	i915_gem_object_put(ctx_obj);
4106 	return ret;
4107 }
4108 
4109 static struct list_head *virtual_queue(struct virtual_engine *ve)
4110 {
4111 	return &ve->base.execlists.default_priolist.requests[0];
4112 }
4113 
4114 static void virtual_context_destroy(struct kref *kref)
4115 {
4116 	struct virtual_engine *ve =
4117 		container_of(kref, typeof(*ve), context.ref);
4118 	unsigned int n;
4119 
4120 	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4121 	GEM_BUG_ON(ve->request);
4122 	GEM_BUG_ON(ve->context.inflight);
4123 
4124 	for (n = 0; n < ve->num_siblings; n++) {
4125 		struct intel_engine_cs *sibling = ve->siblings[n];
4126 		struct rb_node *node = &ve->nodes[sibling->id].rb;
4127 		unsigned long flags;
4128 
4129 		if (RB_EMPTY_NODE(node))
4130 			continue;
4131 
4132 		spin_lock_irqsave(&sibling->active.lock, flags);
4133 
4134 		/* Detachment is lazily performed in the execlists tasklet */
4135 		if (!RB_EMPTY_NODE(node))
4136 			rb_erase_cached(node, &sibling->execlists.virtual);
4137 
4138 		spin_unlock_irqrestore(&sibling->active.lock, flags);
4139 	}
4140 	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4141 
4142 	if (ve->context.state)
4143 		__execlists_context_fini(&ve->context);
4144 	intel_context_fini(&ve->context);
4145 
4146 	kfree(ve->bonds);
4147 	kfree(ve);
4148 }
4149 
4150 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4151 {
4152 	int swp;
4153 
4154 	/*
4155 	 * Pick a random sibling on starting to help spread the load around.
4156 	 *
4157 	 * New contexts are typically created with exactly the same order
4158 	 * of siblings, and often started in batches. Due to the way we iterate
4159 	 * the array of siblings when submitting requests, sibling[0] is
4160 	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4161 	 * randomised across the system, we also help spread the load by having
4162 	 * the first engine we inspect differ each time.
4163 	 *
4164 	 * NB This does not force us to execute on this engine; it will just
4165 	 * typically be the first we inspect for submission.
4166 	 */
4167 	swp = prandom_u32_max(ve->num_siblings);
4168 	if (!swp)
4169 		return;
4170 
4171 	swap(ve->siblings[swp], ve->siblings[0]);
4172 	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4173 		virtual_update_register_offsets(ve->context.lrc_reg_state,
4174 						ve->siblings[0]);
4175 }
4176 
4177 static int virtual_context_pin(struct intel_context *ce)
4178 {
4179 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4180 	int err;
4181 
4182 	/* Note: we must use a real engine class for setting up reg state */
4183 	err = __execlists_context_pin(ce, ve->siblings[0]);
4184 	if (err)
4185 		return err;
4186 
4187 	virtual_engine_initial_hint(ve);
4188 	return 0;
4189 }
4190 
4191 static void virtual_context_enter(struct intel_context *ce)
4192 {
4193 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4194 	unsigned int n;
4195 
4196 	for (n = 0; n < ve->num_siblings; n++)
4197 		intel_engine_pm_get(ve->siblings[n]);
4198 
4199 	intel_timeline_enter(ce->timeline);
4200 }
4201 
4202 static void virtual_context_exit(struct intel_context *ce)
4203 {
4204 	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4205 	unsigned int n;
4206 
4207 	intel_timeline_exit(ce->timeline);
4208 
4209 	for (n = 0; n < ve->num_siblings; n++)
4210 		intel_engine_pm_put(ve->siblings[n]);
4211 }
4212 
4213 static const struct intel_context_ops virtual_context_ops = {
4214 	.pin = virtual_context_pin,
4215 	.unpin = execlists_context_unpin,
4216 
4217 	.enter = virtual_context_enter,
4218 	.exit = virtual_context_exit,
4219 
4220 	.destroy = virtual_context_destroy,
4221 };
4222 
4223 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4224 {
4225 	struct i915_request *rq;
4226 	intel_engine_mask_t mask;
4227 
4228 	rq = READ_ONCE(ve->request);
4229 	if (!rq)
4230 		return 0;
4231 
4232 	/* The rq is ready for submission; rq->execution_mask is now stable. */
4233 	mask = rq->execution_mask;
4234 	if (unlikely(!mask)) {
4235 		/* Invalid selection, submit to a random engine in error */
4236 		i915_request_skip(rq, -ENODEV);
4237 		mask = ve->siblings[0]->mask;
4238 	}
4239 
4240 	GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
4241 		  ve->base.name,
4242 		  rq->fence.context, rq->fence.seqno,
4243 		  mask, ve->base.execlists.queue_priority_hint);
4244 
4245 	return mask;
4246 }
4247 
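/*
 * Fan the single queued virtual request out to the siblings: for each
 * physical engine allowed by its execution mask, (re)insert this virtual
 * engine's node into that sibling's priority-sorted tree of virtual
 * candidates, and kick the sibling's tasklet if this now provides its
 * highest-priority work.
 */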
4248 static void virtual_submission_tasklet(unsigned long data)
4249 {
4250 	struct virtual_engine * const ve = (struct virtual_engine *)data;
4251 	const int prio = ve->base.execlists.queue_priority_hint;
4252 	intel_engine_mask_t mask;
4253 	unsigned int n;
4254 
4255 	rcu_read_lock();
4256 	mask = virtual_submission_mask(ve);
4257 	rcu_read_unlock();
4258 	if (unlikely(!mask))
4259 		return;
4260 
4261 	local_irq_disable();
4262 	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4263 		struct intel_engine_cs *sibling = ve->siblings[n];
4264 		struct ve_node * const node = &ve->nodes[sibling->id];
4265 		struct rb_node **parent, *rb;
4266 		bool first;
4267 
4268 		if (unlikely(!(mask & sibling->mask))) {
4269 			if (!RB_EMPTY_NODE(&node->rb)) {
4270 				spin_lock(&sibling->active.lock);
4271 				rb_erase_cached(&node->rb,
4272 						&sibling->execlists.virtual);
4273 				RB_CLEAR_NODE(&node->rb);
4274 				spin_unlock(&sibling->active.lock);
4275 			}
4276 			continue;
4277 		}
4278 
4279 		spin_lock(&sibling->active.lock);
4280 
4281 		if (!RB_EMPTY_NODE(&node->rb)) {
4282 			/*
4283 			 * Cheat and avoid rebalancing the tree if we can
4284 			 * reuse this node in situ.
4285 			 */
4286 			first = rb_first_cached(&sibling->execlists.virtual) ==
4287 				&node->rb;
4288 			if (prio == node->prio || (prio > node->prio && first))
4289 				goto submit_engine;
4290 
4291 			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4292 		}
4293 
4294 		rb = NULL;
4295 		first = true;
4296 		parent = &sibling->execlists.virtual.rb_root.rb_node;
4297 		while (*parent) {
4298 			struct ve_node *other;
4299 
4300 			rb = *parent;
4301 			other = rb_entry(rb, typeof(*other), rb);
4302 			if (prio > other->prio) {
4303 				parent = &rb->rb_left;
4304 			} else {
4305 				parent = &rb->rb_right;
4306 				first = false;
4307 			}
4308 		}
4309 
4310 		rb_link_node(&node->rb, rb, parent);
4311 		rb_insert_color_cached(&node->rb,
4312 				       &sibling->execlists.virtual,
4313 				       first);
4314 
4315 submit_engine:
4316 		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4317 		node->prio = prio;
4318 		if (first && prio > sibling->execlists.queue_priority_hint) {
4319 			sibling->execlists.queue_priority_hint = prio;
4320 			tasklet_hi_schedule(&sibling->execlists.tasklet);
4321 		}
4322 
4323 		spin_unlock(&sibling->active.lock);
4324 	}
4325 	local_irq_enable();
4326 }
4327 
4328 static void virtual_submit_request(struct i915_request *rq)
4329 {
4330 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4331 	struct i915_request *old;
4332 	unsigned long flags;
4333 
4334 	GEM_TRACE("%s: rq=%llx:%lld\n",
4335 		  ve->base.name,
4336 		  rq->fence.context,
4337 		  rq->fence.seqno);
4338 
4339 	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4340 
4341 	spin_lock_irqsave(&ve->base.active.lock, flags);
4342 
4343 	old = ve->request;
4344 	if (old) { /* background completion event from preempt-to-busy */
4345 		GEM_BUG_ON(!i915_request_completed(old));
4346 		__i915_request_submit(old);
4347 		i915_request_put(old);
4348 	}
4349 
4350 	if (i915_request_completed(rq)) {
4351 		__i915_request_submit(rq);
4352 
4353 		ve->base.execlists.queue_priority_hint = INT_MIN;
4354 		ve->request = NULL;
4355 	} else {
4356 		ve->base.execlists.queue_priority_hint = rq_prio(rq);
4357 		ve->request = i915_request_get(rq);
4358 
4359 		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4360 		list_move_tail(&rq->sched.link, virtual_queue(ve));
4361 
4362 		tasklet_schedule(&ve->base.execlists.tasklet);
4363 	}
4364 
4365 	spin_unlock_irqrestore(&ve->base.active.lock, flags);
4366 }
4367 
4368 static struct ve_bond *
4369 virtual_find_bond(struct virtual_engine *ve,
4370 		  const struct intel_engine_cs *master)
4371 {
4372 	int i;
4373 
4374 	for (i = 0; i < ve->num_bonds; i++) {
4375 		if (ve->bonds[i].master == master)
4376 			return &ve->bonds[i];
4377 	}
4378 
4379 	return NULL;
4380 }
4381 
4382 static void
4383 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4384 {
4385 	struct virtual_engine *ve = to_virtual_engine(rq->engine);
4386 	intel_engine_mask_t allowed, exec;
4387 	struct ve_bond *bond;
4388 
4389 	allowed = ~to_request(signal)->engine->mask;
4390 
4391 	bond = virtual_find_bond(ve, to_request(signal)->engine);
4392 	if (bond)
4393 		allowed &= bond->sibling_mask;
4394 
4395 	/* Restrict the bonded request to run on only the available engines */
4396 	exec = READ_ONCE(rq->execution_mask);
4397 	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4398 		;
4399 
4400 	/* Prevent the master from being re-run on the bonded engines */
4401 	to_request(signal)->execution_mask &= ~allowed;
4402 }
4403 
4404 struct intel_context *
4405 intel_execlists_create_virtual(struct i915_gem_context *ctx,
4406 			       struct intel_engine_cs **siblings,
4407 			       unsigned int count)
4408 {
4409 	struct virtual_engine *ve;
4410 	unsigned int n;
4411 	int err;
4412 
4413 	if (count == 0)
4414 		return ERR_PTR(-EINVAL);
4415 
4416 	if (count == 1)
4417 		return intel_context_create(ctx, siblings[0]);
4418 
4419 	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4420 	if (!ve)
4421 		return ERR_PTR(-ENOMEM);
4422 
4423 	ve->base.i915 = ctx->i915;
4424 	ve->base.gt = siblings[0]->gt;
4425 	ve->base.uncore = siblings[0]->uncore;
4426 	ve->base.id = -1;
4427 
4428 	ve->base.class = OTHER_CLASS;
4429 	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4430 	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4431 	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4432 
4433 	/*
4434 	 * The decision on whether to submit a request using semaphores
4435 	 * depends on the saturated state of the engine. We only compute
4436 	 * this during HW submission of the request, and we need this
4437 	 * state to be globally applied to all requests being submitted
4438 	 * to this engine. Virtual engines encompass more than one physical
4439 	 * engine and so we cannot accurately tell in advance if one of those
4440 	 * engines is already saturated and so cannot afford to use a semaphore
4441 	 * and be pessimized in priority for doing so -- if we are the only
4442 	 * context using semaphores after all other clients have stopped, we
4443 	 * will be starved on the saturated system. Such a global switch for
4444 	 * semaphores is less than ideal, but alas is the current compromise.
4445 	 */
4446 	ve->base.saturated = ALL_ENGINES;
4447 
4448 	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4449 
4450 	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4451 	intel_engine_init_breadcrumbs(&ve->base);
4452 
4453 	intel_engine_init_execlists(&ve->base);
4454 
4455 	ve->base.cops = &virtual_context_ops;
4456 	ve->base.request_alloc = execlists_request_alloc;
4457 
4458 	ve->base.schedule = i915_schedule;
4459 	ve->base.submit_request = virtual_submit_request;
4460 	ve->base.bond_execute = virtual_bond_execute;
4461 
4462 	INIT_LIST_HEAD(virtual_queue(ve));
4463 	ve->base.execlists.queue_priority_hint = INT_MIN;
4464 	tasklet_init(&ve->base.execlists.tasklet,
4465 		     virtual_submission_tasklet,
4466 		     (unsigned long)ve);
4467 
4468 	intel_context_init(&ve->context, ctx, &ve->base);
4469 
4470 	for (n = 0; n < count; n++) {
4471 		struct intel_engine_cs *sibling = siblings[n];
4472 
4473 		GEM_BUG_ON(!is_power_of_2(sibling->mask));
4474 		if (sibling->mask & ve->base.mask) {
4475 			DRM_DEBUG("duplicate %s entry in load balancer\n",
4476 				  sibling->name);
4477 			err = -EINVAL;
4478 			goto err_put;
4479 		}
4480 
4481 		/*
4482 		 * The virtual engine implementation is tightly coupled to
4483 	 * the execlists backend -- we push requests out directly
4484 		 * into a tree inside each physical engine. We could support
4485 		 * layering if we handle cloning of the requests and
4486 		 * submitting a copy into each backend.
4487 		 */
4488 		if (sibling->execlists.tasklet.func !=
4489 		    execlists_submission_tasklet) {
4490 			err = -ENODEV;
4491 			goto err_put;
4492 		}
4493 
4494 		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4495 		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4496 
4497 		ve->siblings[ve->num_siblings++] = sibling;
4498 		ve->base.mask |= sibling->mask;
4499 
4500 		/*
4501 		 * All physical engines must be compatible for their emission
4502 		 * functions (as we build the instructions during request
4503 		 * construction and do not alter them before submission
4504 		 * on the physical engine). We use the engine class as a guide
4505 		 * here, although that could be refined.
4506 		 */
4507 		if (ve->base.class != OTHER_CLASS) {
4508 			if (ve->base.class != sibling->class) {
4509 				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4510 					  sibling->class, ve->base.class);
4511 				err = -EINVAL;
4512 				goto err_put;
4513 			}
4514 			continue;
4515 		}
4516 
4517 		ve->base.class = sibling->class;
4518 		ve->base.uabi_class = sibling->uabi_class;
4519 		snprintf(ve->base.name, sizeof(ve->base.name),
4520 			 "v%dx%d", ve->base.class, count);
4521 		ve->base.context_size = sibling->context_size;
4522 
4523 		ve->base.emit_bb_start = sibling->emit_bb_start;
4524 		ve->base.emit_flush = sibling->emit_flush;
4525 		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4526 		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4527 		ve->base.emit_fini_breadcrumb_dw =
4528 			sibling->emit_fini_breadcrumb_dw;
4529 
4530 		ve->base.flags = sibling->flags;
4531 	}
4532 
4533 	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4534 
4535 	err = __execlists_context_alloc(&ve->context, siblings[0]);
4536 	if (err)
4537 		goto err_put;
4538 
4539 	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
4540 
4541 	return &ve->context;
4542 
4543 err_put:
4544 	intel_context_put(&ve->context);
4545 	return ERR_PTR(err);
4546 }
4547 
4548 struct intel_context *
4549 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
4550 			      struct intel_engine_cs *src)
4551 {
4552 	struct virtual_engine *se = to_virtual_engine(src);
4553 	struct intel_context *dst;
4554 
4555 	dst = intel_execlists_create_virtual(ctx,
4556 					     se->siblings,
4557 					     se->num_siblings);
4558 	if (IS_ERR(dst))
4559 		return dst;
4560 
4561 	if (se->num_bonds) {
4562 		struct virtual_engine *de = to_virtual_engine(dst->engine);
4563 
4564 		de->bonds = kmemdup(se->bonds,
4565 				    sizeof(*se->bonds) * se->num_bonds,
4566 				    GFP_KERNEL);
4567 		if (!de->bonds) {
4568 			intel_context_put(dst);
4569 			return ERR_PTR(-ENOMEM);
4570 		}
4571 
4572 		de->num_bonds = se->num_bonds;
4573 	}
4574 
4575 	return dst;
4576 }
4577 
4578 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4579 				     const struct intel_engine_cs *master,
4580 				     const struct intel_engine_cs *sibling)
4581 {
4582 	struct virtual_engine *ve = to_virtual_engine(engine);
4583 	struct ve_bond *bond;
4584 	int n;
4585 
4586 	/* Sanity check the sibling is part of the virtual engine */
4587 	for (n = 0; n < ve->num_siblings; n++)
4588 		if (sibling == ve->siblings[n])
4589 			break;
4590 	if (n == ve->num_siblings)
4591 		return -EINVAL;
4592 
4593 	bond = virtual_find_bond(ve, master);
4594 	if (bond) {
4595 		bond->sibling_mask |= sibling->mask;
4596 		return 0;
4597 	}
4598 
4599 	bond = krealloc(ve->bonds,
4600 			sizeof(*bond) * (ve->num_bonds + 1),
4601 			GFP_KERNEL);
4602 	if (!bond)
4603 		return -ENOMEM;
4604 
4605 	bond[ve->num_bonds].master = master;
4606 	bond[ve->num_bonds].sibling_mask = sibling->mask;
4607 
4608 	ve->bonds = bond;
4609 	ve->num_bonds++;
4610 
4611 	return 0;
4612 }
4613 
4614 struct intel_engine_cs *
4615 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4616 				 unsigned int sibling)
4617 {
4618 	struct virtual_engine *ve = to_virtual_engine(engine);
4619 
4620 	if (sibling >= ve->num_siblings)
4621 		return NULL;
4622 
4623 	return ve->siblings[sibling];
4624 }
4625 
4626 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4627 				   struct drm_printer *m,
4628 				   void (*show_request)(struct drm_printer *m,
4629 							struct i915_request *rq,
4630 							const char *prefix),
4631 				   unsigned int max)
4632 {
4633 	const struct intel_engine_execlists *execlists = &engine->execlists;
4634 	struct i915_request *rq, *last;
4635 	unsigned long flags;
4636 	unsigned int count;
4637 	struct rb_node *rb;
4638 
4639 	spin_lock_irqsave(&engine->active.lock, flags);
4640 
4641 	last = NULL;
4642 	count = 0;
4643 	list_for_each_entry(rq, &engine->active.requests, sched.link) {
4644 		if (count++ < max - 1)
4645 			show_request(m, rq, "\t\tE ");
4646 		else
4647 			last = rq;
4648 	}
4649 	if (last) {
4650 		if (count > max) {
4651 			drm_printf(m,
4652 				   "\t\t...skipping %d executing requests...\n",
4653 				   count - max);
4654 		}
4655 		show_request(m, last, "\t\tE ");
4656 	}
4657 
4658 	last = NULL;
4659 	count = 0;
4660 	if (execlists->queue_priority_hint != INT_MIN)
4661 		drm_printf(m, "\t\tQueue priority hint: %d\n",
4662 			   execlists->queue_priority_hint);
4663 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4664 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4665 		int i;
4666 
4667 		priolist_for_each_request(rq, p, i) {
4668 			if (count++ < max - 1)
4669 				show_request(m, rq, "\t\tQ ");
4670 			else
4671 				last = rq;
4672 		}
4673 	}
4674 	if (last) {
4675 		if (count > max) {
4676 			drm_printf(m,
4677 				   "\t\t...skipping %d queued requests...\n",
4678 				   count - max);
4679 		}
4680 		show_request(m, last, "\t\tQ ");
4681 	}
4682 
4683 	last = NULL;
4684 	count = 0;
4685 	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4686 		struct virtual_engine *ve =
4687 			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4688 		struct i915_request *rq = READ_ONCE(ve->request);
4689 
4690 		if (rq) {
4691 			if (count++ < max - 1)
4692 				show_request(m, rq, "\t\tV ");
4693 			else
4694 				last = rq;
4695 		}
4696 	}
4697 	if (last) {
4698 		if (count > max) {
4699 			drm_printf(m,
4700 				   "\t\t...skipping %d virtual requests...\n",
4701 				   count - max);
4702 		}
4703 		show_request(m, last, "\t\tV ");
4704 	}
4705 
4706 	spin_unlock_irqrestore(&engine->active.lock, flags);
4707 }
4708 
4709 void intel_lr_context_reset(struct intel_engine_cs *engine,
4710 			    struct intel_context *ce,
4711 			    u32 head,
4712 			    bool scrub)
4713 {
4714 	GEM_BUG_ON(!intel_context_is_pinned(ce));
4715 
4716 	/*
4717 	 * We want a simple context + ring to execute the breadcrumb update.
4718 	 * We cannot rely on the context being intact across the GPU hang,
4719 	 * so clear it and rebuild just what we need for the breadcrumb.
4720 	 * All pending requests for this context will be zapped, and any
4721 	 * future request will be after userspace has had the opportunity
4722 	 * to recreate its own state.
4723 	 */
4724 	if (scrub)
4725 		restore_default_state(ce, engine);
4726 
4727 	/* Rerun the request; its payload has been neutered (if guilty). */
4728 	ce->ring->head = head;
4729 	intel_ring_update_space(ce->ring);
4730 
4731 	__execlists_update_reg_state(ce, engine);
4732 }
4733 
4734 bool
4735 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
4736 {
4737 	return engine->set_default_submission ==
4738 	       intel_execlists_set_default_submission;
4739 }
4740 
4741 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4742 #include "selftest_lrc.c"
4743 #endif
4744