xref: /openbmc/linux/drivers/gpu/drm/i915/gt/intel_lrc.c (revision 79ffac85)
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30 
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things to the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself,
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But, what about the ringbuffer control registers (head, tail, etc..)?
49  * shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bits submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
134 #include <linux/interrupt.h>
135 
136 #include "i915_drv.h"
137 #include "i915_gem_render_state.h"
138 #include "i915_vgpu.h"
139 #include "intel_lrc_reg.h"
140 #include "intel_mocs.h"
141 #include "intel_reset.h"
142 #include "intel_workarounds.h"
143 
/* RING_EXECLIST_* flag bits (shift counts given in decimal). */
#define RING_EXECLIST_QFULL		(1 << 2)
#define RING_EXECLIST1_VALID		(1 << 3)
#define RING_EXECLIST0_VALID		(1 << 4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 14)
#define RING_EXECLIST1_ACTIVE		(1 << 17)
#define RING_EXECLIST0_ACTIVE		(1 << 18)

/* GEN8_CTX_STATUS_* event flag bits. */
#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

/* Either of the two status bits combined here: COMPLETE or PREEMPTED. */
#define GEN8_CTX_STATUS_COMPLETED_MASK \
	(GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

/* Typical size, in bytes, of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64

/* Length of the workaround tail, in dwords and in bytes. */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (WA_TAIL_DWORDS * sizeof(u32))

/* Priority bits that mark a request as already boosted, see effective_prio() */
#define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT | I915_PRIORITY_NOSEMAPHORE)
167 
168 static int execlists_context_deferred_alloc(struct intel_context *ce,
169 					    struct intel_engine_cs *engine);
170 static void execlists_init_reg_state(u32 *reg_state,
171 				     struct intel_context *ce,
172 				     struct intel_engine_cs *engine,
173 				     struct intel_ring *ring);
174 
175 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
176 {
177 	return rb_entry(rb, struct i915_priolist, node);
178 }
179 
180 static inline int rq_prio(const struct i915_request *rq)
181 {
182 	return rq->sched.attr.priority;
183 }
184 
185 static int effective_prio(const struct i915_request *rq)
186 {
187 	int prio = rq_prio(rq);
188 
189 	/*
190 	 * On unwinding the active request, we give it a priority bump
191 	 * equivalent to a freshly submitted request. This protects it from
192 	 * being gazumped again, but it would be preferable if we didn't
193 	 * let it be gazumped in the first place!
194 	 *
195 	 * See __unwind_incomplete_requests()
196 	 */
197 	if (~prio & ACTIVE_PRIORITY && __i915_request_has_started(rq)) {
198 		/*
199 		 * After preemption, we insert the active request at the
200 		 * end of the new priority level. This means that we will be
201 		 * _lower_ priority than the preemptee all things equal (and
202 		 * so the preemption is valid), so adjust our comparison
203 		 * accordingly.
204 		 */
205 		prio |= ACTIVE_PRIORITY;
206 		prio--;
207 	}
208 
209 	/* Restrict mere WAIT boosts from triggering preemption */
210 	return prio | __NO_PREEMPTION;
211 }
212 
213 static int queue_prio(const struct intel_engine_execlists *execlists)
214 {
215 	struct i915_priolist *p;
216 	struct rb_node *rb;
217 
218 	rb = rb_first_cached(&execlists->queue);
219 	if (!rb)
220 		return INT_MIN;
221 
222 	/*
223 	 * As the priolist[] are inverted, with the highest priority in [0],
224 	 * we have to flip the index value to become priority.
225 	 */
226 	p = to_priolist(rb);
227 	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
228 }
229 
230 static inline bool need_preempt(const struct intel_engine_cs *engine,
231 				const struct i915_request *rq)
232 {
233 	int last_prio;
234 
235 	if (!engine->preempt_context)
236 		return false;
237 
238 	if (i915_request_completed(rq))
239 		return false;
240 
241 	/*
242 	 * Check if the current priority hint merits a preemption attempt.
243 	 *
244 	 * We record the highest value priority we saw during rescheduling
245 	 * prior to this dequeue, therefore we know that if it is strictly
246 	 * less than the current tail of ESLP[0], we do not need to force
247 	 * a preempt-to-idle cycle.
248 	 *
249 	 * However, the priority hint is a mere hint that we may need to
250 	 * preempt. If that hint is stale or we may be trying to preempt
251 	 * ourselves, ignore the request.
252 	 */
253 	last_prio = effective_prio(rq);
254 	if (!__execlists_need_preempt(engine->execlists.queue_priority_hint,
255 				      last_prio))
256 		return false;
257 
258 	/*
259 	 * Check against the first request in ELSP[1], it will, thanks to the
260 	 * power of PI, be the highest priority of that context.
261 	 */
262 	if (!list_is_last(&rq->link, &engine->timeline.requests) &&
263 	    rq_prio(list_next_entry(rq, link)) > last_prio)
264 		return true;
265 
266 	/*
267 	 * If the inflight context did not trigger the preemption, then maybe
268 	 * it was the set of queued requests? Pick the highest priority in
269 	 * the queue (the first active priolist) and see if it deserves to be
270 	 * running instead of ELSP[0].
271 	 *
272 	 * The highest priority request in the queue can not be either
273 	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
274 	 * context, it's priority would not exceed ELSP[0] aka last_prio.
275 	 */
276 	return queue_prio(&engine->execlists) > last_prio;
277 }
278 
279 __maybe_unused static inline bool
280 assert_priority_queue(const struct i915_request *prev,
281 		      const struct i915_request *next)
282 {
283 	const struct intel_engine_execlists *execlists =
284 		&prev->engine->execlists;
285 
286 	/*
287 	 * Without preemption, the prev may refer to the still active element
288 	 * which we refuse to let go.
289 	 *
290 	 * Even with preemption, there are times when we think it is better not
291 	 * to preempt and leave an ostensibly lower priority request in flight.
292 	 */
293 	if (port_request(execlists->port) == prev)
294 		return true;
295 
296 	return rq_prio(prev) >= rq_prio(next);
297 }
298 
299 /*
300  * The context descriptor encodes various attributes of a context,
301  * including its GTT address and some flags. Because it's fairly
302  * expensive to calculate, we'll just do it once and cache the result,
303  * which remains valid until the context is unpinned.
304  *
305  * This is what a descriptor looks like, from LSB to MSB::
306  *
307  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
308  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
309  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
310  *      bits 53-54:    mbz, reserved for use by hardware
311  *      bits 55-63:    group ID, currently unused and set to 0
312  *
313  * Starting from Gen11, the upper dword of the descriptor has a new format:
314  *
315  *      bits 32-36:    reserved
316  *      bits 37-47:    SW context ID
317  *      bits 48:53:    engine instance
318  *      bit 54:        mbz, reserved for use by hardware
319  *      bits 55-60:    SW counter
320  *      bits 61-63:    engine class
321  *
322  * engine info, SW context ID and SW counter need to form a unique number
323  * (Context ID) per lrc.
324  */
325 static u64
326 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
327 {
328 	struct i915_gem_context *ctx = ce->gem_context;
329 	u64 desc;
330 
331 	BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
332 	BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
333 
334 	desc = ctx->desc_template;				/* bits  0-11 */
335 	GEM_BUG_ON(desc & GENMASK_ULL(63, 12));
336 
337 	desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
338 								/* bits 12-31 */
339 	GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
340 
341 	/*
342 	 * The following 32bits are copied into the OA reports (dword 2).
343 	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
344 	 * anything below.
345 	 */
346 	if (INTEL_GEN(engine->i915) >= 11) {
347 		GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
348 		desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
349 								/* bits 37-47 */
350 
351 		desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
352 								/* bits 48-53 */
353 
354 		/* TODO: decide what to do with SW counter (bits 55-60) */
355 
356 		desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
357 								/* bits 61-63 */
358 	} else {
359 		GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
360 		desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;	/* bits 32-52 */
361 	}
362 
363 	return desc;
364 }
365 
366 static void unwind_wa_tail(struct i915_request *rq)
367 {
368 	rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
369 	assert_ring_tail_valid(rq->ring, rq->tail);
370 }
371 
372 static struct i915_request *
373 __unwind_incomplete_requests(struct intel_engine_cs *engine)
374 {
375 	struct i915_request *rq, *rn, *active = NULL;
376 	struct list_head *uninitialized_var(pl);
377 	int prio = I915_PRIORITY_INVALID | ACTIVE_PRIORITY;
378 
379 	lockdep_assert_held(&engine->timeline.lock);
380 
381 	list_for_each_entry_safe_reverse(rq, rn,
382 					 &engine->timeline.requests,
383 					 link) {
384 		if (i915_request_completed(rq))
385 			break;
386 
387 		__i915_request_unsubmit(rq);
388 		unwind_wa_tail(rq);
389 
390 		GEM_BUG_ON(rq->hw_context->active);
391 
392 		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
393 		if (rq_prio(rq) != prio) {
394 			prio = rq_prio(rq);
395 			pl = i915_sched_lookup_priolist(engine, prio);
396 		}
397 		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
398 
399 		list_add(&rq->sched.link, pl);
400 
401 		active = rq;
402 	}
403 
404 	/*
405 	 * The active request is now effectively the start of a new client
406 	 * stream, so give it the equivalent small priority bump to prevent
407 	 * it being gazumped a second time by another peer.
408 	 *
409 	 * Note we have to be careful not to apply a priority boost to a request
410 	 * still spinning on its semaphores. If the request hasn't started, that
411 	 * means it is still waiting for its dependencies to be signaled, and
412 	 * if we apply a priority boost to this request, we will boost it past
413 	 * its signalers and so break PI.
414 	 *
415 	 * One consequence of this preemption boost is that we may jump
416 	 * over lesser priorities (such as I915_PRIORITY_WAIT), effectively
417 	 * making those priorities non-preemptible. They will be moved forward
418 	 * in the priority queue, but they will not gain immediate access to
419 	 * the GPU.
420 	 */
421 	if (~prio & ACTIVE_PRIORITY && __i915_request_has_started(active)) {
422 		prio |= ACTIVE_PRIORITY;
423 		active->sched.attr.priority = prio;
424 		list_move_tail(&active->sched.link,
425 			       i915_sched_lookup_priolist(engine, prio));
426 	}
427 
428 	return active;
429 }
430 
431 struct i915_request *
432 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
433 {
434 	struct intel_engine_cs *engine =
435 		container_of(execlists, typeof(*engine), execlists);
436 
437 	return __unwind_incomplete_requests(engine);
438 }
439 
440 static inline void
441 execlists_context_status_change(struct i915_request *rq, unsigned long status)
442 {
443 	/*
444 	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
445 	 * The compiler should eliminate this function as dead-code.
446 	 */
447 	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
448 		return;
449 
450 	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
451 				   status, rq);
452 }
453 
454 inline void
455 execlists_user_begin(struct intel_engine_execlists *execlists,
456 		     const struct execlist_port *port)
457 {
458 	execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
459 }
460 
461 inline void
462 execlists_user_end(struct intel_engine_execlists *execlists)
463 {
464 	execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
465 }
466 
467 static inline void
468 execlists_context_schedule_in(struct i915_request *rq)
469 {
470 	GEM_BUG_ON(rq->hw_context->active);
471 
472 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
473 	intel_engine_context_in(rq->engine);
474 	rq->hw_context->active = rq->engine;
475 }
476 
477 static inline void
478 execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
479 {
480 	rq->hw_context->active = NULL;
481 	intel_engine_context_out(rq->engine);
482 	execlists_context_status_change(rq, status);
483 	trace_i915_request_out(rq);
484 }
485 
486 static u64 execlists_update_context(struct i915_request *rq)
487 {
488 	struct intel_context *ce = rq->hw_context;
489 
490 	ce->lrc_reg_state[CTX_RING_TAIL + 1] =
491 		intel_ring_set_tail(rq->ring, rq->tail);
492 
493 	/*
494 	 * Make sure the context image is complete before we submit it to HW.
495 	 *
496 	 * Ostensibly, writes (including the WCB) should be flushed prior to
497 	 * an uncached write such as our mmio register access, the empirical
498 	 * evidence (esp. on Braswell) suggests that the WC write into memory
499 	 * may not be visible to the HW prior to the completion of the UC
500 	 * register write and that we may begin execution from the context
501 	 * before its image is complete leading to invalid PD chasing.
502 	 *
503 	 * Furthermore, Braswell, at least, wants a full mb to be sure that
504 	 * the writes are coherent in memory (visible to the GPU) prior to
505 	 * execution, and not just visible to other CPUs (as is the result of
506 	 * wmb).
507 	 */
508 	mb();
509 	return ce->lrc_desc;
510 }
511 
512 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
513 {
514 	if (execlists->ctrl_reg) {
515 		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
516 		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
517 	} else {
518 		writel(upper_32_bits(desc), execlists->submit_reg);
519 		writel(lower_32_bits(desc), execlists->submit_reg);
520 	}
521 }
522 
523 static void execlists_submit_ports(struct intel_engine_cs *engine)
524 {
525 	struct intel_engine_execlists *execlists = &engine->execlists;
526 	struct execlist_port *port = execlists->port;
527 	unsigned int n;
528 
529 	/*
530 	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
531 	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
532 	 * not be relinquished until the device is idle (see
533 	 * i915_gem_idle_work_handler()). As a precaution, we make sure
534 	 * that all ELSP are drained i.e. we have processed the CSB,
535 	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
536 	 */
537 	GEM_BUG_ON(!engine->i915->gt.awake);
538 
539 	/*
540 	 * ELSQ note: the submit queue is not cleared after being submitted
541 	 * to the HW so we need to make sure we always clean it up. This is
542 	 * currently ensured by the fact that we always write the same number
543 	 * of elsq entries, keep this in mind before changing the loop below.
544 	 */
545 	for (n = execlists_num_ports(execlists); n--; ) {
546 		struct i915_request *rq;
547 		unsigned int count;
548 		u64 desc;
549 
550 		rq = port_unpack(&port[n], &count);
551 		if (rq) {
552 			GEM_BUG_ON(count > !n);
553 			if (!count++)
554 				execlists_context_schedule_in(rq);
555 			port_set(&port[n], port_pack(rq, count));
556 			desc = execlists_update_context(rq);
557 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
558 
559 			GEM_TRACE("%s in[%d]:  ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n",
560 				  engine->name, n,
561 				  port[n].context_id, count,
562 				  rq->fence.context, rq->fence.seqno,
563 				  hwsp_seqno(rq),
564 				  rq_prio(rq));
565 		} else {
566 			GEM_BUG_ON(!n);
567 			desc = 0;
568 		}
569 
570 		write_desc(execlists, desc, n);
571 	}
572 
573 	/* we need to manually load the submit queue */
574 	if (execlists->ctrl_reg)
575 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
576 
577 	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
578 }
579 
580 static bool ctx_single_port_submission(const struct intel_context *ce)
581 {
582 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
583 		i915_gem_context_force_single_submission(ce->gem_context));
584 }
585 
/*
 * Two contexts can be merged only if they are the very same context and
 * that context is not restricted to single port submission.
 */
static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	return prev == next && !ctx_single_port_submission(prev);
}
597 
598 static bool can_merge_rq(const struct i915_request *prev,
599 			 const struct i915_request *next)
600 {
601 	GEM_BUG_ON(!assert_priority_queue(prev, next));
602 
603 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
604 		return false;
605 
606 	return true;
607 }
608 
/*
 * Make @port refer to @rq, keeping the port's current count.
 *
 * A reference is taken on @rq, and the reference on any request previously
 * held by the port is dropped. @rq must differ from the request already in
 * the port.
 */
static void port_assign(struct execlist_port *port, struct i915_request *rq)
{
	struct i915_request *old = port_request(port);

	GEM_BUG_ON(rq == old);

	if (port_isset(port))
		i915_request_put(old);

	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
}
618 
619 static void inject_preempt_context(struct intel_engine_cs *engine)
620 {
621 	struct intel_engine_execlists *execlists = &engine->execlists;
622 	struct intel_context *ce = engine->preempt_context;
623 	unsigned int n;
624 
625 	GEM_BUG_ON(execlists->preempt_complete_status !=
626 		   upper_32_bits(ce->lrc_desc));
627 
628 	/*
629 	 * Switch to our empty preempt context so
630 	 * the state of the GPU is known (idle).
631 	 */
632 	GEM_TRACE("%s\n", engine->name);
633 	for (n = execlists_num_ports(execlists); --n; )
634 		write_desc(execlists, 0, n);
635 
636 	write_desc(execlists, ce->lrc_desc, n);
637 
638 	/* we need to manually load the submit queue */
639 	if (execlists->ctrl_reg)
640 		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
641 
642 	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
643 	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
644 
645 	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
646 }
647 
648 static void complete_preempt_context(struct intel_engine_execlists *execlists)
649 {
650 	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
651 
652 	if (inject_preempt_hang(execlists))
653 		return;
654 
655 	execlists_cancel_port_requests(execlists);
656 	__unwind_incomplete_requests(container_of(execlists,
657 						  struct intel_engine_cs,
658 						  execlists));
659 }
660 
661 static void execlists_dequeue(struct intel_engine_cs *engine)
662 {
663 	struct intel_engine_execlists * const execlists = &engine->execlists;
664 	struct execlist_port *port = execlists->port;
665 	const struct execlist_port * const last_port =
666 		&execlists->port[execlists->port_mask];
667 	struct i915_request *last = port_request(port);
668 	struct rb_node *rb;
669 	bool submit = false;
670 
671 	/*
672 	 * Hardware submission is through 2 ports. Conceptually each port
673 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
674 	 * static for a context, and unique to each, so we only execute
675 	 * requests belonging to a single context from each ring. RING_HEAD
676 	 * is maintained by the CS in the context image, it marks the place
677 	 * where it got up to last time, and through RING_TAIL we tell the CS
678 	 * where we want to execute up to this time.
679 	 *
680 	 * In this list the requests are in order of execution. Consecutive
681 	 * requests from the same context are adjacent in the ringbuffer. We
682 	 * can combine these requests into a single RING_TAIL update:
683 	 *
684 	 *              RING_HEAD...req1...req2
685 	 *                                    ^- RING_TAIL
686 	 * since to execute req2 the CS must first execute req1.
687 	 *
688 	 * Our goal then is to point each port to the end of a consecutive
689 	 * sequence of requests as being the most optimal (fewest wake ups
690 	 * and context switches) submission.
691 	 */
692 
693 	if (last) {
694 		/*
695 		 * Don't resubmit or switch until all outstanding
696 		 * preemptions (lite-restore) are seen. Then we
697 		 * know the next preemption status we see corresponds
698 		 * to this ELSP update.
699 		 */
700 		GEM_BUG_ON(!execlists_is_active(execlists,
701 						EXECLISTS_ACTIVE_USER));
702 		GEM_BUG_ON(!port_count(&port[0]));
703 
704 		/*
705 		 * If we write to ELSP a second time before the HW has had
706 		 * a chance to respond to the previous write, we can confuse
707 		 * the HW and hit "undefined behaviour". After writing to ELSP,
708 		 * we must then wait until we see a context-switch event from
709 		 * the HW to indicate that it has had a chance to respond.
710 		 */
711 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
712 			return;
713 
714 		if (need_preempt(engine, last)) {
715 			inject_preempt_context(engine);
716 			return;
717 		}
718 
719 		/*
720 		 * In theory, we could coalesce more requests onto
721 		 * the second port (the first port is active, with
722 		 * no preemptions pending). However, that means we
723 		 * then have to deal with the possible lite-restore
724 		 * of the second port (as we submit the ELSP, there
725 		 * may be a context-switch) but also we may complete
726 		 * the resubmission before the context-switch. Ergo,
727 		 * coalescing onto the second port will cause a
728 		 * preemption event, but we cannot predict whether
729 		 * that will affect port[0] or port[1].
730 		 *
731 		 * If the second port is already active, we can wait
732 		 * until the next context-switch before contemplating
733 		 * new requests. The GPU will be busy and we should be
734 		 * able to resubmit the new ELSP before it idles,
735 		 * avoiding pipeline bubbles (momentary pauses where
736 		 * the driver is unable to keep up the supply of new
737 		 * work). However, we have to double check that the
738 		 * priorities of the ports haven't been switch.
739 		 */
740 		if (port_count(&port[1]))
741 			return;
742 
743 		/*
744 		 * WaIdleLiteRestore:bdw,skl
745 		 * Apply the wa NOOPs to prevent
746 		 * ring:HEAD == rq:TAIL as we resubmit the
747 		 * request. See gen8_emit_fini_breadcrumb() for
748 		 * where we prepare the padding after the
749 		 * end of the request.
750 		 */
751 		last->tail = last->wa_tail;
752 	}
753 
754 	while ((rb = rb_first_cached(&execlists->queue))) {
755 		struct i915_priolist *p = to_priolist(rb);
756 		struct i915_request *rq, *rn;
757 		int i;
758 
759 		priolist_for_each_request_consume(rq, rn, p, i) {
760 			/*
761 			 * Can we combine this request with the current port?
762 			 * It has to be the same context/ringbuffer and not
763 			 * have any exceptions (e.g. GVT saying never to
764 			 * combine contexts).
765 			 *
766 			 * If we can combine the requests, we can execute both
767 			 * by updating the RING_TAIL to point to the end of the
768 			 * second request, and so we never need to tell the
769 			 * hardware about the first.
770 			 */
771 			if (last && !can_merge_rq(last, rq)) {
772 				/*
773 				 * If we are on the second port and cannot
774 				 * combine this request with the last, then we
775 				 * are done.
776 				 */
777 				if (port == last_port)
778 					goto done;
779 
780 				/*
781 				 * We must not populate both ELSP[] with the
782 				 * same LRCA, i.e. we must submit 2 different
783 				 * contexts if we submit 2 ELSP.
784 				 */
785 				if (last->hw_context == rq->hw_context)
786 					goto done;
787 
788 				/*
789 				 * If GVT overrides us we only ever submit
790 				 * port[0], leaving port[1] empty. Note that we
791 				 * also have to be careful that we don't queue
792 				 * the same context (even though a different
793 				 * request) to the second port.
794 				 */
795 				if (ctx_single_port_submission(last->hw_context) ||
796 				    ctx_single_port_submission(rq->hw_context))
797 					goto done;
798 
799 
800 				if (submit)
801 					port_assign(port, last);
802 				port++;
803 
804 				GEM_BUG_ON(port_isset(port));
805 			}
806 
807 			list_del_init(&rq->sched.link);
808 
809 			__i915_request_submit(rq);
810 			trace_i915_request_in(rq, port_index(port, execlists));
811 
812 			last = rq;
813 			submit = true;
814 		}
815 
816 		rb_erase_cached(&p->node, &execlists->queue);
817 		i915_priolist_free(p);
818 	}
819 
820 done:
821 	/*
822 	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
823 	 *
824 	 * We choose the priority hint such that if we add a request of greater
825 	 * priority than this, we kick the submission tasklet to decide on
826 	 * the right order of submitting the requests to hardware. We must
827 	 * also be prepared to reorder requests as they are in-flight on the
828 	 * HW. We derive the priority hint then as the first "hole" in
829 	 * the HW submission ports and if there are no available slots,
830 	 * the priority of the lowest executing request, i.e. last.
831 	 *
832 	 * When we do receive a higher priority request ready to run from the
833 	 * user, see queue_request(), the priority hint is bumped to that
834 	 * request triggering preemption on the next dequeue (or subsequent
835 	 * interrupt for secondary ports).
836 	 */
837 	execlists->queue_priority_hint = queue_prio(execlists);
838 
839 	if (submit) {
840 		port_assign(port, last);
841 		execlists_submit_ports(engine);
842 	}
843 
844 	/* We must always keep the beast fed if we have work piled up */
845 	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
846 		   !port_isset(execlists->port));
847 
848 	/* Re-evaluate the executing context setup after each preemptive kick */
849 	if (last)
850 		execlists_user_begin(execlists, execlists->port);
851 
852 	/* If the engine is now idle, so should be the flag; and vice versa. */
853 	GEM_BUG_ON(execlists_is_active(&engine->execlists,
854 				       EXECLISTS_ACTIVE_USER) ==
855 		   !port_isset(engine->execlists.port));
856 }
857 
858 void
859 execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
860 {
861 	struct execlist_port *port = execlists->port;
862 	unsigned int num_ports = execlists_num_ports(execlists);
863 
864 	while (num_ports-- && port_isset(port)) {
865 		struct i915_request *rq = port_request(port);
866 
867 		GEM_TRACE("%s:port%u fence %llx:%lld, (current %d)\n",
868 			  rq->engine->name,
869 			  (unsigned int)(port - execlists->port),
870 			  rq->fence.context, rq->fence.seqno,
871 			  hwsp_seqno(rq));
872 
873 		GEM_BUG_ON(!execlists->active);
874 		execlists_context_schedule_out(rq,
875 					       i915_request_completed(rq) ?
876 					       INTEL_CONTEXT_SCHEDULE_OUT :
877 					       INTEL_CONTEXT_SCHEDULE_PREEMPTED);
878 
879 		i915_request_put(rq);
880 
881 		memset(port, 0, sizeof(*port));
882 		port++;
883 	}
884 
885 	execlists_clear_all_active(execlists);
886 }
887 
888 static inline void
889 invalidate_csb_entries(const u32 *first, const u32 *last)
890 {
891 	clflush((void *)first);
892 	clflush((void *)last);
893 }
894 
895 static inline bool
896 reset_in_progress(const struct intel_engine_execlists *execlists)
897 {
898 	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
899 }
900 
901 static void process_csb(struct intel_engine_cs *engine)
902 {
903 	struct intel_engine_execlists * const execlists = &engine->execlists;
904 	struct execlist_port *port = execlists->port;
905 	const u32 * const buf = execlists->csb_status;
906 	const u8 num_entries = execlists->csb_size;
907 	u8 head, tail;
908 
909 	lockdep_assert_held(&engine->timeline.lock);
910 
911 	/*
912 	 * Note that csb_write, csb_status may be either in HWSP or mmio.
913 	 * When reading from the csb_write mmio register, we have to be
914 	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
915 	 * the low 4bits. As it happens we know the next 4bits are always
916 	 * zero and so we can simply masked off the low u8 of the register
917 	 * and treat it identically to reading from the HWSP (without having
918 	 * to use explicit shifting and masking, and probably bifurcating
919 	 * the code to handle the legacy mmio read).
920 	 */
921 	head = execlists->csb_head;
922 	tail = READ_ONCE(*execlists->csb_write);
923 	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
924 	if (unlikely(head == tail))
925 		return;
926 
927 	/*
928 	 * Hopefully paired with a wmb() in HW!
929 	 *
930 	 * We must complete the read of the write pointer before any reads
931 	 * from the CSB, so that we do not see stale values. Without an rmb
932 	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
933 	 * we perform the READ_ONCE(*csb_write).
934 	 */
935 	rmb();
936 
937 	do {
938 		struct i915_request *rq;
939 		unsigned int status;
940 		unsigned int count;
941 
942 		if (++head == num_entries)
943 			head = 0;
944 
945 		/*
946 		 * We are flying near dragons again.
947 		 *
948 		 * We hold a reference to the request in execlist_port[]
949 		 * but no more than that. We are operating in softirq
950 		 * context and so cannot hold any mutex or sleep. That
951 		 * prevents us stopping the requests we are processing
952 		 * in port[] from being retired simultaneously (the
953 		 * breadcrumb will be complete before we see the
954 		 * context-switch). As we only hold the reference to the
955 		 * request, any pointer chasing underneath the request
956 		 * is subject to a potential use-after-free. Thus we
957 		 * store all of the bookkeeping within port[] as
958 		 * required, and avoid using unguarded pointers beneath
959 		 * request itself. The same applies to the atomic
960 		 * status notifier.
961 		 */
962 
963 		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
964 			  engine->name, head,
965 			  buf[2 * head + 0], buf[2 * head + 1],
966 			  execlists->active);
967 
968 		status = buf[2 * head];
969 		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
970 			      GEN8_CTX_STATUS_PREEMPTED))
971 			execlists_set_active(execlists,
972 					     EXECLISTS_ACTIVE_HWACK);
973 		if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
974 			execlists_clear_active(execlists,
975 					       EXECLISTS_ACTIVE_HWACK);
976 
977 		if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
978 			continue;
979 
980 		/* We should never get a COMPLETED | IDLE_ACTIVE! */
981 		GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
982 
983 		if (status & GEN8_CTX_STATUS_COMPLETE &&
984 		    buf[2*head + 1] == execlists->preempt_complete_status) {
985 			GEM_TRACE("%s preempt-idle\n", engine->name);
986 			complete_preempt_context(execlists);
987 			continue;
988 		}
989 
990 		if (status & GEN8_CTX_STATUS_PREEMPTED &&
991 		    execlists_is_active(execlists,
992 					EXECLISTS_ACTIVE_PREEMPT))
993 			continue;
994 
995 		GEM_BUG_ON(!execlists_is_active(execlists,
996 						EXECLISTS_ACTIVE_USER));
997 
998 		rq = port_unpack(port, &count);
999 		GEM_TRACE("%s out[0]: ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n",
1000 			  engine->name,
1001 			  port->context_id, count,
1002 			  rq ? rq->fence.context : 0,
1003 			  rq ? rq->fence.seqno : 0,
1004 			  rq ? hwsp_seqno(rq) : 0,
1005 			  rq ? rq_prio(rq) : 0);
1006 
1007 		/* Check the context/desc id for this event matches */
1008 		GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
1009 
1010 		GEM_BUG_ON(count == 0);
1011 		if (--count == 0) {
1012 			/*
1013 			 * On the final event corresponding to the
1014 			 * submission of this context, we expect either
1015 			 * an element-switch event or a completion
1016 			 * event (and on completion, the active-idle
1017 			 * marker). No more preemptions, lite-restore
1018 			 * or otherwise.
1019 			 */
1020 			GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
1021 			GEM_BUG_ON(port_isset(&port[1]) &&
1022 				   !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
1023 			GEM_BUG_ON(!port_isset(&port[1]) &&
1024 				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
1025 
1026 			/*
1027 			 * We rely on the hardware being strongly
1028 			 * ordered, that the breadcrumb write is
1029 			 * coherent (visible from the CPU) before the
1030 			 * user interrupt and CSB is processed.
1031 			 */
1032 			GEM_BUG_ON(!i915_request_completed(rq));
1033 
1034 			execlists_context_schedule_out(rq,
1035 						       INTEL_CONTEXT_SCHEDULE_OUT);
1036 			i915_request_put(rq);
1037 
1038 			GEM_TRACE("%s completed ctx=%d\n",
1039 				  engine->name, port->context_id);
1040 
1041 			port = execlists_port_complete(execlists, port);
1042 			if (port_isset(port))
1043 				execlists_user_begin(execlists, port);
1044 			else
1045 				execlists_user_end(execlists);
1046 		} else {
1047 			port_set(port, port_pack(rq, count));
1048 		}
1049 	} while (head != tail);
1050 
1051 	execlists->csb_head = head;
1052 
1053 	/*
1054 	 * Gen11 has proven to fail wrt global observation point between
1055 	 * entry and tail update, failing on the ordering and thus
1056 	 * we see an old entry in the context status buffer.
1057 	 *
1058 	 * Forcibly evict out entries for the next gpu csb update,
1059 	 * to increase the odds that we get a fresh entries with non
1060 	 * working hardware. The cost for doing so comes out mostly with
1061 	 * the wash as hardware, working or not, will need to do the
1062 	 * invalidation before.
1063 	 */
1064 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1065 }
1066 
1067 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1068 {
1069 	lockdep_assert_held(&engine->timeline.lock);
1070 
1071 	process_csb(engine);
1072 	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
1073 		execlists_dequeue(engine);
1074 }
1075 
1076 /*
1077  * Check the unread Context Status Buffers and manage the submission of new
1078  * contexts to the ELSP accordingly.
1079  */
1080 static void execlists_submission_tasklet(unsigned long data)
1081 {
1082 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1083 	unsigned long flags;
1084 
1085 	GEM_TRACE("%s awake?=%d, active=%x\n",
1086 		  engine->name,
1087 		  !!engine->i915->gt.awake,
1088 		  engine->execlists.active);
1089 
1090 	spin_lock_irqsave(&engine->timeline.lock, flags);
1091 	__execlists_submission_tasklet(engine);
1092 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
1093 }
1094 
1095 static void queue_request(struct intel_engine_cs *engine,
1096 			  struct i915_sched_node *node,
1097 			  int prio)
1098 {
1099 	list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1100 }
1101 
1102 static void __submit_queue_imm(struct intel_engine_cs *engine)
1103 {
1104 	struct intel_engine_execlists * const execlists = &engine->execlists;
1105 
1106 	if (reset_in_progress(execlists))
1107 		return; /* defer until we restart the engine following reset */
1108 
1109 	if (execlists->tasklet.func == execlists_submission_tasklet)
1110 		__execlists_submission_tasklet(engine);
1111 	else
1112 		tasklet_hi_schedule(&execlists->tasklet);
1113 }
1114 
1115 static void submit_queue(struct intel_engine_cs *engine, int prio)
1116 {
1117 	if (prio > engine->execlists.queue_priority_hint) {
1118 		engine->execlists.queue_priority_hint = prio;
1119 		__submit_queue_imm(engine);
1120 	}
1121 }
1122 
1123 static void execlists_submit_request(struct i915_request *request)
1124 {
1125 	struct intel_engine_cs *engine = request->engine;
1126 	unsigned long flags;
1127 
1128 	/* Will be called from irq-context when using foreign fences. */
1129 	spin_lock_irqsave(&engine->timeline.lock, flags);
1130 
1131 	queue_request(engine, &request->sched, rq_prio(request));
1132 
1133 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1134 	GEM_BUG_ON(list_empty(&request->sched.link));
1135 
1136 	submit_queue(engine, rq_prio(request));
1137 
1138 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
1139 }
1140 
1141 static void __execlists_context_fini(struct intel_context *ce)
1142 {
1143 	intel_ring_put(ce->ring);
1144 
1145 	GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1146 	i915_gem_object_put(ce->state->obj);
1147 }
1148 
1149 static void execlists_context_destroy(struct kref *kref)
1150 {
1151 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1152 
1153 	GEM_BUG_ON(intel_context_is_pinned(ce));
1154 
1155 	if (ce->state)
1156 		__execlists_context_fini(ce);
1157 
1158 	intel_context_free(ce);
1159 }
1160 
1161 static int __context_pin(struct i915_vma *vma)
1162 {
1163 	unsigned int flags;
1164 	int err;
1165 
1166 	flags = PIN_GLOBAL | PIN_HIGH;
1167 	flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
1168 
1169 	err = i915_vma_pin(vma, 0, 0, flags);
1170 	if (err)
1171 		return err;
1172 
1173 	vma->obj->pin_global++;
1174 	vma->obj->mm.dirty = true;
1175 
1176 	return 0;
1177 }
1178 
1179 static void __context_unpin(struct i915_vma *vma)
1180 {
1181 	vma->obj->pin_global--;
1182 	__i915_vma_unpin(vma);
1183 }
1184 
1185 static void execlists_context_unpin(struct intel_context *ce)
1186 {
1187 	struct intel_engine_cs *engine;
1188 
1189 	/*
1190 	 * The tasklet may still be using a pointer to our state, via an
1191 	 * old request. However, since we know we only unpin the context
1192 	 * on retirement of the following request, we know that the last
1193 	 * request referencing us will have had a completion CS interrupt.
1194 	 * If we see that it is still active, it means that the tasklet hasn't
1195 	 * had the chance to run yet; let it run before we teardown the
1196 	 * reference it may use.
1197 	 */
1198 	engine = READ_ONCE(ce->active);
1199 	if (unlikely(engine)) {
1200 		unsigned long flags;
1201 
1202 		spin_lock_irqsave(&engine->timeline.lock, flags);
1203 		process_csb(engine);
1204 		spin_unlock_irqrestore(&engine->timeline.lock, flags);
1205 
1206 		GEM_BUG_ON(READ_ONCE(ce->active));
1207 	}
1208 
1209 	i915_gem_context_unpin_hw_id(ce->gem_context);
1210 
1211 	intel_ring_unpin(ce->ring);
1212 
1213 	i915_gem_object_unpin_map(ce->state->obj);
1214 	__context_unpin(ce->state);
1215 }
1216 
1217 static void
1218 __execlists_update_reg_state(struct intel_context *ce,
1219 			     struct intel_engine_cs *engine)
1220 {
1221 	struct intel_ring *ring = ce->ring;
1222 	u32 *regs = ce->lrc_reg_state;
1223 
1224 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
1225 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1226 
1227 	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
1228 	regs[CTX_RING_HEAD + 1] = ring->head;
1229 	regs[CTX_RING_TAIL + 1] = ring->tail;
1230 
1231 	/* RPCS */
1232 	if (engine->class == RENDER_CLASS)
1233 		regs[CTX_R_PWR_CLK_STATE + 1] =
1234 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
1235 }
1236 
1237 static int
1238 __execlists_context_pin(struct intel_context *ce,
1239 			struct intel_engine_cs *engine)
1240 {
1241 	void *vaddr;
1242 	int ret;
1243 
1244 	GEM_BUG_ON(!ce->gem_context->ppgtt);
1245 
1246 	ret = execlists_context_deferred_alloc(ce, engine);
1247 	if (ret)
1248 		goto err;
1249 	GEM_BUG_ON(!ce->state);
1250 
1251 	ret = __context_pin(ce->state);
1252 	if (ret)
1253 		goto err;
1254 
1255 	vaddr = i915_gem_object_pin_map(ce->state->obj,
1256 					i915_coherent_map_type(engine->i915) |
1257 					I915_MAP_OVERRIDE);
1258 	if (IS_ERR(vaddr)) {
1259 		ret = PTR_ERR(vaddr);
1260 		goto unpin_vma;
1261 	}
1262 
1263 	ret = intel_ring_pin(ce->ring);
1264 	if (ret)
1265 		goto unpin_map;
1266 
1267 	ret = i915_gem_context_pin_hw_id(ce->gem_context);
1268 	if (ret)
1269 		goto unpin_ring;
1270 
1271 	ce->lrc_desc = lrc_descriptor(ce, engine);
1272 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1273 	__execlists_update_reg_state(ce, engine);
1274 
1275 	return 0;
1276 
1277 unpin_ring:
1278 	intel_ring_unpin(ce->ring);
1279 unpin_map:
1280 	i915_gem_object_unpin_map(ce->state->obj);
1281 unpin_vma:
1282 	__context_unpin(ce->state);
1283 err:
1284 	return ret;
1285 }
1286 
1287 static int execlists_context_pin(struct intel_context *ce)
1288 {
1289 	return __execlists_context_pin(ce, ce->engine);
1290 }
1291 
1292 static void execlists_context_reset(struct intel_context *ce)
1293 {
1294 	/*
1295 	 * Because we emit WA_TAIL_DWORDS there may be a disparity
1296 	 * between our bookkeeping in ce->ring->head and ce->ring->tail and
1297 	 * that stored in context. As we only write new commands from
1298 	 * ce->ring->tail onwards, everything before that is junk. If the GPU
1299 	 * starts reading from its RING_HEAD from the context, it may try to
1300 	 * execute that junk and die.
1301 	 *
1302 	 * The contexts that are stilled pinned on resume belong to the
1303 	 * kernel, and are local to each engine. All other contexts will
1304 	 * have their head/tail sanitized upon pinning before use, so they
1305 	 * will never see garbage,
1306 	 *
1307 	 * So to avoid that we reset the context images upon resume. For
1308 	 * simplicity, we just zero everything out.
1309 	 */
1310 	intel_ring_reset(ce->ring, 0);
1311 	__execlists_update_reg_state(ce, ce->engine);
1312 }
1313 
1314 static const struct intel_context_ops execlists_context_ops = {
1315 	.pin = execlists_context_pin,
1316 	.unpin = execlists_context_unpin,
1317 
1318 	.enter = intel_context_enter_engine,
1319 	.exit = intel_context_exit_engine,
1320 
1321 	.reset = execlists_context_reset,
1322 	.destroy = execlists_context_destroy,
1323 };
1324 
1325 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
1326 {
1327 	u32 *cs;
1328 
1329 	GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
1330 
1331 	cs = intel_ring_begin(rq, 6);
1332 	if (IS_ERR(cs))
1333 		return PTR_ERR(cs);
1334 
1335 	/*
1336 	 * Check if we have been preempted before we even get started.
1337 	 *
1338 	 * After this point i915_request_started() reports true, even if
1339 	 * we get preempted and so are no longer running.
1340 	 */
1341 	*cs++ = MI_ARB_CHECK;
1342 	*cs++ = MI_NOOP;
1343 
1344 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1345 	*cs++ = rq->timeline->hwsp_offset;
1346 	*cs++ = 0;
1347 	*cs++ = rq->fence.seqno - 1;
1348 
1349 	intel_ring_advance(rq, cs);
1350 
1351 	/* Record the updated position of the request's payload */
1352 	rq->infix = intel_ring_offset(rq, cs);
1353 
1354 	return 0;
1355 }
1356 
1357 static int emit_pdps(struct i915_request *rq)
1358 {
1359 	const struct intel_engine_cs * const engine = rq->engine;
1360 	struct i915_hw_ppgtt * const ppgtt = rq->gem_context->ppgtt;
1361 	int err, i;
1362 	u32 *cs;
1363 
1364 	GEM_BUG_ON(intel_vgpu_active(rq->i915));
1365 
1366 	/*
1367 	 * Beware ye of the dragons, this sequence is magic!
1368 	 *
1369 	 * Small changes to this sequence can cause anything from
1370 	 * GPU hangs to forcewake errors and machine lockups!
1371 	 */
1372 
1373 	/* Flush any residual operations from the context load */
1374 	err = engine->emit_flush(rq, EMIT_FLUSH);
1375 	if (err)
1376 		return err;
1377 
1378 	/* Magic required to prevent forcewake errors! */
1379 	err = engine->emit_flush(rq, EMIT_INVALIDATE);
1380 	if (err)
1381 		return err;
1382 
1383 	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
1384 	if (IS_ERR(cs))
1385 		return PTR_ERR(cs);
1386 
1387 	/* Ensure the LRI have landed before we invalidate & continue */
1388 	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
1389 	for (i = GEN8_3LVL_PDPES; i--; ) {
1390 		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1391 		u32 base = engine->mmio_base;
1392 
1393 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
1394 		*cs++ = upper_32_bits(pd_daddr);
1395 		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
1396 		*cs++ = lower_32_bits(pd_daddr);
1397 	}
1398 	*cs++ = MI_NOOP;
1399 
1400 	intel_ring_advance(rq, cs);
1401 
1402 	/* Be doubly sure the LRI have landed before proceeding */
1403 	err = engine->emit_flush(rq, EMIT_FLUSH);
1404 	if (err)
1405 		return err;
1406 
1407 	/* Re-invalidate the TLB for luck */
1408 	return engine->emit_flush(rq, EMIT_INVALIDATE);
1409 }
1410 
1411 static int execlists_request_alloc(struct i915_request *request)
1412 {
1413 	int ret;
1414 
1415 	GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1416 
1417 	/*
1418 	 * Flush enough space to reduce the likelihood of waiting after
1419 	 * we start building the request - in which case we will just
1420 	 * have to repeat work.
1421 	 */
1422 	request->reserved_space += EXECLISTS_REQUEST_SIZE;
1423 
1424 	/*
1425 	 * Note that after this point, we have committed to using
1426 	 * this request as it is being used to both track the
1427 	 * state of engine initialisation and liveness of the
1428 	 * golden renderstate above. Think twice before you try
1429 	 * to cancel/unwind this request now.
1430 	 */
1431 
1432 	/* Unconditionally invalidate GPU caches and TLBs. */
1433 	if (i915_vm_is_4lvl(&request->gem_context->ppgtt->vm))
1434 		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1435 	else
1436 		ret = emit_pdps(request);
1437 	if (ret)
1438 		return ret;
1439 
1440 	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1441 	return 0;
1442 }
1443 
1444 /*
1445  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1446  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1447  * but there is a slight complication as this is applied in WA batch where the
1448  * values are only initialized once so we cannot take register value at the
1449  * beginning and reuse it further; hence we save its value to memory, upload a
1450  * constant value with bit21 set and then we restore it back with the saved value.
1451  * To simplify the WA, a constant value is formed by using the default value
1452  * of this register. This shouldn't be a problem because we are only modifying
1453  * it for a short period and this batch in non-premptible. We can ofcourse
1454  * use additional instructions that read the actual value of the register
1455  * at that time and set our bit of interest but it makes the WA complicated.
1456  *
1457  * This WA is also required for Gen9 so extracting as a function avoids
1458  * code duplication.
1459  */
1460 static u32 *
1461 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1462 {
1463 	/* NB no one else is allowed to scribble over scratch + 256! */
1464 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1465 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1466 	*batch++ = i915_scratch_offset(engine->i915) + 256;
1467 	*batch++ = 0;
1468 
1469 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1470 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1471 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1472 
1473 	batch = gen8_emit_pipe_control(batch,
1474 				       PIPE_CONTROL_CS_STALL |
1475 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1476 				       0);
1477 
1478 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1479 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1480 	*batch++ = i915_scratch_offset(engine->i915) + 256;
1481 	*batch++ = 0;
1482 
1483 	return batch;
1484 }
1485 
1486 /*
1487  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1488  * initialized at the beginning and shared across all contexts but this field
1489  * helps us to have multiple batches at different offsets and select them based
1490  * on a criteria. At the moment this batch always start at the beginning of the page
1491  * and at this point we don't have multiple wa_ctx batch buffers.
1492  *
1493  * The number of WA applied are not known at the beginning; we use this field
1494  * to return the no of DWORDS written.
1495  *
1496  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1497  * so it adds NOOPs as padding to make it cacheline aligned.
1498  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1499  * makes a complete batch buffer.
1500  */
1501 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1502 {
1503 	/* WaDisableCtxRestoreArbitration:bdw,chv */
1504 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1505 
1506 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1507 	if (IS_BROADWELL(engine->i915))
1508 		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1509 
1510 	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1511 	/* Actual scratch location is at 128 bytes offset */
1512 	batch = gen8_emit_pipe_control(batch,
1513 				       PIPE_CONTROL_FLUSH_L3 |
1514 				       PIPE_CONTROL_GLOBAL_GTT_IVB |
1515 				       PIPE_CONTROL_CS_STALL |
1516 				       PIPE_CONTROL_QW_WRITE,
1517 				       i915_scratch_offset(engine->i915) +
1518 				       2 * CACHELINE_BYTES);
1519 
1520 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1521 
1522 	/* Pad to end of cacheline */
1523 	while ((unsigned long)batch % CACHELINE_BYTES)
1524 		*batch++ = MI_NOOP;
1525 
1526 	/*
1527 	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1528 	 * execution depends on the length specified in terms of cache lines
1529 	 * in the register CTX_RCS_INDIRECT_CTX
1530 	 */
1531 
1532 	return batch;
1533 }
1534 
1535 struct lri {
1536 	i915_reg_t reg;
1537 	u32 value;
1538 };
1539 
1540 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1541 {
1542 	GEM_BUG_ON(!count || count > 63);
1543 
1544 	*batch++ = MI_LOAD_REGISTER_IMM(count);
1545 	do {
1546 		*batch++ = i915_mmio_reg_offset(lri->reg);
1547 		*batch++ = lri->value;
1548 	} while (lri++, --count);
1549 	*batch++ = MI_NOOP;
1550 
1551 	return batch;
1552 }
1553 
1554 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1555 {
1556 	static const struct lri lri[] = {
1557 		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1558 		{
1559 			COMMON_SLICE_CHICKEN2,
1560 			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1561 				       0),
1562 		},
1563 
1564 		/* BSpec: 11391 */
1565 		{
1566 			FF_SLICE_CHICKEN,
1567 			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1568 				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1569 		},
1570 
1571 		/* BSpec: 11299 */
1572 		{
1573 			_3D_CHICKEN3,
1574 			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1575 				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1576 		}
1577 	};
1578 
1579 	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1580 
1581 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1582 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1583 
1584 	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1585 
1586 	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1587 	if (HAS_POOLED_EU(engine->i915)) {
1588 		/*
1589 		 * EU pool configuration is setup along with golden context
1590 		 * during context initialization. This value depends on
1591 		 * device type (2x6 or 3x6) and needs to be updated based
1592 		 * on which subslice is disabled especially for 2x6
1593 		 * devices, however it is safe to load default
1594 		 * configuration of 3x6 device instead of masking off
1595 		 * corresponding bits because HW ignores bits of a disabled
1596 		 * subslice and drops down to appropriate config. Please
1597 		 * see render_state_setup() in i915_gem_render_state.c for
1598 		 * possible configurations, to avoid duplication they are
1599 		 * not shown here again.
1600 		 */
1601 		*batch++ = GEN9_MEDIA_POOL_STATE;
1602 		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1603 		*batch++ = 0x00777000;
1604 		*batch++ = 0;
1605 		*batch++ = 0;
1606 		*batch++ = 0;
1607 	}
1608 
1609 	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1610 
1611 	/* Pad to end of cacheline */
1612 	while ((unsigned long)batch % CACHELINE_BYTES)
1613 		*batch++ = MI_NOOP;
1614 
1615 	return batch;
1616 }
1617 
1618 static u32 *
1619 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1620 {
1621 	int i;
1622 
1623 	/*
1624 	 * WaPipeControlBefore3DStateSamplePattern: cnl
1625 	 *
1626 	 * Ensure the engine is idle prior to programming a
1627 	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1628 	 */
1629 	batch = gen8_emit_pipe_control(batch,
1630 				       PIPE_CONTROL_CS_STALL,
1631 				       0);
1632 	/*
1633 	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1634 	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1635 	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1636 	 * confusing. Since gen8_emit_pipe_control() already advances the
1637 	 * batch by 6 dwords, we advance the other 10 here, completing a
1638 	 * cacheline. It's not clear if the workaround requires this padding
1639 	 * before other commands, or if it's just the regular padding we would
1640 	 * already have for the workaround bb, so leave it here for now.
1641 	 */
1642 	for (i = 0; i < 10; i++)
1643 		*batch++ = MI_NOOP;
1644 
1645 	/* Pad to end of cacheline */
1646 	while ((unsigned long)batch % CACHELINE_BYTES)
1647 		*batch++ = MI_NOOP;
1648 
1649 	return batch;
1650 }
1651 
1652 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
1653 
1654 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
1655 {
1656 	struct drm_i915_gem_object *obj;
1657 	struct i915_vma *vma;
1658 	int err;
1659 
1660 	obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
1661 	if (IS_ERR(obj))
1662 		return PTR_ERR(obj);
1663 
1664 	vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
1665 	if (IS_ERR(vma)) {
1666 		err = PTR_ERR(vma);
1667 		goto err;
1668 	}
1669 
1670 	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1671 	if (err)
1672 		goto err;
1673 
1674 	engine->wa_ctx.vma = vma;
1675 	return 0;
1676 
1677 err:
1678 	i915_gem_object_put(obj);
1679 	return err;
1680 }
1681 
1682 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
1683 {
1684 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1685 }
1686 
1687 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1688 
1689 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
1690 {
1691 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1692 	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
1693 					    &wa_ctx->per_ctx };
1694 	wa_bb_func_t wa_bb_fn[2];
1695 	struct page *page;
1696 	void *batch, *batch_ptr;
1697 	unsigned int i;
1698 	int ret;
1699 
1700 	if (GEM_DEBUG_WARN_ON(engine->id != RCS0))
1701 		return -EINVAL;
1702 
1703 	switch (INTEL_GEN(engine->i915)) {
1704 	case 11:
1705 		return 0;
1706 	case 10:
1707 		wa_bb_fn[0] = gen10_init_indirectctx_bb;
1708 		wa_bb_fn[1] = NULL;
1709 		break;
1710 	case 9:
1711 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1712 		wa_bb_fn[1] = NULL;
1713 		break;
1714 	case 8:
1715 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1716 		wa_bb_fn[1] = NULL;
1717 		break;
1718 	default:
1719 		MISSING_CASE(INTEL_GEN(engine->i915));
1720 		return 0;
1721 	}
1722 
1723 	ret = lrc_setup_wa_ctx(engine);
1724 	if (ret) {
1725 		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
1726 		return ret;
1727 	}
1728 
1729 	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
1730 	batch = batch_ptr = kmap_atomic(page);
1731 
1732 	/*
1733 	 * Emit the two workaround batch buffers, recording the offset from the
1734 	 * start of the workaround batch buffer object for each and their
1735 	 * respective sizes.
1736 	 */
1737 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1738 		wa_bb[i]->offset = batch_ptr - batch;
1739 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1740 						  CACHELINE_BYTES))) {
1741 			ret = -EINVAL;
1742 			break;
1743 		}
1744 		if (wa_bb_fn[i])
1745 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1746 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1747 	}
1748 
1749 	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
1750 
1751 	kunmap_atomic(batch);
1752 	if (ret)
1753 		lrc_destroy_wa_ctx(engine);
1754 
1755 	return ret;
1756 }
1757 
1758 static void enable_execlists(struct intel_engine_cs *engine)
1759 {
1760 	struct drm_i915_private *dev_priv = engine->i915;
1761 
1762 	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
1763 
1764 	if (INTEL_GEN(dev_priv) >= 11)
1765 		I915_WRITE(RING_MODE_GEN7(engine),
1766 			   _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
1767 	else
1768 		I915_WRITE(RING_MODE_GEN7(engine),
1769 			   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
1770 
1771 	I915_WRITE(RING_MI_MODE(engine->mmio_base),
1772 		   _MASKED_BIT_DISABLE(STOP_RING));
1773 
1774 	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
1775 		   i915_ggtt_offset(engine->status_page.vma));
1776 	POSTING_READ(RING_HWS_PGA(engine->mmio_base));
1777 }
1778 
1779 static bool unexpected_starting_state(struct intel_engine_cs *engine)
1780 {
1781 	struct drm_i915_private *dev_priv = engine->i915;
1782 	bool unexpected = false;
1783 
1784 	if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
1785 		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
1786 		unexpected = true;
1787 	}
1788 
1789 	return unexpected;
1790 }
1791 
1792 static int execlists_resume(struct intel_engine_cs *engine)
1793 {
1794 	intel_engine_apply_workarounds(engine);
1795 	intel_engine_apply_whitelist(engine);
1796 
1797 	intel_mocs_init_engine(engine);
1798 
1799 	intel_engine_reset_breadcrumbs(engine);
1800 
1801 	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
1802 		struct drm_printer p = drm_debug_printer(__func__);
1803 
1804 		intel_engine_dump(engine, &p, NULL);
1805 	}
1806 
1807 	enable_execlists(engine);
1808 
1809 	return 0;
1810 }
1811 
1812 static void execlists_reset_prepare(struct intel_engine_cs *engine)
1813 {
1814 	struct intel_engine_execlists * const execlists = &engine->execlists;
1815 	unsigned long flags;
1816 
1817 	GEM_TRACE("%s: depth<-%d\n", engine->name,
1818 		  atomic_read(&execlists->tasklet.count));
1819 
1820 	/*
1821 	 * Prevent request submission to the hardware until we have
1822 	 * completed the reset in i915_gem_reset_finish(). If a request
1823 	 * is completed by one engine, it may then queue a request
1824 	 * to a second via its execlists->tasklet *just* as we are
1825 	 * calling engine->resume() and also writing the ELSP.
1826 	 * Turning off the execlists->tasklet until the reset is over
1827 	 * prevents the race.
1828 	 */
1829 	__tasklet_disable_sync_once(&execlists->tasklet);
1830 	GEM_BUG_ON(!reset_in_progress(execlists));
1831 
1832 	intel_engine_stop_cs(engine);
1833 
1834 	/* And flush any current direct submission. */
1835 	spin_lock_irqsave(&engine->timeline.lock, flags);
1836 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
1837 }
1838 
1839 static bool lrc_regs_ok(const struct i915_request *rq)
1840 {
1841 	const struct intel_ring *ring = rq->ring;
1842 	const u32 *regs = rq->hw_context->lrc_reg_state;
1843 
1844 	/* Quick spot check for the common signs of context corruption */
1845 
1846 	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
1847 	    (RING_CTL_SIZE(ring->size) | RING_VALID))
1848 		return false;
1849 
1850 	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
1851 		return false;
1852 
1853 	return true;
1854 }
1855 
1856 static void reset_csb_pointers(struct intel_engine_execlists *execlists)
1857 {
1858 	const unsigned int reset_value = execlists->csb_size - 1;
1859 
1860 	/*
1861 	 * After a reset, the HW starts writing into CSB entry [0]. We
1862 	 * therefore have to set our HEAD pointer back one entry so that
1863 	 * the *first* entry we check is entry 0. To complicate this further,
1864 	 * as we don't wait for the first interrupt after reset, we have to
1865 	 * fake the HW write to point back to the last entry so that our
1866 	 * inline comparison of our cached head position against the last HW
1867 	 * write works even before the first interrupt.
1868 	 */
1869 	execlists->csb_head = reset_value;
1870 	WRITE_ONCE(*execlists->csb_write, reset_value);
1871 	wmb(); /* Make sure this is visible to HW (paranoia?) */
1872 
1873 	invalidate_csb_entries(&execlists->csb_status[0],
1874 			       &execlists->csb_status[reset_value]);
1875 }
1876 
1877 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
1878 {
1879 	struct intel_engine_execlists * const execlists = &engine->execlists;
1880 	struct intel_context *ce;
1881 	struct i915_request *rq;
1882 	u32 *regs;
1883 
1884 	process_csb(engine); /* drain preemption events */
1885 
1886 	/* Following the reset, we need to reload the CSB read/write pointers */
1887 	reset_csb_pointers(&engine->execlists);
1888 
1889 	/*
1890 	 * Save the currently executing context, even if we completed
1891 	 * its request, it was still running at the time of the
1892 	 * reset and will have been clobbered.
1893 	 */
1894 	if (!port_isset(execlists->port))
1895 		goto out_clear;
1896 
1897 	ce = port_request(execlists->port)->hw_context;
1898 
1899 	/*
1900 	 * Catch up with any missed context-switch interrupts.
1901 	 *
1902 	 * Ideally we would just read the remaining CSB entries now that we
1903 	 * know the gpu is idle. However, the CSB registers are sometimes^W
1904 	 * often trashed across a GPU reset! Instead we have to rely on
1905 	 * guessing the missed context-switch events by looking at what
1906 	 * requests were completed.
1907 	 */
1908 	execlists_cancel_port_requests(execlists);
1909 
1910 	/* Push back any incomplete requests for replay after the reset. */
1911 	rq = __unwind_incomplete_requests(engine);
1912 	if (!rq)
1913 		goto out_replay;
1914 
1915 	if (rq->hw_context != ce) { /* caught just before a CS event */
1916 		rq = NULL;
1917 		goto out_replay;
1918 	}
1919 
1920 	/*
1921 	 * If this request hasn't started yet, e.g. it is waiting on a
1922 	 * semaphore, we need to avoid skipping the request or else we
1923 	 * break the signaling chain. However, if the context is corrupt
1924 	 * the request will not restart and we will be stuck with a wedged
1925 	 * device. It is quite often the case that if we issue a reset
1926 	 * while the GPU is loading the context image, that the context
1927 	 * image becomes corrupt.
1928 	 *
1929 	 * Otherwise, if we have not started yet, the request should replay
1930 	 * perfectly and we do not need to flag the result as being erroneous.
1931 	 */
1932 	if (!i915_request_started(rq) && lrc_regs_ok(rq))
1933 		goto out_replay;
1934 
1935 	/*
1936 	 * If the request was innocent, we leave the request in the ELSP
1937 	 * and will try to replay it on restarting. The context image may
1938 	 * have been corrupted by the reset, in which case we may have
1939 	 * to service a new GPU hang, but more likely we can continue on
1940 	 * without impact.
1941 	 *
1942 	 * If the request was guilty, we presume the context is corrupt
1943 	 * and have to at least restore the RING register in the context
1944 	 * image back to the expected values to skip over the guilty request.
1945 	 */
1946 	i915_reset_request(rq, stalled);
1947 	if (!stalled && lrc_regs_ok(rq))
1948 		goto out_replay;
1949 
1950 	/*
1951 	 * We want a simple context + ring to execute the breadcrumb update.
1952 	 * We cannot rely on the context being intact across the GPU hang,
1953 	 * so clear it and rebuild just what we need for the breadcrumb.
1954 	 * All pending requests for this context will be zapped, and any
1955 	 * future request will be after userspace has had the opportunity
1956 	 * to recreate its own state.
1957 	 */
1958 	regs = ce->lrc_reg_state;
1959 	if (engine->pinned_default_state) {
1960 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1961 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1962 		       engine->context_size - PAGE_SIZE);
1963 	}
1964 	execlists_init_reg_state(regs, ce, engine, ce->ring);
1965 
1966 	/* Rerun the request; its payload has been neutered (if guilty). */
1967 out_replay:
1968 	ce->ring->head =
1969 		rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail;
1970 	intel_ring_update_space(ce->ring);
1971 	__execlists_update_reg_state(ce, engine);
1972 
1973 out_clear:
1974 	execlists_clear_all_active(execlists);
1975 }
1976 
1977 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
1978 {
1979 	unsigned long flags;
1980 
1981 	GEM_TRACE("%s\n", engine->name);
1982 
1983 	spin_lock_irqsave(&engine->timeline.lock, flags);
1984 
1985 	__execlists_reset(engine, stalled);
1986 
1987 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
1988 }
1989 
/*
 * Tasklet body installed by execlists_cancel_requests(): the driver is
 * wedged, so no further events are processed.
 */
static void nop_submission_tasklet(unsigned long data)
{
}
1994 
1995 static void execlists_cancel_requests(struct intel_engine_cs *engine)
1996 {
1997 	struct intel_engine_execlists * const execlists = &engine->execlists;
1998 	struct i915_request *rq, *rn;
1999 	struct rb_node *rb;
2000 	unsigned long flags;
2001 
2002 	GEM_TRACE("%s\n", engine->name);
2003 
2004 	/*
2005 	 * Before we call engine->cancel_requests(), we should have exclusive
2006 	 * access to the submission state. This is arranged for us by the
2007 	 * caller disabling the interrupt generation, the tasklet and other
2008 	 * threads that may then access the same state, giving us a free hand
2009 	 * to reset state. However, we still need to let lockdep be aware that
2010 	 * we know this state may be accessed in hardirq context, so we
2011 	 * disable the irq around this manipulation and we want to keep
2012 	 * the spinlock focused on its duties and not accidentally conflate
2013 	 * coverage to the submission's irq state. (Similarly, although we
2014 	 * shouldn't need to disable irq around the manipulation of the
2015 	 * submission's irq state, we also wish to remind ourselves that
2016 	 * it is irq state.)
2017 	 */
2018 	spin_lock_irqsave(&engine->timeline.lock, flags);
2019 
2020 	__execlists_reset(engine, true);
2021 
2022 	/* Mark all executing requests as skipped. */
2023 	list_for_each_entry(rq, &engine->timeline.requests, link) {
2024 		if (!i915_request_signaled(rq))
2025 			dma_fence_set_error(&rq->fence, -EIO);
2026 
2027 		i915_request_mark_complete(rq);
2028 	}
2029 
2030 	/* Flush the queued requests to the timeline list (for retiring). */
2031 	while ((rb = rb_first_cached(&execlists->queue))) {
2032 		struct i915_priolist *p = to_priolist(rb);
2033 		int i;
2034 
2035 		priolist_for_each_request_consume(rq, rn, p, i) {
2036 			list_del_init(&rq->sched.link);
2037 			__i915_request_submit(rq);
2038 			dma_fence_set_error(&rq->fence, -EIO);
2039 			i915_request_mark_complete(rq);
2040 		}
2041 
2042 		rb_erase_cached(&p->node, &execlists->queue);
2043 		i915_priolist_free(p);
2044 	}
2045 
2046 	/* Remaining _unready_ requests will be nop'ed when submitted */
2047 
2048 	execlists->queue_priority_hint = INT_MIN;
2049 	execlists->queue = RB_ROOT_CACHED;
2050 	GEM_BUG_ON(port_isset(execlists->port));
2051 
2052 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2053 	execlists->tasklet.func = nop_submission_tasklet;
2054 
2055 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
2056 }
2057 
2058 static void execlists_reset_finish(struct intel_engine_cs *engine)
2059 {
2060 	struct intel_engine_execlists * const execlists = &engine->execlists;
2061 
2062 	/*
2063 	 * After a GPU reset, we may have requests to replay. Do so now while
2064 	 * we still have the forcewake to be sure that the GPU is not allowed
2065 	 * to sleep before we restart and reload a context.
2066 	 */
2067 	GEM_BUG_ON(!reset_in_progress(execlists));
2068 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2069 		execlists->tasklet.func(execlists->tasklet.data);
2070 
2071 	if (__tasklet_enable(&execlists->tasklet))
2072 		/* And kick in case we missed a new request submission. */
2073 		tasklet_hi_schedule(&execlists->tasklet);
2074 	GEM_TRACE("%s: depth->%d\n", engine->name,
2075 		  atomic_read(&execlists->tasklet.count));
2076 }
2077 
2078 static int gen8_emit_bb_start(struct i915_request *rq,
2079 			      u64 offset, u32 len,
2080 			      const unsigned int flags)
2081 {
2082 	u32 *cs;
2083 
2084 	cs = intel_ring_begin(rq, 4);
2085 	if (IS_ERR(cs))
2086 		return PTR_ERR(cs);
2087 
2088 	/*
2089 	 * WaDisableCtxRestoreArbitration:bdw,chv
2090 	 *
2091 	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
2092 	 * particular all the gen that do not need the w/a at all!), if we
2093 	 * took care to make sure that on every switch into this context
2094 	 * (both ordinary and for preemption) that arbitrartion was enabled
2095 	 * we would be fine.  However, for gen8 there is another w/a that
2096 	 * requires us to not preempt inside GPGPU execution, so we keep
2097 	 * arbitration disabled for gen8 batches. Arbitration will be
2098 	 * re-enabled before we close the request
2099 	 * (engine->emit_fini_breadcrumb).
2100 	 */
2101 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2102 
2103 	/* FIXME(BDW+): Address space and security selectors. */
2104 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
2105 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2106 	*cs++ = lower_32_bits(offset);
2107 	*cs++ = upper_32_bits(offset);
2108 
2109 	intel_ring_advance(rq, cs);
2110 
2111 	return 0;
2112 }
2113 
2114 static int gen9_emit_bb_start(struct i915_request *rq,
2115 			      u64 offset, u32 len,
2116 			      const unsigned int flags)
2117 {
2118 	u32 *cs;
2119 
2120 	cs = intel_ring_begin(rq, 6);
2121 	if (IS_ERR(cs))
2122 		return PTR_ERR(cs);
2123 
2124 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2125 
2126 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
2127 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2128 	*cs++ = lower_32_bits(offset);
2129 	*cs++ = upper_32_bits(offset);
2130 
2131 	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2132 	*cs++ = MI_NOOP;
2133 
2134 	intel_ring_advance(rq, cs);
2135 
2136 	return 0;
2137 }
2138 
2139 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
2140 {
2141 	ENGINE_WRITE(engine, RING_IMR,
2142 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
2143 	ENGINE_POSTING_READ(engine, RING_IMR);
2144 }
2145 
2146 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
2147 {
2148 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
2149 }
2150 
2151 static int gen8_emit_flush(struct i915_request *request, u32 mode)
2152 {
2153 	u32 cmd, *cs;
2154 
2155 	cs = intel_ring_begin(request, 4);
2156 	if (IS_ERR(cs))
2157 		return PTR_ERR(cs);
2158 
2159 	cmd = MI_FLUSH_DW + 1;
2160 
2161 	/* We always require a command barrier so that subsequent
2162 	 * commands, such as breadcrumb interrupts, are strictly ordered
2163 	 * wrt the contents of the write cache being flushed to memory
2164 	 * (and thus being coherent from the CPU).
2165 	 */
2166 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2167 
2168 	if (mode & EMIT_INVALIDATE) {
2169 		cmd |= MI_INVALIDATE_TLB;
2170 		if (request->engine->class == VIDEO_DECODE_CLASS)
2171 			cmd |= MI_INVALIDATE_BSD;
2172 	}
2173 
2174 	*cs++ = cmd;
2175 	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2176 	*cs++ = 0; /* upper addr */
2177 	*cs++ = 0; /* value */
2178 	intel_ring_advance(request, cs);
2179 
2180 	return 0;
2181 }
2182 
2183 static int gen8_emit_flush_render(struct i915_request *request,
2184 				  u32 mode)
2185 {
2186 	struct intel_engine_cs *engine = request->engine;
2187 	u32 scratch_addr =
2188 		i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES;
2189 	bool vf_flush_wa = false, dc_flush_wa = false;
2190 	u32 *cs, flags = 0;
2191 	int len;
2192 
2193 	flags |= PIPE_CONTROL_CS_STALL;
2194 
2195 	if (mode & EMIT_FLUSH) {
2196 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2197 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2198 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2199 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
2200 	}
2201 
2202 	if (mode & EMIT_INVALIDATE) {
2203 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
2204 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2205 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2206 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2207 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2208 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2209 		flags |= PIPE_CONTROL_QW_WRITE;
2210 		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2211 
2212 		/*
2213 		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2214 		 * pipe control.
2215 		 */
2216 		if (IS_GEN(request->i915, 9))
2217 			vf_flush_wa = true;
2218 
2219 		/* WaForGAMHang:kbl */
2220 		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2221 			dc_flush_wa = true;
2222 	}
2223 
2224 	len = 6;
2225 
2226 	if (vf_flush_wa)
2227 		len += 6;
2228 
2229 	if (dc_flush_wa)
2230 		len += 12;
2231 
2232 	cs = intel_ring_begin(request, len);
2233 	if (IS_ERR(cs))
2234 		return PTR_ERR(cs);
2235 
2236 	if (vf_flush_wa)
2237 		cs = gen8_emit_pipe_control(cs, 0, 0);
2238 
2239 	if (dc_flush_wa)
2240 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2241 					    0);
2242 
2243 	cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2244 
2245 	if (dc_flush_wa)
2246 		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2247 
2248 	intel_ring_advance(request, cs);
2249 
2250 	return 0;
2251 }
2252 
2253 /*
2254  * Reserve space for 2 NOOPs at the end of each request to be
2255  * used as a workaround for not being allowed to do lite
2256  * restore with HEAD==TAIL (WaIdleLiteRestore).
2257  */
2258 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2259 {
2260 	/* Ensure there's always at least one preemption point per-request. */
2261 	*cs++ = MI_ARB_CHECK;
2262 	*cs++ = MI_NOOP;
2263 	request->wa_tail = intel_ring_offset(request, cs);
2264 
2265 	return cs;
2266 }
2267 
2268 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
2269 {
2270 	cs = gen8_emit_ggtt_write(cs,
2271 				  request->fence.seqno,
2272 				  request->timeline->hwsp_offset,
2273 				  0);
2274 
2275 	cs = gen8_emit_ggtt_write(cs,
2276 				  intel_engine_next_hangcheck_seqno(request->engine),
2277 				  I915_GEM_HWS_HANGCHECK_ADDR,
2278 				  MI_FLUSH_DW_STORE_INDEX);
2279 
2280 
2281 	*cs++ = MI_USER_INTERRUPT;
2282 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2283 
2284 	request->tail = intel_ring_offset(request, cs);
2285 	assert_ring_tail_valid(request->ring, request->tail);
2286 
2287 	return gen8_emit_wa_tail(request, cs);
2288 }
2289 
2290 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2291 {
2292 	cs = gen8_emit_ggtt_write_rcs(cs,
2293 				      request->fence.seqno,
2294 				      request->timeline->hwsp_offset,
2295 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2296 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2297 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
2298 				      PIPE_CONTROL_FLUSH_ENABLE |
2299 				      PIPE_CONTROL_CS_STALL);
2300 
2301 	cs = gen8_emit_ggtt_write_rcs(cs,
2302 				      intel_engine_next_hangcheck_seqno(request->engine),
2303 				      I915_GEM_HWS_HANGCHECK_ADDR,
2304 				      PIPE_CONTROL_STORE_DATA_INDEX);
2305 
2306 	*cs++ = MI_USER_INTERRUPT;
2307 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2308 
2309 	request->tail = intel_ring_offset(request, cs);
2310 	assert_ring_tail_valid(request->ring, request->tail);
2311 
2312 	return gen8_emit_wa_tail(request, cs);
2313 }
2314 
/*
 * Initialise a render context through @rq: emit the context workarounds,
 * program the MOCS and finally emit the render state.
 *
 * Returns the workaround error if that step fails, otherwise the result
 * of i915_gem_render_state_emit(). A MOCS failure is only logged.
 */
static int gen8_init_rcs_context(struct i915_request *rq)
{
	int err;

	err = intel_engine_emit_ctx_wa(rq);
	if (err)
		return err;

	/*
	 * Failing to program the MOCS is non-fatal. The system will not
	 * run at peak performance. So generate an error and carry on.
	 */
	err = intel_rcs_context_init_mocs(rq);
	if (err)
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return i915_gem_render_state_emit(rq);
}
2333 
2334 /**
2335  * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
2336  * @engine: Engine Command Streamer.
2337  */
2338 void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
2339 {
2340 	struct drm_i915_private *dev_priv;
2341 
2342 	/*
2343 	 * Tasklet cannot be active at this point due intel_mark_active/idle
2344 	 * so this is just for documentation.
2345 	 */
2346 	if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
2347 			     &engine->execlists.tasklet.state)))
2348 		tasklet_kill(&engine->execlists.tasklet);
2349 
2350 	dev_priv = engine->i915;
2351 
2352 	if (engine->buffer) {
2353 		WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
2354 	}
2355 
2356 	if (engine->cleanup)
2357 		engine->cleanup(engine);
2358 
2359 	intel_engine_cleanup_common(engine);
2360 
2361 	lrc_destroy_wa_ctx(engine);
2362 
2363 	engine->i915 = NULL;
2364 	dev_priv->engine[engine->id] = NULL;
2365 	kfree(engine);
2366 }
2367 
2368 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2369 {
2370 	engine->submit_request = execlists_submit_request;
2371 	engine->cancel_requests = execlists_cancel_requests;
2372 	engine->schedule = i915_schedule;
2373 	engine->execlists.tasklet.func = execlists_submission_tasklet;
2374 
2375 	engine->reset.prepare = execlists_reset_prepare;
2376 	engine->reset.reset = execlists_reset;
2377 	engine->reset.finish = execlists_reset_finish;
2378 
2379 	engine->park = NULL;
2380 	engine->unpark = NULL;
2381 
2382 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2383 	if (!intel_vgpu_active(engine->i915))
2384 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
2385 	if (engine->preempt_context &&
2386 	    HAS_LOGICAL_RING_PREEMPTION(engine->i915))
2387 		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
2388 }
2389 
2390 static void
2391 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
2392 {
2393 	/* Default vfuncs which can be overriden by each engine. */
2394 	engine->resume = execlists_resume;
2395 
2396 	engine->reset.prepare = execlists_reset_prepare;
2397 	engine->reset.reset = execlists_reset;
2398 	engine->reset.finish = execlists_reset_finish;
2399 
2400 	engine->cops = &execlists_context_ops;
2401 	engine->request_alloc = execlists_request_alloc;
2402 
2403 	engine->emit_flush = gen8_emit_flush;
2404 	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
2405 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
2406 
2407 	engine->set_default_submission = intel_execlists_set_default_submission;
2408 
2409 	if (INTEL_GEN(engine->i915) < 11) {
2410 		engine->irq_enable = gen8_logical_ring_enable_irq;
2411 		engine->irq_disable = gen8_logical_ring_disable_irq;
2412 	} else {
2413 		/*
2414 		 * TODO: On Gen11 interrupt masks need to be clear
2415 		 * to allow C6 entry. Keep interrupts enabled at
2416 		 * and take the hit of generating extra interrupts
2417 		 * until a more refined solution exists.
2418 		 */
2419 	}
2420 	if (IS_GEN(engine->i915, 8))
2421 		engine->emit_bb_start = gen8_emit_bb_start;
2422 	else
2423 		engine->emit_bb_start = gen9_emit_bb_start;
2424 }
2425 
2426 static inline void
2427 logical_ring_default_irqs(struct intel_engine_cs *engine)
2428 {
2429 	unsigned int shift = 0;
2430 
2431 	if (INTEL_GEN(engine->i915) < 11) {
2432 		const u8 irq_shifts[] = {
2433 			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
2434 			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
2435 			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
2436 			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
2437 			[VECS0] = GEN8_VECS_IRQ_SHIFT,
2438 		};
2439 
2440 		shift = irq_shifts[engine->id];
2441 	}
2442 
2443 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2444 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
2445 }
2446 
2447 static int
2448 logical_ring_setup(struct intel_engine_cs *engine)
2449 {
2450 	int err;
2451 
2452 	err = intel_engine_setup_common(engine);
2453 	if (err)
2454 		return err;
2455 
2456 	/* Intentionally left blank. */
2457 	engine->buffer = NULL;
2458 
2459 	tasklet_init(&engine->execlists.tasklet,
2460 		     execlists_submission_tasklet, (unsigned long)engine);
2461 
2462 	logical_ring_default_vfuncs(engine);
2463 	logical_ring_default_irqs(engine);
2464 
2465 	return 0;
2466 }
2467 
2468 static int logical_ring_init(struct intel_engine_cs *engine)
2469 {
2470 	struct drm_i915_private *i915 = engine->i915;
2471 	struct intel_engine_execlists * const execlists = &engine->execlists;
2472 	u32 base = engine->mmio_base;
2473 	int ret;
2474 
2475 	ret = intel_engine_init_common(engine);
2476 	if (ret)
2477 		return ret;
2478 
2479 	intel_engine_init_workarounds(engine);
2480 
2481 	if (HAS_LOGICAL_RING_ELSQ(i915)) {
2482 		execlists->submit_reg = i915->uncore.regs +
2483 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
2484 		execlists->ctrl_reg = i915->uncore.regs +
2485 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
2486 	} else {
2487 		execlists->submit_reg = i915->uncore.regs +
2488 			i915_mmio_reg_offset(RING_ELSP(base));
2489 	}
2490 
2491 	execlists->preempt_complete_status = ~0u;
2492 	if (engine->preempt_context)
2493 		execlists->preempt_complete_status =
2494 			upper_32_bits(engine->preempt_context->lrc_desc);
2495 
2496 	execlists->csb_status =
2497 		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
2498 
2499 	execlists->csb_write =
2500 		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
2501 
2502 	if (INTEL_GEN(engine->i915) < 11)
2503 		execlists->csb_size = GEN8_CSB_ENTRIES;
2504 	else
2505 		execlists->csb_size = GEN11_CSB_ENTRIES;
2506 
2507 	reset_csb_pointers(execlists);
2508 
2509 	return 0;
2510 }
2511 
2512 int logical_render_ring_init(struct intel_engine_cs *engine)
2513 {
2514 	int ret;
2515 
2516 	ret = logical_ring_setup(engine);
2517 	if (ret)
2518 		return ret;
2519 
2520 	/* Override some for render ring. */
2521 	engine->init_context = gen8_init_rcs_context;
2522 	engine->emit_flush = gen8_emit_flush_render;
2523 	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
2524 
2525 	ret = logical_ring_init(engine);
2526 	if (ret)
2527 		return ret;
2528 
2529 	ret = intel_init_workaround_bb(engine);
2530 	if (ret) {
2531 		/*
2532 		 * We continue even if we fail to initialize WA batch
2533 		 * because we only expect rare glitches but nothing
2534 		 * critical to prevent us from using GPU
2535 		 */
2536 		DRM_ERROR("WA batch buffer initialization failed: %d\n",
2537 			  ret);
2538 	}
2539 
2540 	intel_engine_init_whitelist(engine);
2541 
2542 	return 0;
2543 }
2544 
/*
 * Initialise a non-render engine for logical ring submission using the
 * default vfuncs.
 *
 * Returns 0 on success or the error from the setup/init steps.
 */
int logical_xcs_ring_init(struct intel_engine_cs *engine)
{
	int ret = logical_ring_setup(engine);

	if (ret)
		return ret;

	return logical_ring_init(engine);
}
2555 
2556 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
2557 {
2558 	u32 indirect_ctx_offset;
2559 
2560 	switch (INTEL_GEN(engine->i915)) {
2561 	default:
2562 		MISSING_CASE(INTEL_GEN(engine->i915));
2563 		/* fall through */
2564 	case 11:
2565 		indirect_ctx_offset =
2566 			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2567 		break;
2568 	case 10:
2569 		indirect_ctx_offset =
2570 			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2571 		break;
2572 	case 9:
2573 		indirect_ctx_offset =
2574 			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2575 		break;
2576 	case 8:
2577 		indirect_ctx_offset =
2578 			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2579 		break;
2580 	}
2581 
2582 	return indirect_ctx_offset;
2583 }
2584 
2585 static void execlists_init_reg_state(u32 *regs,
2586 				     struct intel_context *ce,
2587 				     struct intel_engine_cs *engine,
2588 				     struct intel_ring *ring)
2589 {
2590 	struct i915_hw_ppgtt *ppgtt = ce->gem_context->ppgtt;
2591 	bool rcs = engine->class == RENDER_CLASS;
2592 	u32 base = engine->mmio_base;
2593 
2594 	/* A context is actually a big batch buffer with several
2595 	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
2596 	 * values we are setting here are only for the first context restore:
2597 	 * on a subsequent save, the GPU will recreate this batchbuffer with new
2598 	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
2599 	 * we are not initializing here).
2600 	 */
2601 	regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
2602 				 MI_LRI_FORCE_POSTED;
2603 
2604 	CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
2605 		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
2606 		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
2607 	if (INTEL_GEN(engine->i915) < 11) {
2608 		regs[CTX_CONTEXT_CONTROL + 1] |=
2609 			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
2610 					    CTX_CTRL_RS_CTX_ENABLE);
2611 	}
2612 	CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
2613 	CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
2614 	CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
2615 	CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
2616 		RING_CTL_SIZE(ring->size) | RING_VALID);
2617 	CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
2618 	CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
2619 	CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
2620 	CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
2621 	CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
2622 	CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
2623 	if (rcs) {
2624 		struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2625 
2626 		CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
2627 		CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
2628 			RING_INDIRECT_CTX_OFFSET(base), 0);
2629 		if (wa_ctx->indirect_ctx.size) {
2630 			u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
2631 
2632 			regs[CTX_RCS_INDIRECT_CTX + 1] =
2633 				(ggtt_offset + wa_ctx->indirect_ctx.offset) |
2634 				(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
2635 
2636 			regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
2637 				intel_lr_indirect_ctx_offset(engine) << 6;
2638 		}
2639 
2640 		CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
2641 		if (wa_ctx->per_ctx.size) {
2642 			u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
2643 
2644 			regs[CTX_BB_PER_CTX_PTR + 1] =
2645 				(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
2646 		}
2647 	}
2648 
2649 	regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2650 
2651 	CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
2652 	/* PDP values well be assigned later if needed */
2653 	CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0);
2654 	CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0);
2655 	CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0);
2656 	CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0);
2657 	CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0);
2658 	CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0);
2659 	CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0);
2660 	CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0);
2661 
2662 	if (i915_vm_is_4lvl(&ppgtt->vm)) {
2663 		/* 64b PPGTT (48bit canonical)
2664 		 * PDP0_DESCRIPTOR contains the base address to PML4 and
2665 		 * other PDP Descriptors are ignored.
2666 		 */
2667 		ASSIGN_CTX_PML4(ppgtt, regs);
2668 	} else {
2669 		ASSIGN_CTX_PDP(ppgtt, regs, 3);
2670 		ASSIGN_CTX_PDP(ppgtt, regs, 2);
2671 		ASSIGN_CTX_PDP(ppgtt, regs, 1);
2672 		ASSIGN_CTX_PDP(ppgtt, regs, 0);
2673 	}
2674 
2675 	if (rcs) {
2676 		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
2677 		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
2678 
2679 		i915_oa_init_reg_state(engine, ce, regs);
2680 	}
2681 
2682 	regs[CTX_END] = MI_BATCH_BUFFER_END;
2683 	if (INTEL_GEN(engine->i915) >= 10)
2684 		regs[CTX_END] |= BIT(0);
2685 }
2686 
2687 static int
2688 populate_lr_context(struct intel_context *ce,
2689 		    struct drm_i915_gem_object *ctx_obj,
2690 		    struct intel_engine_cs *engine,
2691 		    struct intel_ring *ring)
2692 {
2693 	void *vaddr;
2694 	u32 *regs;
2695 	int ret;
2696 
2697 	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
2698 	if (IS_ERR(vaddr)) {
2699 		ret = PTR_ERR(vaddr);
2700 		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
2701 		return ret;
2702 	}
2703 
2704 	if (engine->default_state) {
2705 		/*
2706 		 * We only want to copy over the template context state;
2707 		 * skipping over the headers reserved for GuC communication,
2708 		 * leaving those as zero.
2709 		 */
2710 		const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
2711 		void *defaults;
2712 
2713 		defaults = i915_gem_object_pin_map(engine->default_state,
2714 						   I915_MAP_WB);
2715 		if (IS_ERR(defaults)) {
2716 			ret = PTR_ERR(defaults);
2717 			goto err_unpin_ctx;
2718 		}
2719 
2720 		memcpy(vaddr + start, defaults + start, engine->context_size);
2721 		i915_gem_object_unpin_map(engine->default_state);
2722 	}
2723 
2724 	/* The second page of the context object contains some fields which must
2725 	 * be set up prior to the first execution. */
2726 	regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
2727 	execlists_init_reg_state(regs, ce, engine, ring);
2728 	if (!engine->default_state)
2729 		regs[CTX_CONTEXT_CONTROL + 1] |=
2730 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
2731 	if (ce->gem_context == engine->i915->preempt_context &&
2732 	    INTEL_GEN(engine->i915) < 11)
2733 		regs[CTX_CONTEXT_CONTROL + 1] |=
2734 			_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2735 					   CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);
2736 
2737 	ret = 0;
2738 err_unpin_ctx:
2739 	__i915_gem_object_flush_map(ctx_obj,
2740 				    LRC_HEADER_PAGES * PAGE_SIZE,
2741 				    engine->context_size);
2742 	i915_gem_object_unpin_map(ctx_obj);
2743 	return ret;
2744 }
2745 
2746 static struct i915_timeline *get_timeline(struct i915_gem_context *ctx)
2747 {
2748 	if (ctx->timeline)
2749 		return i915_timeline_get(ctx->timeline);
2750 	else
2751 		return i915_timeline_create(ctx->i915, NULL);
2752 }
2753 
2754 static int execlists_context_deferred_alloc(struct intel_context *ce,
2755 					    struct intel_engine_cs *engine)
2756 {
2757 	struct drm_i915_gem_object *ctx_obj;
2758 	struct i915_vma *vma;
2759 	u32 context_size;
2760 	struct intel_ring *ring;
2761 	struct i915_timeline *timeline;
2762 	int ret;
2763 
2764 	if (ce->state)
2765 		return 0;
2766 
2767 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
2768 
2769 	/*
2770 	 * Before the actual start of the context image, we insert a few pages
2771 	 * for our own use and for sharing with the GuC.
2772 	 */
2773 	context_size += LRC_HEADER_PAGES * PAGE_SIZE;
2774 
2775 	ctx_obj = i915_gem_object_create(engine->i915, context_size);
2776 	if (IS_ERR(ctx_obj))
2777 		return PTR_ERR(ctx_obj);
2778 
2779 	vma = i915_vma_instance(ctx_obj, &engine->i915->ggtt.vm, NULL);
2780 	if (IS_ERR(vma)) {
2781 		ret = PTR_ERR(vma);
2782 		goto error_deref_obj;
2783 	}
2784 
2785 	timeline = get_timeline(ce->gem_context);
2786 	if (IS_ERR(timeline)) {
2787 		ret = PTR_ERR(timeline);
2788 		goto error_deref_obj;
2789 	}
2790 
2791 	ring = intel_engine_create_ring(engine,
2792 					timeline,
2793 					ce->gem_context->ring_size);
2794 	i915_timeline_put(timeline);
2795 	if (IS_ERR(ring)) {
2796 		ret = PTR_ERR(ring);
2797 		goto error_deref_obj;
2798 	}
2799 
2800 	ret = populate_lr_context(ce, ctx_obj, engine, ring);
2801 	if (ret) {
2802 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
2803 		goto error_ring_free;
2804 	}
2805 
2806 	ce->ring = ring;
2807 	ce->state = vma;
2808 
2809 	return 0;
2810 
2811 error_ring_free:
2812 	intel_ring_put(ring);
2813 error_deref_obj:
2814 	i915_gem_object_put(ctx_obj);
2815 	return ret;
2816 }
2817 
2818 void intel_execlists_show_requests(struct intel_engine_cs *engine,
2819 				   struct drm_printer *m,
2820 				   void (*show_request)(struct drm_printer *m,
2821 							struct i915_request *rq,
2822 							const char *prefix),
2823 				   unsigned int max)
2824 {
2825 	const struct intel_engine_execlists *execlists = &engine->execlists;
2826 	struct i915_request *rq, *last;
2827 	unsigned long flags;
2828 	unsigned int count;
2829 	struct rb_node *rb;
2830 
2831 	spin_lock_irqsave(&engine->timeline.lock, flags);
2832 
2833 	last = NULL;
2834 	count = 0;
2835 	list_for_each_entry(rq, &engine->timeline.requests, link) {
2836 		if (count++ < max - 1)
2837 			show_request(m, rq, "\t\tE ");
2838 		else
2839 			last = rq;
2840 	}
2841 	if (last) {
2842 		if (count > max) {
2843 			drm_printf(m,
2844 				   "\t\t...skipping %d executing requests...\n",
2845 				   count - max);
2846 		}
2847 		show_request(m, last, "\t\tE ");
2848 	}
2849 
2850 	last = NULL;
2851 	count = 0;
2852 	if (execlists->queue_priority_hint != INT_MIN)
2853 		drm_printf(m, "\t\tQueue priority hint: %d\n",
2854 			   execlists->queue_priority_hint);
2855 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
2856 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
2857 		int i;
2858 
2859 		priolist_for_each_request(rq, p, i) {
2860 			if (count++ < max - 1)
2861 				show_request(m, rq, "\t\tQ ");
2862 			else
2863 				last = rq;
2864 		}
2865 	}
2866 	if (last) {
2867 		if (count > max) {
2868 			drm_printf(m,
2869 				   "\t\t...skipping %d queued requests...\n",
2870 				   count - max);
2871 		}
2872 		show_request(m, last, "\t\tQ ");
2873 	}
2874 
2875 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
2876 }
2877 
2878 void intel_lr_context_reset(struct intel_engine_cs *engine,
2879 			    struct intel_context *ce,
2880 			    u32 head,
2881 			    bool scrub)
2882 {
2883 	/*
2884 	 * We want a simple context + ring to execute the breadcrumb update.
2885 	 * We cannot rely on the context being intact across the GPU hang,
2886 	 * so clear it and rebuild just what we need for the breadcrumb.
2887 	 * All pending requests for this context will be zapped, and any
2888 	 * future request will be after userspace has had the opportunity
2889 	 * to recreate its own state.
2890 	 */
2891 	if (scrub) {
2892 		u32 *regs = ce->lrc_reg_state;
2893 
2894 		if (engine->pinned_default_state) {
2895 			memcpy(regs, /* skip restoring the vanilla PPHWSP */
2896 			       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2897 			       engine->context_size - PAGE_SIZE);
2898 		}
2899 		execlists_init_reg_state(regs, ce, engine, ce->ring);
2900 	}
2901 
2902 	/* Rerun the request; its payload has been neutered (if guilty). */
2903 	ce->ring->head = head;
2904 	intel_ring_update_space(ce->ring);
2905 
2906 	__execlists_update_reg_state(ce, engine);
2907 }
2908 
2909 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2910 #include "selftest_lrc.c"
2911 #endif
2912