1 /*
2  * Copyright © 2008-2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *    Zou Nan hai <nanhai.zou@intel.com>
26  *    Xiang Hai hao<haihao.xiang@intel.com>
27  *
28  */
29 
30 #include <linux/log2.h>
31 
32 #include "gem/i915_gem_context.h"
33 
34 #include "gen6_ppgtt.h"
35 #include "i915_drv.h"
36 #include "i915_trace.h"
37 #include "intel_context.h"
38 #include "intel_gt.h"
39 #include "intel_gt_irq.h"
40 #include "intel_gt_pm_irq.h"
41 #include "intel_reset.h"
42 #include "intel_ring.h"
43 #include "intel_workarounds.h"
44 
45 /* Rough estimate of the typical request size, performing a flush,
46  * set-context and then emitting the batch.
47  */
48 #define LEGACY_REQUEST_SIZE 200
49 
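/*
 * Gen2 flush: emit MI_FLUSH (plus MI_READ_FLUSH when invalidating) and,
 * for EMIT_FLUSH, pad with a few dummy stores into the scratch page
 * before a final MI_FLUSH that skips the write flush.
 */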
50 static int
51 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
52 {
53 	unsigned int num_store_dw;
54 	u32 cmd, *cs;
55 
56 	cmd = MI_FLUSH;
57 	num_store_dw = 0;
58 	if (mode & EMIT_INVALIDATE)
59 		cmd |= MI_READ_FLUSH;
60 	if (mode & EMIT_FLUSH)
61 		num_store_dw = 4;
62 
63 	cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
64 	if (IS_ERR(cs))
65 		return PTR_ERR(cs);
66 
67 	*cs++ = cmd;
68 	while (num_store_dw--) {
69 		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
70 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
71 						INTEL_GT_SCRATCH_FIELD_DEFAULT);
72 		*cs++ = 0;
73 	}
74 	*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
75 
76 	intel_ring_advance(rq, cs);
77 
78 	return 0;
79 }
80 
81 static int
82 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
83 {
84 	u32 cmd, *cs;
85 	int i;
86 
87 	/*
88 	 * read/write caches:
89 	 *
90 	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
91 	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
92 	 * also flushed at 2d versus 3d pipeline switches.
93 	 *
94 	 * read-only caches:
95 	 *
96 	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
97 	 * MI_READ_FLUSH is set, and is always flushed on 965.
98 	 *
99 	 * I915_GEM_DOMAIN_COMMAND may not exist?
100 	 *
101 	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
102 	 * invalidated when MI_EXE_FLUSH is set.
103 	 *
104 	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
105 	 * invalidated with every MI_FLUSH.
106 	 *
107 	 * TLBs:
108 	 *
109 	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write, and
111 	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
112 	 * are flushed at any MI_FLUSH.
113 	 */
114 
115 	cmd = MI_FLUSH;
116 	if (mode & EMIT_INVALIDATE) {
117 		cmd |= MI_EXE_FLUSH;
118 		if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5))
119 			cmd |= MI_INVALIDATE_ISP;
120 	}
121 
122 	i = 2;
123 	if (mode & EMIT_INVALIDATE)
124 		i += 20;
125 
126 	cs = intel_ring_begin(rq, i);
127 	if (IS_ERR(cs))
128 		return PTR_ERR(cs);
129 
130 	*cs++ = cmd;
131 
132 	/*
133 	 * A random delay to let the CS invalidate take effect? Without this
134 	 * delay, the GPU relocation path fails as the CS does not see
135 	 * the updated contents. Just as important, if we apply the flushes
136 	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
137 	 * write and before the invalidate on the next batch), the relocations
	 * still fail. This implies that there is a delay following
	 * invalidation that is required to reset the caches, as opposed
	 * to a delay to ensure the memory is written.
141 	 */
142 	if (mode & EMIT_INVALIDATE) {
143 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
144 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
145 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
146 			PIPE_CONTROL_GLOBAL_GTT;
147 		*cs++ = 0;
148 		*cs++ = 0;
149 
150 		for (i = 0; i < 12; i++)
151 			*cs++ = MI_FLUSH;
152 
153 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
154 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
155 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
156 			PIPE_CONTROL_GLOBAL_GTT;
157 		*cs++ = 0;
158 		*cs++ = 0;
159 	}
160 
161 	*cs++ = cmd;
162 
163 	intel_ring_advance(rq, cs);
164 
165 	return 0;
166 }
167 
168 /*
169  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
170  * implementing two workarounds on gen6.  From section 1.4.7.1
171  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
172  *
173  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
174  * produced by non-pipelined state commands), software needs to first
175  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
176  * 0.
177  *
178  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
179  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
180  *
181  * And the workaround for these two requires this workaround first:
182  *
183  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
184  * BEFORE the pipe-control with a post-sync op and no write-cache
185  * flushes.
186  *
187  * And this last workaround is tricky because of the requirements on
188  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
189  * volume 2 part 1:
190  *
191  *     "1 of the following must also be set:
192  *      - Render Target Cache Flush Enable ([12] of DW1)
193  *      - Depth Cache Flush Enable ([0] of DW1)
194  *      - Stall at Pixel Scoreboard ([1] of DW1)
195  *      - Depth Stall ([13] of DW1)
196  *      - Post-Sync Operation ([13] of DW1)
197  *      - Notify Enable ([8] of DW1)"
198  *
199  * The cache flushes require the workaround flush that triggered this
200  * one, so we can't use it.  Depth stall would trigger the same.
201  * Post-sync nonzero is what triggered this second workaround, so we
202  * can't use that one either.  Notify enable is IRQs, which aren't
203  * really our business.  That leaves only stall at scoreboard.
204  */
205 static int
206 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
207 {
208 	u32 scratch_addr =
209 		intel_gt_scratch_offset(rq->engine->gt,
210 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
211 	u32 *cs;
212 
213 	cs = intel_ring_begin(rq, 6);
214 	if (IS_ERR(cs))
215 		return PTR_ERR(cs);
216 
217 	*cs++ = GFX_OP_PIPE_CONTROL(5);
218 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
219 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
220 	*cs++ = 0; /* low dword */
221 	*cs++ = 0; /* high dword */
222 	*cs++ = MI_NOOP;
223 	intel_ring_advance(rq, cs);
224 
225 	cs = intel_ring_begin(rq, 6);
226 	if (IS_ERR(cs))
227 		return PTR_ERR(cs);
228 
229 	*cs++ = GFX_OP_PIPE_CONTROL(5);
230 	*cs++ = PIPE_CONTROL_QW_WRITE;
231 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
232 	*cs++ = 0;
233 	*cs++ = 0;
234 	*cs++ = MI_NOOP;
235 	intel_ring_advance(rq, cs);
236 
237 	return 0;
238 }
239 
240 static int
241 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
242 {
243 	u32 scratch_addr =
244 		intel_gt_scratch_offset(rq->engine->gt,
245 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
246 	u32 *cs, flags = 0;
247 	int ret;
248 
249 	/* Force SNB workarounds for PIPE_CONTROL flushes */
250 	ret = gen6_emit_post_sync_nonzero_flush(rq);
251 	if (ret)
252 		return ret;
253 
254 	/* Just flush everything.  Experiments have shown that reducing the
255 	 * number of bits based on the write domains has little performance
256 	 * impact.
257 	 */
258 	if (mode & EMIT_FLUSH) {
259 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
260 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
261 		/*
262 		 * Ensure that any following seqno writes only happen
263 		 * when the render cache is indeed flushed.
264 		 */
265 		flags |= PIPE_CONTROL_CS_STALL;
266 	}
267 	if (mode & EMIT_INVALIDATE) {
268 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
269 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
270 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
271 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
272 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
273 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
274 		/*
275 		 * TLB invalidate requires a post-sync write.
276 		 */
277 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
278 	}
279 
280 	cs = intel_ring_begin(rq, 4);
281 	if (IS_ERR(cs))
282 		return PTR_ERR(cs);
283 
284 	*cs++ = GFX_OP_PIPE_CONTROL(4);
285 	*cs++ = flags;
286 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
287 	*cs++ = 0;
288 	intel_ring_advance(rq, cs);
289 
290 	return 0;
291 }
292 
293 static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
294 {
295 	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
296 	*cs++ = GFX_OP_PIPE_CONTROL(4);
297 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
298 	*cs++ = 0;
299 	*cs++ = 0;
300 
301 	*cs++ = GFX_OP_PIPE_CONTROL(4);
302 	*cs++ = PIPE_CONTROL_QW_WRITE;
303 	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
304 					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
305 		PIPE_CONTROL_GLOBAL_GTT;
306 	*cs++ = 0;
307 
308 	/* Finally we can flush and with it emit the breadcrumb */
309 	*cs++ = GFX_OP_PIPE_CONTROL(4);
310 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
311 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
312 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
313 		 PIPE_CONTROL_QW_WRITE |
314 		 PIPE_CONTROL_CS_STALL);
315 	*cs++ = i915_request_active_timeline(rq)->hwsp_offset |
316 		PIPE_CONTROL_GLOBAL_GTT;
317 	*cs++ = rq->fence.seqno;
318 
319 	*cs++ = MI_USER_INTERRUPT;
320 	*cs++ = MI_NOOP;
321 
322 	rq->tail = intel_ring_offset(rq, cs);
323 	assert_ring_tail_valid(rq->ring, rq->tail);
324 
325 	return cs;
326 }
327 
328 static int
329 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
330 {
331 	u32 *cs;
332 
333 	cs = intel_ring_begin(rq, 4);
334 	if (IS_ERR(cs))
335 		return PTR_ERR(cs);
336 
337 	*cs++ = GFX_OP_PIPE_CONTROL(4);
338 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
339 	*cs++ = 0;
340 	*cs++ = 0;
341 	intel_ring_advance(rq, cs);
342 
343 	return 0;
344 }
345 
346 static int
347 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
348 {
349 	u32 scratch_addr =
350 		intel_gt_scratch_offset(rq->engine->gt,
351 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
352 	u32 *cs, flags = 0;
353 
354 	/*
355 	 * Ensure that any following seqno writes only happen when the render
356 	 * cache is indeed flushed.
357 	 *
358 	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
359 	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
360 	 * don't try to be clever and just set it unconditionally.
361 	 */
362 	flags |= PIPE_CONTROL_CS_STALL;
363 
364 	/*
365 	 * CS_STALL suggests at least a post-sync write.
366 	 */
367 	flags |= PIPE_CONTROL_QW_WRITE;
368 	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
369 
370 	/* Just flush everything.  Experiments have shown that reducing the
371 	 * number of bits based on the write domains has little performance
372 	 * impact.
373 	 */
374 	if (mode & EMIT_FLUSH) {
375 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
376 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
377 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
378 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
379 	}
380 	if (mode & EMIT_INVALIDATE) {
381 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
382 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
383 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
384 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
385 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
386 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
387 		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
388 
		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
392 		gen7_render_ring_cs_stall_wa(rq);
393 	}
394 
395 	cs = intel_ring_begin(rq, 4);
396 	if (IS_ERR(cs))
397 		return PTR_ERR(cs);
398 
399 	*cs++ = GFX_OP_PIPE_CONTROL(4);
400 	*cs++ = flags;
401 	*cs++ = scratch_addr;
402 	*cs++ = 0;
403 	intel_ring_advance(rq, cs);
404 
405 	return 0;
406 }
407 
408 static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
409 {
410 	*cs++ = GFX_OP_PIPE_CONTROL(4);
411 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
412 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
413 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
414 		 PIPE_CONTROL_FLUSH_ENABLE |
415 		 PIPE_CONTROL_QW_WRITE |
416 		 PIPE_CONTROL_GLOBAL_GTT_IVB |
417 		 PIPE_CONTROL_CS_STALL);
418 	*cs++ = i915_request_active_timeline(rq)->hwsp_offset;
419 	*cs++ = rq->fence.seqno;
420 
421 	*cs++ = MI_USER_INTERRUPT;
422 	*cs++ = MI_NOOP;
423 
424 	rq->tail = intel_ring_offset(rq, cs);
425 	assert_ring_tail_valid(rq->ring, rq->tail);
426 
427 	return cs;
428 }
429 
430 static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
431 {
432 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
433 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
434 
435 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
436 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
437 	*cs++ = rq->fence.seqno;
438 
439 	*cs++ = MI_USER_INTERRUPT;
440 
441 	rq->tail = intel_ring_offset(rq, cs);
442 	assert_ring_tail_valid(rq->ring, rq->tail);
443 
444 	return cs;
445 }
446 
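/*
 * Gen7 xcs w/a: chase the seqno write with a burst of repeated
 * MI_STORE_DWORD_INDEX writes and a final MI_FLUSH_DW, presumably to
 * ensure the breadcrumb has landed in the HWSP before the interrupt.
 */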
447 #define GEN7_XCS_WA 32
448 static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
449 {
450 	int i;
451 
452 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
453 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
454 
455 	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
456 		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
457 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
458 	*cs++ = rq->fence.seqno;
459 
460 	for (i = 0; i < GEN7_XCS_WA; i++) {
461 		*cs++ = MI_STORE_DWORD_INDEX;
462 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
463 		*cs++ = rq->fence.seqno;
464 	}
465 
466 	*cs++ = MI_FLUSH_DW;
467 	*cs++ = 0;
468 	*cs++ = 0;
469 
470 	*cs++ = MI_USER_INTERRUPT;
471 	*cs++ = MI_NOOP;
472 
473 	rq->tail = intel_ring_offset(rq, cs);
474 	assert_ring_tail_valid(rq->ring, rq->tail);
475 
476 	return cs;
477 }
478 #undef GEN7_XCS_WA
479 
480 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
481 {
482 	/*
483 	 * Keep the render interrupt unmasked as this papers over
484 	 * lost interrupts following a reset.
485 	 */
486 	if (engine->class == RENDER_CLASS) {
487 		if (INTEL_GEN(engine->i915) >= 6)
488 			mask &= ~BIT(0);
489 		else
490 			mask &= ~I915_USER_INTERRUPT;
491 	}
492 
493 	intel_engine_set_hwsp_writemask(engine, mask);
494 }
495 
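/*
 * Program the physical address of the hardware status page; gen4+ also
 * carries bits 35:32 of the address in bits 7:4 of HWS_PGA.
 */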
496 static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
497 {
498 	u32 addr;
499 
500 	addr = lower_32_bits(phys);
501 	if (INTEL_GEN(engine->i915) >= 4)
502 		addr |= (phys >> 28) & 0xf0;
503 
504 	intel_uncore_write(engine->uncore, HWS_PGA, addr);
505 }
506 
507 static struct page *status_page(struct intel_engine_cs *engine)
508 {
509 	struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
510 
511 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
512 	return sg_page(obj->mm.pages->sgl);
513 }
514 
515 static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
516 {
517 	set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
518 	set_hwstam(engine, ~0u);
519 }
520 
521 static void set_hwsp(struct intel_engine_cs *engine, u32 offset)
522 {
523 	i915_reg_t hwsp;
524 
525 	/*
526 	 * The ring status page addresses are no longer next to the rest of
527 	 * the ring registers as of gen7.
528 	 */
529 	if (IS_GEN(engine->i915, 7)) {
530 		switch (engine->id) {
531 		/*
		 * No other engines exist on Gen7. The default case exists
		 * only to silence gcc's switch-check warning.
534 		 */
535 		default:
536 			GEM_BUG_ON(engine->id);
537 			/* fallthrough */
538 		case RCS0:
539 			hwsp = RENDER_HWS_PGA_GEN7;
540 			break;
541 		case BCS0:
542 			hwsp = BLT_HWS_PGA_GEN7;
543 			break;
544 		case VCS0:
545 			hwsp = BSD_HWS_PGA_GEN7;
546 			break;
547 		case VECS0:
548 			hwsp = VEBOX_HWS_PGA_GEN7;
549 			break;
550 		}
551 	} else if (IS_GEN(engine->i915, 6)) {
552 		hwsp = RING_HWS_PGA_GEN6(engine->mmio_base);
553 	} else {
554 		hwsp = RING_HWS_PGA(engine->mmio_base);
555 	}
556 
557 	intel_uncore_write(engine->uncore, hwsp, offset);
558 	intel_uncore_posting_read(engine->uncore, hwsp);
559 }
560 
561 static void flush_cs_tlb(struct intel_engine_cs *engine)
562 {
563 	struct drm_i915_private *dev_priv = engine->i915;
564 
565 	if (!IS_GEN_RANGE(dev_priv, 6, 7))
566 		return;
567 
	/* The ring should be idle before issuing a sync flush */
569 	drm_WARN_ON(&dev_priv->drm,
570 		    (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);
571 
572 	ENGINE_WRITE(engine, RING_INSTPM,
573 		     _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE |
574 					INSTPM_SYNC_FLUSH));
575 	if (intel_wait_for_register(engine->uncore,
576 				    RING_INSTPM(engine->mmio_base),
577 				    INSTPM_SYNC_FLUSH, 0,
578 				    1000))
579 		DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n",
580 			  engine->name);
581 }
582 
583 static void ring_setup_status_page(struct intel_engine_cs *engine)
584 {
585 	set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
586 	set_hwstam(engine, ~0u);
587 
588 	flush_cs_tlb(engine);
589 }
590 
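/*
 * Ask the engine to stop (STOP_RING), wait for it to idle, then clear
 * HEAD, TAIL and CTL so the ring is empty and disabled. Returns true if
 * HEAD reads back as zero.
 */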
591 static bool stop_ring(struct intel_engine_cs *engine)
592 {
593 	struct drm_i915_private *dev_priv = engine->i915;
594 
595 	if (INTEL_GEN(dev_priv) > 2) {
596 		ENGINE_WRITE(engine,
597 			     RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING));
598 		if (intel_wait_for_register(engine->uncore,
599 					    RING_MI_MODE(engine->mmio_base),
600 					    MODE_IDLE,
601 					    MODE_IDLE,
602 					    1000)) {
			DRM_ERROR("%s: timed out trying to stop ring\n",
604 				  engine->name);
605 
606 			/*
607 			 * Sometimes we observe that the idle flag is not
608 			 * set even though the ring is empty. So double
609 			 * check before giving up.
610 			 */
611 			if (ENGINE_READ(engine, RING_HEAD) !=
612 			    ENGINE_READ(engine, RING_TAIL))
613 				return false;
614 		}
615 	}
616 
617 	ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL));
618 
619 	ENGINE_WRITE(engine, RING_HEAD, 0);
620 	ENGINE_WRITE(engine, RING_TAIL, 0);
621 
622 	/* The ring must be empty before it is disabled */
623 	ENGINE_WRITE(engine, RING_CTL, 0);
624 
625 	return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0;
626 }
627 
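/*
 * Substitute the aliasing ppgtt for the global GTT so that callers
 * program the per-process page directory; a full ppgtt is returned
 * unchanged.
 */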
628 static struct i915_address_space *vm_alias(struct i915_address_space *vm)
629 {
630 	if (i915_is_ggtt(vm))
631 		vm = &i915_vm_to_ggtt(vm)->alias->vm;
632 
633 	return vm;
634 }
635 
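/* Point the ring's PP_DIR registers at the (aliasing) ppgtt page directory. */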
636 static void set_pp_dir(struct intel_engine_cs *engine)
637 {
638 	struct i915_address_space *vm = vm_alias(engine->gt->vm);
639 
640 	if (vm) {
641 		struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
642 
643 		ENGINE_WRITE(engine, RING_PP_DIR_DCLV, PP_DIR_DCLV_2G);
644 		ENGINE_WRITE(engine, RING_PP_DIR_BASE,
645 			     px_base(ppgtt->pd)->ggtt_offset << 10);
646 	}
647 }
648 
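/*
 * Bring the ring back up after reset or suspend: stop and drain it,
 * reprogram the status page, restore HEAD/TAIL/START, re-enable RING_CTL
 * and finally kick the tail if requests are already queued.
 */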
649 static int xcs_resume(struct intel_engine_cs *engine)
650 {
651 	struct drm_i915_private *dev_priv = engine->i915;
652 	struct intel_ring *ring = engine->legacy.ring;
653 	int ret = 0;
654 
655 	ENGINE_TRACE(engine, "ring:{HEAD:%04x, TAIL:%04x}\n",
656 		     ring->head, ring->tail);
657 
658 	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
659 
660 	/* WaClearRingBufHeadRegAtInit:ctg,elk */
661 	if (!stop_ring(engine)) {
662 		/* G45 ring initialization often fails to reset head to zero */
663 		DRM_DEBUG_DRIVER("%s head not reset to zero "
664 				"ctl %08x head %08x tail %08x start %08x\n",
665 				engine->name,
666 				ENGINE_READ(engine, RING_CTL),
667 				ENGINE_READ(engine, RING_HEAD),
668 				ENGINE_READ(engine, RING_TAIL),
669 				ENGINE_READ(engine, RING_START));
670 
671 		if (!stop_ring(engine)) {
672 			DRM_ERROR("failed to set %s head to zero "
673 				  "ctl %08x head %08x tail %08x start %08x\n",
674 				  engine->name,
675 				  ENGINE_READ(engine, RING_CTL),
676 				  ENGINE_READ(engine, RING_HEAD),
677 				  ENGINE_READ(engine, RING_TAIL),
678 				  ENGINE_READ(engine, RING_START));
679 			ret = -EIO;
680 			goto out;
681 		}
682 	}
683 
684 	if (HWS_NEEDS_PHYSICAL(dev_priv))
685 		ring_setup_phys_status_page(engine);
686 	else
687 		ring_setup_status_page(engine);
688 
689 	intel_engine_reset_breadcrumbs(engine);
690 
691 	/* Enforce ordering by reading HEAD register back */
692 	ENGINE_POSTING_READ(engine, RING_HEAD);
693 
694 	/*
695 	 * Initialize the ring. This must happen _after_ we've cleared the ring
696 	 * registers with the above sequence (the readback of the HEAD registers
697 	 * also enforces ordering), otherwise the hw might lose the new ring
698 	 * register values.
699 	 */
700 	ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma));
701 
702 	/* Check that the ring offsets point within the ring! */
703 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
704 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
705 	intel_ring_update_space(ring);
706 
707 	set_pp_dir(engine);
708 
709 	/* First wake the ring up to an empty/idle ring */
710 	ENGINE_WRITE(engine, RING_HEAD, ring->head);
711 	ENGINE_WRITE(engine, RING_TAIL, ring->head);
712 	ENGINE_POSTING_READ(engine, RING_TAIL);
713 
714 	ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID);
715 
	/* If the ring does not report itself as valid (started), it is dead */
717 	if (intel_wait_for_register(engine->uncore,
718 				    RING_CTL(engine->mmio_base),
719 				    RING_VALID, RING_VALID,
720 				    50)) {
721 		DRM_ERROR("%s initialization failed "
722 			  "ctl %08x (valid? %d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n",
723 			  engine->name,
724 			  ENGINE_READ(engine, RING_CTL),
725 			  ENGINE_READ(engine, RING_CTL) & RING_VALID,
726 			  ENGINE_READ(engine, RING_HEAD), ring->head,
727 			  ENGINE_READ(engine, RING_TAIL), ring->tail,
728 			  ENGINE_READ(engine, RING_START),
729 			  i915_ggtt_offset(ring->vma));
730 		ret = -EIO;
731 		goto out;
732 	}
733 
734 	if (INTEL_GEN(dev_priv) > 2)
735 		ENGINE_WRITE(engine,
736 			     RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
737 
738 	/* Now awake, let it get started */
739 	if (ring->tail != ring->head) {
740 		ENGINE_WRITE(engine, RING_TAIL, ring->tail);
741 		ENGINE_POSTING_READ(engine, RING_TAIL);
742 	}
743 
744 	/* Papering over lost _interrupts_ immediately following the restart */
745 	intel_engine_signal_breadcrumbs(engine);
746 out:
747 	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
748 
749 	return ret;
750 }
751 
752 static void reset_prepare(struct intel_engine_cs *engine)
753 {
754 	struct intel_uncore *uncore = engine->uncore;
755 	const u32 base = engine->mmio_base;
756 
757 	/*
	 * We stop the engines, otherwise we might get a failed reset and a
	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer a
	 * system hang if a batchbuffer is in progress when the reset is
	 * issued, regardless of the READY_TO_RESET ack. Thus we assume it
	 * is best to stop the engines on all gens where we have a gpu
	 * reset.
764 	 *
765 	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
766 	 *
767 	 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
768 	 *
769 	 * FIXME: Wa for more modern gens needs to be validated
770 	 */
771 	ENGINE_TRACE(engine, "\n");
772 
773 	if (intel_engine_stop_cs(engine))
774 		ENGINE_TRACE(engine, "timed out on STOP_RING\n");
775 
776 	intel_uncore_write_fw(uncore,
777 			      RING_HEAD(base),
778 			      intel_uncore_read_fw(uncore, RING_TAIL(base)));
779 	intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
780 
781 	intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
782 	intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
783 	intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
784 
785 	/* The ring must be empty before it is disabled */
786 	intel_uncore_write_fw(uncore, RING_CTL(base), 0);
787 
788 	/* Check acts as a post */
789 	if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
790 		ENGINE_TRACE(engine, "ring head [%x] not parked\n",
791 			     intel_uncore_read_fw(uncore, RING_HEAD(base)));
792 }
793 
794 static void reset_rewind(struct intel_engine_cs *engine, bool stalled)
795 {
796 	struct i915_request *pos, *rq;
797 	unsigned long flags;
798 	u32 head;
799 
800 	rq = NULL;
801 	spin_lock_irqsave(&engine->active.lock, flags);
802 	list_for_each_entry(pos, &engine->active.requests, sched.link) {
803 		if (!i915_request_completed(pos)) {
804 			rq = pos;
805 			break;
806 		}
807 	}
808 
809 	/*
810 	 * The guilty request will get skipped on a hung engine.
811 	 *
812 	 * Users of client default contexts do not rely on logical
813 	 * state preserved between batches so it is safe to execute
814 	 * queued requests following the hang. Non default contexts
815 	 * rely on preserved state, so skipping a batch loses the
816 	 * evolution of the state and it needs to be considered corrupted.
	 * Executing more queued batches on top of corrupted state is
	 * risky, but we take that risk and try to advance through the
	 * queued requests in order to make the client behaviour more
	 * predictable around resets, by not throwing away a random
	 * amount of batches it has prepared for execution. Sophisticated
	 * clients can use gem_reset_stats_ioctl and dma-fence status
	 * (exported via the sync_file info ioctl on explicit fences) to
	 * observe when they lose the context state and should rebuild
	 * accordingly.
825 	 *
826 	 * The context ban, and ultimately the client ban, mechanism are safety
827 	 * valves if client submission ends up resulting in nothing more than
828 	 * subsequent hangs.
829 	 */
830 
831 	if (rq) {
832 		/*
833 		 * Try to restore the logical GPU state to match the
834 		 * continuation of the request queue. If we skip the
835 		 * context/PD restore, then the next request may try to execute
836 		 * assuming that its context is valid and loaded on the GPU and
837 		 * so may try to access invalid memory, prompting repeated GPU
838 		 * hangs.
839 		 *
840 		 * If the request was guilty, we still restore the logical
841 		 * state in case the next request requires it (e.g. the
842 		 * aliasing ppgtt), but skip over the hung batch.
843 		 *
844 		 * If the request was innocent, we try to replay the request
845 		 * with the restored context.
846 		 */
847 		__i915_request_reset(rq, stalled);
848 
849 		GEM_BUG_ON(rq->ring != engine->legacy.ring);
850 		head = rq->head;
851 	} else {
852 		head = engine->legacy.ring->tail;
853 	}
854 	engine->legacy.ring->head = intel_ring_wrap(engine->legacy.ring, head);
855 
856 	spin_unlock_irqrestore(&engine->active.lock, flags);
857 }
858 
859 static void reset_finish(struct intel_engine_cs *engine)
860 {
861 }
862 
863 static int rcs_resume(struct intel_engine_cs *engine)
864 {
865 	struct drm_i915_private *i915 = engine->i915;
866 	struct intel_uncore *uncore = engine->uncore;
867 
868 	/*
	 * Disable CONSTANT_BUFFER before it is loaded from the context
	 * image. As soon as it is loaded, it is executed and the stored
	 * address may no longer be valid, leading to a GPU hang.
872 	 *
873 	 * This imposes the requirement that userspace reload their
874 	 * CONSTANT_BUFFER on every batch, fortunately a requirement
875 	 * they are already accustomed to from before contexts were
876 	 * enabled.
877 	 */
878 	if (IS_GEN(i915, 4))
879 		intel_uncore_write(uncore, ECOSKPD,
880 			   _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE));
881 
882 	if (IS_GEN_RANGE(i915, 6, 7))
883 		intel_uncore_write(uncore, INSTPM,
884 				   _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
885 
886 	return xcs_resume(engine);
887 }
888 
889 static void reset_cancel(struct intel_engine_cs *engine)
890 {
891 	struct i915_request *request;
892 	unsigned long flags;
893 
894 	spin_lock_irqsave(&engine->active.lock, flags);
895 
896 	/* Mark all submitted requests as skipped. */
897 	list_for_each_entry(request, &engine->active.requests, sched.link) {
898 		i915_request_set_error_once(request, -EIO);
899 		i915_request_mark_complete(request);
900 	}
901 
902 	/* Remaining _unready_ requests will be nop'ed when submitted */
903 
904 	spin_unlock_irqrestore(&engine->active.lock, flags);
905 }
906 
907 static void i9xx_submit_request(struct i915_request *request)
908 {
909 	i915_request_submit(request);
910 	wmb(); /* paranoid flush writes out of the WCB before mmio */
911 
912 	ENGINE_WRITE(request->engine, RING_TAIL,
913 		     intel_ring_set_tail(request->ring, request->tail));
914 }
915 
916 static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
917 {
918 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
919 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
920 
921 	*cs++ = MI_FLUSH;
922 
923 	*cs++ = MI_STORE_DWORD_INDEX;
924 	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
925 	*cs++ = rq->fence.seqno;
926 
927 	*cs++ = MI_USER_INTERRUPT;
928 	*cs++ = MI_NOOP;
929 
930 	rq->tail = intel_ring_offset(rq, cs);
931 	assert_ring_tail_valid(rq->ring, rq->tail);
932 
933 	return cs;
934 }
935 
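/*
 * Ilk w/a: repeat the seqno store several times, presumably to paper over
 * an occasional missed breadcrumb write before the user interrupt.
 */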
936 #define GEN5_WA_STORES 8 /* must be at least 1! */
937 static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
938 {
939 	int i;
940 
941 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
942 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
943 
944 	*cs++ = MI_FLUSH;
945 
946 	BUILD_BUG_ON(GEN5_WA_STORES < 1);
947 	for (i = 0; i < GEN5_WA_STORES; i++) {
948 		*cs++ = MI_STORE_DWORD_INDEX;
949 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
950 		*cs++ = rq->fence.seqno;
951 	}
952 
953 	*cs++ = MI_USER_INTERRUPT;
954 
955 	rq->tail = intel_ring_offset(rq, cs);
956 	assert_ring_tail_valid(rq->ring, rq->tail);
957 
958 	return cs;
959 }
960 #undef GEN5_WA_STORES
961 
962 static void
963 gen5_irq_enable(struct intel_engine_cs *engine)
964 {
965 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
966 }
967 
968 static void
969 gen5_irq_disable(struct intel_engine_cs *engine)
970 {
971 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
972 }
973 
974 static void
975 i9xx_irq_enable(struct intel_engine_cs *engine)
976 {
977 	engine->i915->irq_mask &= ~engine->irq_enable_mask;
978 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
979 	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
980 }
981 
982 static void
983 i9xx_irq_disable(struct intel_engine_cs *engine)
984 {
985 	engine->i915->irq_mask |= engine->irq_enable_mask;
986 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
987 }
988 
989 static void
990 i8xx_irq_enable(struct intel_engine_cs *engine)
991 {
992 	struct drm_i915_private *i915 = engine->i915;
993 
994 	i915->irq_mask &= ~engine->irq_enable_mask;
995 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
996 	ENGINE_POSTING_READ16(engine, RING_IMR);
997 }
998 
999 static void
1000 i8xx_irq_disable(struct intel_engine_cs *engine)
1001 {
1002 	struct drm_i915_private *i915 = engine->i915;
1003 
1004 	i915->irq_mask |= engine->irq_enable_mask;
1005 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
1006 }
1007 
1008 static int
1009 bsd_ring_flush(struct i915_request *rq, u32 mode)
1010 {
1011 	u32 *cs;
1012 
1013 	cs = intel_ring_begin(rq, 2);
1014 	if (IS_ERR(cs))
1015 		return PTR_ERR(cs);
1016 
1017 	*cs++ = MI_FLUSH;
1018 	*cs++ = MI_NOOP;
1019 	intel_ring_advance(rq, cs);
1020 	return 0;
1021 }
1022 
1023 static void
1024 gen6_irq_enable(struct intel_engine_cs *engine)
1025 {
1026 	ENGINE_WRITE(engine, RING_IMR,
1027 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
1028 
1029 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1030 	ENGINE_POSTING_READ(engine, RING_IMR);
1031 
1032 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
1033 }
1034 
1035 static void
1036 gen6_irq_disable(struct intel_engine_cs *engine)
1037 {
1038 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
1039 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
1040 }
1041 
1042 static void
1043 hsw_vebox_irq_enable(struct intel_engine_cs *engine)
1044 {
1045 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
1046 
1047 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
1048 	ENGINE_POSTING_READ(engine, RING_IMR);
1049 
1050 	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
1051 }
1052 
1053 static void
1054 hsw_vebox_irq_disable(struct intel_engine_cs *engine)
1055 {
1056 	ENGINE_WRITE(engine, RING_IMR, ~0);
1057 	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
1058 }
1059 
1060 static int
1061 i965_emit_bb_start(struct i915_request *rq,
1062 		   u64 offset, u32 length,
1063 		   unsigned int dispatch_flags)
1064 {
1065 	u32 *cs;
1066 
1067 	cs = intel_ring_begin(rq, 2);
1068 	if (IS_ERR(cs))
1069 		return PTR_ERR(cs);
1070 
1071 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags &
1072 		I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965);
1073 	*cs++ = offset;
1074 	intel_ring_advance(rq, cs);
1075 
1076 	return 0;
1077 }
1078 
/* Just a userspace ABI convention to limit the wa batch bo to a reasonable size */
1080 #define I830_BATCH_LIMIT SZ_256K
1081 #define I830_TLB_ENTRIES (2)
1082 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
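
/*
 * i830/i845: the CS has a TLB invalidation bug, so unpinned batches are
 * first copied by the blitter into a stable scratch area (after evicting
 * the stale TLB entries) and executed from there.
 */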
1083 static int
1084 i830_emit_bb_start(struct i915_request *rq,
1085 		   u64 offset, u32 len,
1086 		   unsigned int dispatch_flags)
1087 {
1088 	u32 *cs, cs_offset =
1089 		intel_gt_scratch_offset(rq->engine->gt,
1090 					INTEL_GT_SCRATCH_FIELD_DEFAULT);
1091 
1092 	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);
1093 
1094 	cs = intel_ring_begin(rq, 6);
1095 	if (IS_ERR(cs))
1096 		return PTR_ERR(cs);
1097 
1098 	/* Evict the invalid PTE TLBs */
1099 	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
1100 	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
1101 	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
1102 	*cs++ = cs_offset;
1103 	*cs++ = 0xdeadbeef;
1104 	*cs++ = MI_NOOP;
1105 	intel_ring_advance(rq, cs);
1106 
1107 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
1108 		if (len > I830_BATCH_LIMIT)
1109 			return -ENOSPC;
1110 
1111 		cs = intel_ring_begin(rq, 6 + 2);
1112 		if (IS_ERR(cs))
1113 			return PTR_ERR(cs);
1114 
		/* Blit the batch (which now has all relocs applied) to the
1116 		 * stable batch scratch bo area (so that the CS never
1117 		 * stumbles over its tlb invalidation bug) ...
1118 		 */
1119 		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
1120 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
1121 		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
1122 		*cs++ = cs_offset;
1123 		*cs++ = 4096;
1124 		*cs++ = offset;
1125 
1126 		*cs++ = MI_FLUSH;
1127 		*cs++ = MI_NOOP;
1128 		intel_ring_advance(rq, cs);
1129 
1130 		/* ... and execute it. */
1131 		offset = cs_offset;
1132 	}
1133 
1134 	cs = intel_ring_begin(rq, 2);
1135 	if (IS_ERR(cs))
1136 		return PTR_ERR(cs);
1137 
1138 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1139 	*cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1140 		MI_BATCH_NON_SECURE);
1141 	intel_ring_advance(rq, cs);
1142 
1143 	return 0;
1144 }
1145 
1146 static int
1147 i915_emit_bb_start(struct i915_request *rq,
1148 		   u64 offset, u32 len,
1149 		   unsigned int dispatch_flags)
1150 {
1151 	u32 *cs;
1152 
1153 	cs = intel_ring_begin(rq, 2);
1154 	if (IS_ERR(cs))
1155 		return PTR_ERR(cs);
1156 
1157 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1158 	*cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 :
1159 		MI_BATCH_NON_SECURE);
1160 	intel_ring_advance(rq, cs);
1161 
1162 	return 0;
1163 }
1164 
1165 static void __ring_context_fini(struct intel_context *ce)
1166 {
1167 	i915_vma_put(ce->state);
1168 }
1169 
1170 static void ring_context_destroy(struct kref *ref)
1171 {
1172 	struct intel_context *ce = container_of(ref, typeof(*ce), ref);
1173 
1174 	GEM_BUG_ON(intel_context_is_pinned(ce));
1175 
1176 	if (ce->state)
1177 		__ring_context_fini(ce);
1178 
1179 	intel_context_fini(ce);
1180 	intel_context_free(ce);
1181 }
1182 
1183 static int __context_pin_ppgtt(struct intel_context *ce)
1184 {
1185 	struct i915_address_space *vm;
1186 	int err = 0;
1187 
1188 	vm = vm_alias(ce->vm);
1189 	if (vm)
1190 		err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm)));
1191 
1192 	return err;
1193 }
1194 
1195 static void __context_unpin_ppgtt(struct intel_context *ce)
1196 {
1197 	struct i915_address_space *vm;
1198 
1199 	vm = vm_alias(ce->vm);
1200 	if (vm)
1201 		gen6_ppgtt_unpin(i915_vm_to_ppgtt(vm));
1202 }
1203 
1204 static void ring_context_unpin(struct intel_context *ce)
1205 {
1206 	__context_unpin_ppgtt(ce);
1207 }
1208 
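/*
 * Allocate the backing object for the legacy context image, seed it from
 * the engine's default state when available, and wrap it in a GGTT vma.
 */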
1209 static struct i915_vma *
1210 alloc_context_vma(struct intel_engine_cs *engine)
1211 {
1212 	struct drm_i915_private *i915 = engine->i915;
1213 	struct drm_i915_gem_object *obj;
1214 	struct i915_vma *vma;
1215 	int err;
1216 
1217 	obj = i915_gem_object_create_shmem(i915, engine->context_size);
1218 	if (IS_ERR(obj))
1219 		return ERR_CAST(obj);
1220 
1221 	/*
1222 	 * Try to make the context utilize L3 as well as LLC.
1223 	 *
1224 	 * On VLV we don't have L3 controls in the PTEs so we
1225 	 * shouldn't touch the cache level, especially as that
1226 	 * would make the object snooped which might have a
1227 	 * negative performance impact.
1228 	 *
1229 	 * Snooping is required on non-llc platforms in execlist
1230 	 * mode, but since all GGTT accesses use PAT entry 0 we
1231 	 * get snooping anyway regardless of cache_level.
1232 	 *
1233 	 * This is only applicable for Ivy Bridge devices since
1234 	 * later platforms don't have L3 control bits in the PTE.
1235 	 */
1236 	if (IS_IVYBRIDGE(i915))
1237 		i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC);
1238 
1239 	if (engine->default_state) {
1240 		void *defaults, *vaddr;
1241 
1242 		vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
1243 		if (IS_ERR(vaddr)) {
1244 			err = PTR_ERR(vaddr);
1245 			goto err_obj;
1246 		}
1247 
1248 		defaults = i915_gem_object_pin_map(engine->default_state,
1249 						   I915_MAP_WB);
1250 		if (IS_ERR(defaults)) {
1251 			err = PTR_ERR(defaults);
1252 			goto err_map;
1253 		}
1254 
1255 		memcpy(vaddr, defaults, engine->context_size);
1256 		i915_gem_object_unpin_map(engine->default_state);
1257 
1258 		i915_gem_object_flush_map(obj);
1259 		i915_gem_object_unpin_map(obj);
1260 	}
1261 
1262 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1263 	if (IS_ERR(vma)) {
1264 		err = PTR_ERR(vma);
1265 		goto err_obj;
1266 	}
1267 
1268 	return vma;
1269 
1270 err_map:
1271 	i915_gem_object_unpin_map(obj);
1272 err_obj:
1273 	i915_gem_object_put(obj);
1274 	return ERR_PTR(err);
1275 }
1276 
1277 static int ring_context_alloc(struct intel_context *ce)
1278 {
1279 	struct intel_engine_cs *engine = ce->engine;
1280 
1281 	/* One ringbuffer to rule them all */
1282 	GEM_BUG_ON(!engine->legacy.ring);
1283 	ce->ring = engine->legacy.ring;
1284 	ce->timeline = intel_timeline_get(engine->legacy.timeline);
1285 
1286 	GEM_BUG_ON(ce->state);
1287 	if (engine->context_size) {
1288 		struct i915_vma *vma;
1289 
1290 		vma = alloc_context_vma(engine);
1291 		if (IS_ERR(vma))
1292 			return PTR_ERR(vma);
1293 
1294 		ce->state = vma;
1295 		if (engine->default_state)
1296 			__set_bit(CONTEXT_VALID_BIT, &ce->flags);
1297 	}
1298 
1299 	return 0;
1300 }
1301 
1302 static int ring_context_pin(struct intel_context *ce)
1303 {
1304 	return __context_pin_ppgtt(ce);
1305 }
1306 
1307 static void ring_context_reset(struct intel_context *ce)
1308 {
1309 	intel_ring_reset(ce->ring, ce->ring->emit);
1310 }
1311 
1312 static const struct intel_context_ops ring_context_ops = {
1313 	.alloc = ring_context_alloc,
1314 
1315 	.pin = ring_context_pin,
1316 	.unpin = ring_context_unpin,
1317 
1318 	.enter = intel_context_enter_engine,
1319 	.exit = intel_context_exit_engine,
1320 
1321 	.reset = ring_context_reset,
1322 	.destroy = ring_context_destroy,
1323 };
1324 
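/*
 * Reload the ppgtt page-directory registers from the ring, read
 * PP_DIR_BASE back into scratch to be sure the write has landed, then
 * force a TLB invalidate via INSTPM.
 */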
1325 static int load_pd_dir(struct i915_request *rq,
1326 		       const struct i915_ppgtt *ppgtt,
1327 		       u32 valid)
1328 {
1329 	const struct intel_engine_cs * const engine = rq->engine;
1330 	u32 *cs;
1331 
1332 	cs = intel_ring_begin(rq, 12);
1333 	if (IS_ERR(cs))
1334 		return PTR_ERR(cs);
1335 
1336 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1337 	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
1338 	*cs++ = valid;
1339 
1340 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1341 	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1342 	*cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
1343 
1344 	/* Stall until the page table load is complete? */
1345 	*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1346 	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
1347 	*cs++ = intel_gt_scratch_offset(engine->gt,
1348 					INTEL_GT_SCRATCH_FIELD_DEFAULT);
1349 
1350 	*cs++ = MI_LOAD_REGISTER_IMM(1);
1351 	*cs++ = i915_mmio_reg_offset(RING_INSTPM(engine->mmio_base));
1352 	*cs++ = _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE);
1353 
1354 	intel_ring_advance(rq, cs);
1355 
1356 	return rq->engine->emit_flush(rq, EMIT_FLUSH);
1357 }
1358 
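/*
 * Emit MI_SET_CONTEXT for the target context, bracketed by the per-gen
 * workarounds (arbitration and PSMI sleep-message toggles on gen7,
 * MI_SUSPEND_FLUSH on ilk), optionally forcing a reload by bouncing
 * through the kernel context first.
 */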
1359 static inline int mi_set_context(struct i915_request *rq,
1360 				 struct intel_context *ce,
1361 				 u32 flags)
1362 {
1363 	struct drm_i915_private *i915 = rq->i915;
1364 	struct intel_engine_cs *engine = rq->engine;
1365 	enum intel_engine_id id;
1366 	const int num_engines =
1367 		IS_HASWELL(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0;
1368 	bool force_restore = false;
1369 	int len;
1370 	u32 *cs;
1371 
1372 	len = 4;
1373 	if (IS_GEN(i915, 7))
1374 		len += 2 + (num_engines ? 4 * num_engines + 6 : 0);
1375 	else if (IS_GEN(i915, 5))
1376 		len += 2;
1377 	if (flags & MI_FORCE_RESTORE) {
1378 		GEM_BUG_ON(flags & MI_RESTORE_INHIBIT);
1379 		flags &= ~MI_FORCE_RESTORE;
1380 		force_restore = true;
1381 		len += 2;
1382 	}
1383 
1384 	cs = intel_ring_begin(rq, len);
1385 	if (IS_ERR(cs))
1386 		return PTR_ERR(cs);
1387 
1388 	/* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
1389 	if (IS_GEN(i915, 7)) {
1390 		*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1391 		if (num_engines) {
1392 			struct intel_engine_cs *signaller;
1393 
1394 			*cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1395 			for_each_engine(signaller, engine->gt, id) {
1396 				if (signaller == engine)
1397 					continue;
1398 
1399 				*cs++ = i915_mmio_reg_offset(
1400 					   RING_PSMI_CTL(signaller->mmio_base));
1401 				*cs++ = _MASKED_BIT_ENABLE(
1402 						GEN6_PSMI_SLEEP_MSG_DISABLE);
1403 			}
1404 		}
1405 	} else if (IS_GEN(i915, 5)) {
1406 		/*
1407 		 * This w/a is only listed for pre-production ilk a/b steppings,
1408 		 * but is also mentioned for programming the powerctx. To be
1409 		 * safe, just apply the workaround; we do not use SyncFlush so
1410 		 * this should never take effect and so be a no-op!
1411 		 */
1412 		*cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN;
1413 	}
1414 
1415 	if (force_restore) {
1416 		/*
1417 		 * The HW doesn't handle being told to restore the current
		 * context very well. Quite often it likes to go off and
		 * sulk, especially when it is meant to be reloading PP_DIR.
		 * A simple fix to force the reload is to switch away from
		 * the current context and back again.
1422 		 *
1423 		 * Note that the kernel_context will contain random state
		 * following the MI_RESTORE_INHIBIT. We accept this since we
1425 		 * never use the kernel_context state; it is merely a
1426 		 * placeholder we use to flush other contexts.
1427 		 */
1428 		*cs++ = MI_SET_CONTEXT;
1429 		*cs++ = i915_ggtt_offset(engine->kernel_context->state) |
1430 			MI_MM_SPACE_GTT |
1431 			MI_RESTORE_INHIBIT;
1432 	}
1433 
1434 	*cs++ = MI_NOOP;
1435 	*cs++ = MI_SET_CONTEXT;
1436 	*cs++ = i915_ggtt_offset(ce->state) | flags;
1437 	/*
1438 	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
1439 	 * WaMiSetContext_Hang:snb,ivb,vlv
1440 	 */
1441 	*cs++ = MI_NOOP;
1442 
1443 	if (IS_GEN(i915, 7)) {
1444 		if (num_engines) {
1445 			struct intel_engine_cs *signaller;
1446 			i915_reg_t last_reg = {}; /* keep gcc quiet */
1447 
1448 			*cs++ = MI_LOAD_REGISTER_IMM(num_engines);
1449 			for_each_engine(signaller, engine->gt, id) {
1450 				if (signaller == engine)
1451 					continue;
1452 
1453 				last_reg = RING_PSMI_CTL(signaller->mmio_base);
1454 				*cs++ = i915_mmio_reg_offset(last_reg);
1455 				*cs++ = _MASKED_BIT_DISABLE(
1456 						GEN6_PSMI_SLEEP_MSG_DISABLE);
1457 			}
1458 
1459 			/* Insert a delay before the next switch! */
1460 			*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
1461 			*cs++ = i915_mmio_reg_offset(last_reg);
1462 			*cs++ = intel_gt_scratch_offset(engine->gt,
1463 							INTEL_GT_SCRATCH_FIELD_DEFAULT);
1464 			*cs++ = MI_NOOP;
1465 		}
1466 		*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1467 	} else if (IS_GEN(i915, 5)) {
1468 		*cs++ = MI_SUSPEND_FLUSH;
1469 	}
1470 
1471 	intel_ring_advance(rq, cs);
1472 
1473 	return 0;
1474 }
1475 
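/*
 * Rewrite the saved L3 remapping registers for a slice into the
 * GEN7_L3LOG bank with a single MI_LOAD_REGISTER_IMM.
 */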
1476 static int remap_l3_slice(struct i915_request *rq, int slice)
1477 {
1478 	u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice];
1479 	int i;
1480 
1481 	if (!remap_info)
1482 		return 0;
1483 
1484 	cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2);
1485 	if (IS_ERR(cs))
1486 		return PTR_ERR(cs);
1487 
1488 	/*
1489 	 * Note: We do not worry about the concurrent register cacheline hang
1490 	 * here because no other code should access these registers other than
1491 	 * at initialization time.
1492 	 */
1493 	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
1494 	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
1495 		*cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
1496 		*cs++ = remap_info[i];
1497 	}
1498 	*cs++ = MI_NOOP;
1499 	intel_ring_advance(rq, cs);
1500 
1501 	return 0;
1502 }
1503 
1504 static int remap_l3(struct i915_request *rq)
1505 {
1506 	struct i915_gem_context *ctx = i915_request_gem_context(rq);
1507 	int i, err;
1508 
1509 	if (!ctx || !ctx->remap_slice)
1510 		return 0;
1511 
1512 	for (i = 0; i < MAX_L3_SLICES; i++) {
1513 		if (!(ctx->remap_slice & BIT(i)))
1514 			continue;
1515 
1516 		err = remap_l3_slice(rq, i);
1517 		if (err)
1518 			return err;
1519 	}
1520 
1521 	ctx->remap_slice = 0;
1522 	return 0;
1523 }
1524 
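/*
 * Switch the ring to a new address space: flush, load the new page
 * directory (with its own post-sync barrier) and then invalidate the
 * TLBs before the next batch.
 */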
1525 static int switch_mm(struct i915_request *rq, struct i915_address_space *vm)
1526 {
1527 	int ret;
1528 
1529 	if (!vm)
1530 		return 0;
1531 
1532 	ret = rq->engine->emit_flush(rq, EMIT_FLUSH);
1533 	if (ret)
1534 		return ret;
1535 
1536 	/*
1537 	 * Not only do we need a full barrier (post-sync write) after
1538 	 * invalidating the TLBs, but we need to wait a little bit
1539 	 * longer. Whether this is merely delaying us, or the
1540 	 * subsequent flush is a key part of serialising with the
1541 	 * post-sync op, this extra pass appears vital before a
1542 	 * mm switch!
1543 	 */
1544 	ret = load_pd_dir(rq, i915_vm_to_ppgtt(vm), PP_DIR_DCLV_2G);
1545 	if (ret)
1546 		return ret;
1547 
1548 	return rq->engine->emit_flush(rq, EMIT_INVALIDATE);
1549 }
1550 
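/*
 * Run the engine's wa_ctx batch from the kernel context to scrub the
 * residual state left behind by the previous user context before handing
 * the engine to a new one.
 */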
1551 static int clear_residuals(struct i915_request *rq)
1552 {
1553 	struct intel_engine_cs *engine = rq->engine;
1554 	int ret;
1555 
1556 	ret = switch_mm(rq, vm_alias(engine->kernel_context->vm));
1557 	if (ret)
1558 		return ret;
1559 
1560 	if (engine->kernel_context->state) {
1561 		ret = mi_set_context(rq,
1562 				     engine->kernel_context,
1563 				     MI_MM_SPACE_GTT | MI_RESTORE_INHIBIT);
1564 		if (ret)
1565 			return ret;
1566 	}
1567 
1568 	ret = engine->emit_bb_start(rq,
1569 				    engine->wa_ctx.vma->node.start, 0,
1570 				    0);
1571 	if (ret)
1572 		return ret;
1573 
1574 	ret = engine->emit_flush(rq, EMIT_FLUSH);
1575 	if (ret)
1576 		return ret;
1577 
1578 	/* Always invalidate before the next switch_mm() */
1579 	return engine->emit_flush(rq, EMIT_INVALIDATE);
1580 }
1581 
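/*
 * Emit the legacy context-switch preamble for this request: clear
 * residuals when the last user context differs, switch the ppgtt, emit
 * MI_SET_CONTEXT if the engine carries a context image, and re-apply any
 * pending L3 remapping.
 */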
1582 static int switch_context(struct i915_request *rq)
1583 {
1584 	struct intel_engine_cs *engine = rq->engine;
1585 	struct intel_context *ce = rq->context;
1586 	void **residuals = NULL;
1587 	int ret;
1588 
1589 	GEM_BUG_ON(HAS_EXECLISTS(rq->i915));
1590 
1591 	if (engine->wa_ctx.vma && ce != engine->kernel_context) {
1592 		if (engine->wa_ctx.vma->private != ce) {
1593 			ret = clear_residuals(rq);
1594 			if (ret)
1595 				return ret;
1596 
1597 			residuals = &engine->wa_ctx.vma->private;
1598 		}
1599 	}
1600 
1601 	ret = switch_mm(rq, vm_alias(ce->vm));
1602 	if (ret)
1603 		return ret;
1604 
1605 	if (ce->state) {
1606 		u32 flags;
1607 
1608 		GEM_BUG_ON(engine->id != RCS0);
1609 
1610 		/* For resource streamer on HSW+ and power context elsewhere */
1611 		BUILD_BUG_ON(HSW_MI_RS_SAVE_STATE_EN != MI_SAVE_EXT_STATE_EN);
1612 		BUILD_BUG_ON(HSW_MI_RS_RESTORE_STATE_EN != MI_RESTORE_EXT_STATE_EN);
1613 
1614 		flags = MI_SAVE_EXT_STATE_EN | MI_MM_SPACE_GTT;
1615 		if (test_bit(CONTEXT_VALID_BIT, &ce->flags))
1616 			flags |= MI_RESTORE_EXT_STATE_EN;
1617 		else
1618 			flags |= MI_RESTORE_INHIBIT;
1619 
1620 		ret = mi_set_context(rq, ce, flags);
1621 		if (ret)
1622 			return ret;
1623 	}
1624 
1625 	ret = remap_l3(rq);
1626 	if (ret)
1627 		return ret;
1628 
1629 	/*
1630 	 * Now past the point of no return, this request _will_ be emitted.
1631 	 *
	 * Or at least this preamble will be emitted; the request may be
1633 	 * interrupted prior to submitting the user payload. If so, we
1634 	 * still submit the "empty" request in order to preserve global
1635 	 * state tracking such as this, our tracking of the current
1636 	 * dirty context.
1637 	 */
1638 	if (residuals) {
1639 		intel_context_put(*residuals);
1640 		*residuals = intel_context_get(ce);
1641 	}
1642 
1643 	return 0;
1644 }
1645 
1646 static int ring_request_alloc(struct i915_request *request)
1647 {
1648 	int ret;
1649 
1650 	GEM_BUG_ON(!intel_context_is_pinned(request->context));
1651 	GEM_BUG_ON(i915_request_timeline(request)->has_initial_breadcrumb);
1652 
1653 	/*
1654 	 * Flush enough space to reduce the likelihood of waiting after
1655 	 * we start building the request - in which case we will just
1656 	 * have to repeat work.
1657 	 */
1658 	request->reserved_space += LEGACY_REQUEST_SIZE;
1659 
1660 	/* Unconditionally invalidate GPU caches and TLBs. */
1661 	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1662 	if (ret)
1663 		return ret;
1664 
1665 	ret = switch_context(request);
1666 	if (ret)
1667 		return ret;
1668 
1669 	request->reserved_space -= LEGACY_REQUEST_SIZE;
1670 	return 0;
1671 }
1672 
1673 static void gen6_bsd_submit_request(struct i915_request *request)
1674 {
1675 	struct intel_uncore *uncore = request->engine->uncore;
1676 
1677 	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
1678 
	/* Every tail move must follow the sequence below */
1680 
1681 	/* Disable notification that the ring is IDLE. The GT
1682 	 * will then assume that it is busy and bring it out of rc6.
1683 	 */
1684 	intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1685 			      _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1686 
1687 	/* Clear the context id. Here be magic! */
1688 	intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0);
1689 
1690 	/* Wait for the ring not to be idle, i.e. for it to wake up. */
1691 	if (__intel_wait_for_register_fw(uncore,
1692 					 GEN6_BSD_SLEEP_PSMI_CONTROL,
1693 					 GEN6_BSD_SLEEP_INDICATOR,
1694 					 0,
1695 					 1000, 0, NULL))
1696 		drm_err(&uncore->i915->drm,
1697 			"timed out waiting for the BSD ring to wake up\n");
1698 
1699 	/* Now that the ring is fully powered up, update the tail */
1700 	i9xx_submit_request(request);
1701 
1702 	/* Let the ring send IDLE messages to the GT again,
1703 	 * and so let it sleep to conserve power when idle.
1704 	 */
1705 	intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL,
1706 			      _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE));
1707 
1708 	intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
1709 }
1710 
1711 static int mi_flush_dw(struct i915_request *rq, u32 flags)
1712 {
1713 	u32 cmd, *cs;
1714 
1715 	cs = intel_ring_begin(rq, 4);
1716 	if (IS_ERR(cs))
1717 		return PTR_ERR(cs);
1718 
1719 	cmd = MI_FLUSH_DW;
1720 
1721 	/*
1722 	 * We always require a command barrier so that subsequent
1723 	 * commands, such as breadcrumb interrupts, are strictly ordered
1724 	 * wrt the contents of the write cache being flushed to memory
1725 	 * (and thus being coherent from the CPU).
1726 	 */
1727 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
1728 
1729 	/*
1730 	 * Bspec vol 1c.3 - blitter engine command streamer:
1731 	 * "If ENABLED, all TLBs will be invalidated once the flush
1732 	 * operation is complete. This bit is only valid when the
1733 	 * Post-Sync Operation field is a value of 1h or 3h."
1734 	 */
1735 	cmd |= flags;
1736 
1737 	*cs++ = cmd;
1738 	*cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
1739 	*cs++ = 0;
1740 	*cs++ = MI_NOOP;
1741 
1742 	intel_ring_advance(rq, cs);
1743 
1744 	return 0;
1745 }
1746 
1747 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
1748 {
1749 	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
1750 }
1751 
1752 static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode)
1753 {
1754 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
1755 }
1756 
1757 static int
1758 hsw_emit_bb_start(struct i915_request *rq,
1759 		  u64 offset, u32 len,
1760 		  unsigned int dispatch_flags)
1761 {
1762 	u32 *cs;
1763 
1764 	cs = intel_ring_begin(rq, 2);
1765 	if (IS_ERR(cs))
1766 		return PTR_ERR(cs);
1767 
1768 	*cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
1769 		0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW);
	/* bits 0-7 are the length on GEN6+ */
1771 	*cs++ = offset;
1772 	intel_ring_advance(rq, cs);
1773 
1774 	return 0;
1775 }
1776 
1777 static int
1778 gen6_emit_bb_start(struct i915_request *rq,
1779 		   u64 offset, u32 len,
1780 		   unsigned int dispatch_flags)
1781 {
1782 	u32 *cs;
1783 
1784 	cs = intel_ring_begin(rq, 2);
1785 	if (IS_ERR(cs))
1786 		return PTR_ERR(cs);
1787 
1788 	*cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ?
1789 		0 : MI_BATCH_NON_SECURE_I965);
	/* bits 0-7 are the length on GEN6+ */
1791 	*cs++ = offset;
1792 	intel_ring_advance(rq, cs);
1793 
1794 	return 0;
1795 }
1796 
1797 /* Blitter support (SandyBridge+) */
1798 
1799 static int gen6_ring_flush(struct i915_request *rq, u32 mode)
1800 {
1801 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
1802 }
1803 
1804 static void i9xx_set_default_submission(struct intel_engine_cs *engine)
1805 {
1806 	engine->submit_request = i9xx_submit_request;
1807 
1808 	engine->park = NULL;
1809 	engine->unpark = NULL;
1810 }
1811 
1812 static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
1813 {
1814 	i9xx_set_default_submission(engine);
1815 	engine->submit_request = gen6_bsd_submit_request;
1816 }
1817 
static void ring_release(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;

	drm_WARN_ON(&dev_priv->drm, INTEL_GEN(dev_priv) > 2 &&
		    (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0);

	intel_engine_cleanup_common(engine);

	if (engine->wa_ctx.vma) {
		intel_context_put(engine->wa_ctx.vma->private);
		i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
	}

	intel_ring_unpin(engine->legacy.ring);
	intel_ring_put(engine->legacy.ring);

	intel_timeline_unpin(engine->legacy.timeline);
	intel_timeline_put(engine->legacy.timeline);
}

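/* Select the generation-specific interrupt enable/disable hooks. */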
static void setup_irq(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	if (INTEL_GEN(i915) >= 6) {
		engine->irq_enable = gen6_irq_enable;
		engine->irq_disable = gen6_irq_disable;
	} else if (INTEL_GEN(i915) >= 5) {
		engine->irq_enable = gen5_irq_enable;
		engine->irq_disable = gen5_irq_disable;
	} else if (INTEL_GEN(i915) >= 3) {
		engine->irq_enable = i9xx_irq_enable;
		engine->irq_disable = i9xx_irq_disable;
	} else {
		engine->irq_enable = i8xx_irq_enable;
		engine->irq_disable = i8xx_irq_disable;
	}
}

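/*
 * Defaults shared by every legacy ringbuffer engine; the per-class
 * setup_rcs/vcs/bcs/vecs() helpers below override individual vfuncs.
 */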
static void setup_common(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	/* gen8+ are only supported with execlists */
	GEM_BUG_ON(INTEL_GEN(i915) >= 8);

	setup_irq(engine);

	engine->resume = xcs_resume;
	engine->reset.prepare = reset_prepare;
	engine->reset.rewind = reset_rewind;
	engine->reset.cancel = reset_cancel;
	engine->reset.finish = reset_finish;

	engine->cops = &ring_context_ops;
	engine->request_alloc = ring_request_alloc;

	/*
	 * Using a global execution timeline; the previous final breadcrumb is
	 * equivalent to our next initial breadcrumb, so we can elide
	 * engine->emit_init_breadcrumb().
	 */
	engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb;
	if (IS_GEN(i915, 5))
		engine->emit_fini_breadcrumb = gen5_emit_breadcrumb;

	engine->set_default_submission = i9xx_set_default_submission;

	if (INTEL_GEN(i915) >= 6)
		engine->emit_bb_start = gen6_emit_bb_start;
	else if (INTEL_GEN(i915) >= 4)
		engine->emit_bb_start = i965_emit_bb_start;
	else if (IS_I830(i915) || IS_I845G(i915))
		engine->emit_bb_start = i830_emit_bb_start;
	else
		engine->emit_bb_start = i915_emit_bb_start;
}

static void setup_rcs(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	if (HAS_L3_DPF(i915))
		engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT;

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;

	if (INTEL_GEN(i915) >= 7) {
		engine->emit_flush = gen7_render_ring_flush;
		engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb;
	} else if (IS_GEN(i915, 6)) {
		engine->emit_flush = gen6_render_ring_flush;
		engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb;
	} else if (IS_GEN(i915, 5)) {
		engine->emit_flush = gen4_render_ring_flush;
	} else {
		if (INTEL_GEN(i915) < 4)
			engine->emit_flush = gen2_render_ring_flush;
		else
			engine->emit_flush = gen4_render_ring_flush;
		engine->irq_enable_mask = I915_USER_INTERRUPT;
	}

	if (IS_HASWELL(i915))
		engine->emit_bb_start = hsw_emit_bb_start;

	engine->resume = rcs_resume;
}

static void setup_vcs(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	if (INTEL_GEN(i915) >= 6) {
		/* gen6 BSD needs a special workaround for tail updates */
		if (IS_GEN(i915, 6))
			engine->set_default_submission = gen6_bsd_set_default_submission;
		engine->emit_flush = gen6_bsd_ring_flush;
		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;

		if (IS_GEN(i915, 6))
			engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
		else
			engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
	} else {
		engine->emit_flush = bsd_ring_flush;
		if (IS_GEN(i915, 5))
			engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
		else
			engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
	}
}

static void setup_bcs(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	engine->emit_flush = gen6_ring_flush;
	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;

	if (IS_GEN(i915, 6))
		engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb;
	else
		engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
}

static void setup_vecs(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	GEM_BUG_ON(INTEL_GEN(i915) < 7);

	engine->emit_flush = gen6_ring_flush;
	engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT;
	engine->irq_enable = hsw_vebox_irq_enable;
	engine->irq_disable = hsw_vebox_irq_disable;

	engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb;
}

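/*
 * Called twice from gen7_ctx_switch_bb_init(): once with vma == NULL to probe
 * the batch size required, and once with the pinned vma to emit the
 * context-switch batch itself. Currently a stub returning 0, which causes
 * gen7_ctx_switch_bb_init() to skip the allocation entirely.
 */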
static int gen7_ctx_switch_bb_setup(struct intel_engine_cs * const engine,
				    struct i915_vma * const vma)
{
	return 0;
}

static int gen7_ctx_switch_bb_init(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int size;
	int err;

	size = gen7_ctx_switch_bb_setup(engine, NULL /* probe size */);
	if (size <= 0)
		return size;

	size = ALIGN(size, PAGE_SIZE);
	obj = i915_gem_object_create_internal(engine->i915, size);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, engine->gt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err_obj;
	}

	vma->private = intel_context_create(engine); /* dummy residuals */
	if (IS_ERR(vma->private)) {
		err = PTR_ERR(vma->private);
		goto err_obj;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
	if (err)
		goto err_private;

	err = gen7_ctx_switch_bb_setup(engine, vma);
	if (err)
		goto err_unpin;

	engine->wa_ctx.vma = vma;
	return 0;

err_unpin:
	i915_vma_unpin(vma);
err_private:
	intel_context_put(vma->private);
err_obj:
	i915_gem_object_put(obj);
	return err;
}

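/*
 * Set up an engine for legacy ringbuffer submission: install the common and
 * per-class vfuncs, bind a global timeline to the status page, create and pin
 * a 16KiB ring, optionally build the gen7 render context-switch batch, and
 * only then take ownership of cleanup by installing ring_release().
 */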
int intel_ring_submission_setup(struct intel_engine_cs *engine)
{
	struct intel_timeline *timeline;
	struct intel_ring *ring;
	int err;

	setup_common(engine);

	switch (engine->class) {
	case RENDER_CLASS:
		setup_rcs(engine);
		break;
	case VIDEO_DECODE_CLASS:
		setup_vcs(engine);
		break;
	case COPY_ENGINE_CLASS:
		setup_bcs(engine);
		break;
	case VIDEO_ENHANCEMENT_CLASS:
		setup_vecs(engine);
		break;
	default:
		MISSING_CASE(engine->class);
		return -ENODEV;
	}

	timeline = intel_timeline_create(engine->gt, engine->status_page.vma);
	if (IS_ERR(timeline)) {
		err = PTR_ERR(timeline);
		goto err;
	}
	GEM_BUG_ON(timeline->has_initial_breadcrumb);

	err = intel_timeline_pin(timeline);
	if (err)
		goto err_timeline;

	ring = intel_engine_create_ring(engine, SZ_16K);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_timeline_unpin;
	}

	err = intel_ring_pin(ring);
	if (err)
		goto err_ring;

	GEM_BUG_ON(engine->legacy.ring);
	engine->legacy.ring = ring;
	engine->legacy.timeline = timeline;

	GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma);

	if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) {
		err = gen7_ctx_switch_bb_init(engine);
		if (err)
			goto err_ring_unpin;
	}

	/* Finally, take ownership and responsibility for cleanup! */
	engine->release = ring_release;

	return 0;

err_ring_unpin:
	intel_ring_unpin(ring);
err_ring:
	intel_ring_put(ring);
err_timeline_unpin:
	intel_timeline_unpin(timeline);
err_timeline:
	intel_timeline_put(timeline);
err:
	intel_engine_cleanup_common(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_ring_submission.c"
#endif