1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include "gen6_engine_cs.h"
7 #include "intel_engine.h"
8 #include "intel_gpu_commands.h"
9 #include "intel_gt.h"
10 #include "intel_gt_irq.h"
11 #include "intel_gt_pm_irq.h"
12 #include "intel_ring.h"
13 
/*
 * I915_GEM_HWS_SCRATCH scaled by the size of a u32, i.e. expressed in bytes.
 * mi_flush_dw() uses it as the address operand of its MI_FLUSH_DW store.
 */
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))
15 
16 /*
17  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
18  * implementing two workarounds on gen6.  From section 1.4.7.1
19  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
20  *
21  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
22  * produced by non-pipelined state commands), software needs to first
23  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
24  * 0.
25  *
26  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
27  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
28  *
29  * And the workaround for these two requires this workaround first:
30  *
31  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
32  * BEFORE the pipe-control with a post-sync op and no write-cache
33  * flushes.
34  *
35  * And this last workaround is tricky because of the requirements on
36  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
37  * volume 2 part 1:
38  *
39  *     "1 of the following must also be set:
40  *      - Render Target Cache Flush Enable ([12] of DW1)
41  *      - Depth Cache Flush Enable ([0] of DW1)
42  *      - Stall at Pixel Scoreboard ([1] of DW1)
43  *      - Depth Stall ([13] of DW1)
44  *      - Post-Sync Operation ([13] of DW1)
45  *      - Notify Enable ([8] of DW1)"
46  *
47  * The cache flushes require the workaround flush that triggered this
48  * one, so we can't use it.  Depth stall would trigger the same.
49  * Post-sync nonzero is what triggered this second workaround, so we
50  * can't use that one either.  Notify enable is IRQs, which aren't
51  * really our business.  That leaves only stall at scoreboard.
52  */
53 static int
54 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
55 {
56 	u32 scratch_addr =
57 		intel_gt_scratch_offset(rq->engine->gt,
58 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
59 	u32 *cs;
60 
61 	cs = intel_ring_begin(rq, 6);
62 	if (IS_ERR(cs))
63 		return PTR_ERR(cs);
64 
65 	*cs++ = GFX_OP_PIPE_CONTROL(5);
66 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
67 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
68 	*cs++ = 0; /* low dword */
69 	*cs++ = 0; /* high dword */
70 	*cs++ = MI_NOOP;
71 	intel_ring_advance(rq, cs);
72 
73 	cs = intel_ring_begin(rq, 6);
74 	if (IS_ERR(cs))
75 		return PTR_ERR(cs);
76 
77 	*cs++ = GFX_OP_PIPE_CONTROL(5);
78 	*cs++ = PIPE_CONTROL_QW_WRITE;
79 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
80 	*cs++ = 0;
81 	*cs++ = 0;
82 	*cs++ = MI_NOOP;
83 	intel_ring_advance(rq, cs);
84 
85 	return 0;
86 }
87 
88 int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
89 {
90 	u32 scratch_addr =
91 		intel_gt_scratch_offset(rq->engine->gt,
92 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
93 	u32 *cs, flags = 0;
94 	int ret;
95 
96 	/* Force SNB workarounds for PIPE_CONTROL flushes */
97 	ret = gen6_emit_post_sync_nonzero_flush(rq);
98 	if (ret)
99 		return ret;
100 
101 	/*
102 	 * Just flush everything.  Experiments have shown that reducing the
103 	 * number of bits based on the write domains has little performance
104 	 * impact. And when rearranging requests, the order of flushes is
105 	 * unknown.
106 	 */
107 	if (mode & EMIT_FLUSH) {
108 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
109 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
110 		/*
111 		 * Ensure that any following seqno writes only happen
112 		 * when the render cache is indeed flushed.
113 		 */
114 		flags |= PIPE_CONTROL_CS_STALL;
115 	}
116 	if (mode & EMIT_INVALIDATE) {
117 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
118 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
119 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
120 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
121 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
122 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
123 		/*
124 		 * TLB invalidate requires a post-sync write.
125 		 */
126 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
127 	}
128 
129 	cs = intel_ring_begin(rq, 4);
130 	if (IS_ERR(cs))
131 		return PTR_ERR(cs);
132 
133 	*cs++ = GFX_OP_PIPE_CONTROL(4);
134 	*cs++ = flags;
135 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
136 	*cs++ = 0;
137 	intel_ring_advance(rq, cs);
138 
139 	return 0;
140 }
141 
142 u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
143 {
144 	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
145 	*cs++ = GFX_OP_PIPE_CONTROL(4);
146 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
147 	*cs++ = 0;
148 	*cs++ = 0;
149 
150 	*cs++ = GFX_OP_PIPE_CONTROL(4);
151 	*cs++ = PIPE_CONTROL_QW_WRITE;
152 	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
153 					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
154 		PIPE_CONTROL_GLOBAL_GTT;
155 	*cs++ = 0;
156 
157 	/* Finally we can flush and with it emit the breadcrumb */
158 	*cs++ = GFX_OP_PIPE_CONTROL(4);
159 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
160 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
161 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
162 		 PIPE_CONTROL_QW_WRITE |
163 		 PIPE_CONTROL_CS_STALL);
164 	*cs++ = i915_request_active_timeline(rq)->hwsp_offset |
165 		PIPE_CONTROL_GLOBAL_GTT;
166 	*cs++ = rq->fence.seqno;
167 
168 	*cs++ = MI_USER_INTERRUPT;
169 	*cs++ = MI_NOOP;
170 
171 	rq->tail = intel_ring_offset(rq, cs);
172 	assert_ring_tail_valid(rq->ring, rq->tail);
173 
174 	return cs;
175 }
176 
177 static int mi_flush_dw(struct i915_request *rq, u32 flags)
178 {
179 	u32 cmd, *cs;
180 
181 	cs = intel_ring_begin(rq, 4);
182 	if (IS_ERR(cs))
183 		return PTR_ERR(cs);
184 
185 	cmd = MI_FLUSH_DW;
186 
187 	/*
188 	 * We always require a command barrier so that subsequent
189 	 * commands, such as breadcrumb interrupts, are strictly ordered
190 	 * wrt the contents of the write cache being flushed to memory
191 	 * (and thus being coherent from the CPU).
192 	 */
193 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
194 
195 	/*
196 	 * Bspec vol 1c.3 - blitter engine command streamer:
197 	 * "If ENABLED, all TLBs will be invalidated once the flush
198 	 * operation is complete. This bit is only valid when the
199 	 * Post-Sync Operation field is a value of 1h or 3h."
200 	 */
201 	cmd |= flags;
202 
203 	*cs++ = cmd;
204 	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
205 	*cs++ = 0;
206 	*cs++ = MI_NOOP;
207 
208 	intel_ring_advance(rq, cs);
209 
210 	return 0;
211 }
212 
213 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
214 {
215 	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
216 }
217 
218 int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
219 {
220 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
221 }
222 
223 int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
224 {
225 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
226 }
227 
228 int gen6_emit_bb_start(struct i915_request *rq,
229 		       u64 offset, u32 len,
230 		       unsigned int dispatch_flags)
231 {
232 	u32 security;
233 	u32 *cs;
234 
235 	security = MI_BATCH_NON_SECURE_I965;
236 	if (dispatch_flags & I915_DISPATCH_SECURE)
237 		security = 0;
238 
239 	cs = intel_ring_begin(rq, 2);
240 	if (IS_ERR(cs))
241 		return PTR_ERR(cs);
242 
243 	cs = __gen6_emit_bb_start(cs, offset, security);
244 	intel_ring_advance(rq, cs);
245 
246 	return 0;
247 }
248 
249 int
250 hsw_emit_bb_start(struct i915_request *rq,
251 		  u64 offset, u32 len,
252 		  unsigned int dispatch_flags)
253 {
254 	u32 security;
255 	u32 *cs;
256 
257 	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
258 	if (dispatch_flags & I915_DISPATCH_SECURE)
259 		security = 0;
260 
261 	cs = intel_ring_begin(rq, 2);
262 	if (IS_ERR(cs))
263 		return PTR_ERR(cs);
264 
265 	cs = __gen6_emit_bb_start(cs, offset, security);
266 	intel_ring_advance(rq, cs);
267 
268 	return 0;
269 }
270 
271 static int gen7_stall_cs(struct i915_request *rq)
272 {
273 	u32 *cs;
274 
275 	cs = intel_ring_begin(rq, 4);
276 	if (IS_ERR(cs))
277 		return PTR_ERR(cs);
278 
279 	*cs++ = GFX_OP_PIPE_CONTROL(4);
280 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
281 	*cs++ = 0;
282 	*cs++ = 0;
283 	intel_ring_advance(rq, cs);
284 
285 	return 0;
286 }
287 
288 int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
289 {
290 	u32 scratch_addr =
291 		intel_gt_scratch_offset(rq->engine->gt,
292 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
293 	u32 *cs, flags = 0;
294 
295 	/*
296 	 * Ensure that any following seqno writes only happen when the render
297 	 * cache is indeed flushed.
298 	 *
299 	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
300 	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
301 	 * don't try to be clever and just set it unconditionally.
302 	 */
303 	flags |= PIPE_CONTROL_CS_STALL;
304 
305 	/*
306 	 * CS_STALL suggests at least a post-sync write.
307 	 */
308 	flags |= PIPE_CONTROL_QW_WRITE;
309 	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
310 
311 	/*
312 	 * Just flush everything.  Experiments have shown that reducing the
313 	 * number of bits based on the write domains has little performance
314 	 * impact.
315 	 */
316 	if (mode & EMIT_FLUSH) {
317 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
318 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
319 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
320 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
321 	}
322 	if (mode & EMIT_INVALIDATE) {
323 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
324 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
325 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
326 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
327 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
328 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
329 		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
330 
331 		/*
332 		 * Workaround: we must issue a pipe_control with CS-stall bit
333 		 * set before a pipe_control command that has the state cache
334 		 * invalidate bit set.
335 		 */
336 		gen7_stall_cs(rq);
337 	}
338 
339 	cs = intel_ring_begin(rq, 4);
340 	if (IS_ERR(cs))
341 		return PTR_ERR(cs);
342 
343 	*cs++ = GFX_OP_PIPE_CONTROL(4);
344 	*cs++ = flags;
345 	*cs++ = scratch_addr;
346 	*cs++ = 0;
347 	intel_ring_advance(rq, cs);
348 
349 	return 0;
350 }
351 
352 u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
353 {
354 	*cs++ = GFX_OP_PIPE_CONTROL(4);
355 	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
356 		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
357 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
358 		 PIPE_CONTROL_FLUSH_ENABLE |
359 		 PIPE_CONTROL_QW_WRITE |
360 		 PIPE_CONTROL_GLOBAL_GTT_IVB |
361 		 PIPE_CONTROL_CS_STALL);
362 	*cs++ = i915_request_active_timeline(rq)->hwsp_offset;
363 	*cs++ = rq->fence.seqno;
364 
365 	*cs++ = MI_USER_INTERRUPT;
366 	*cs++ = MI_NOOP;
367 
368 	rq->tail = intel_ring_offset(rq, cs);
369 	assert_ring_tail_valid(rq->ring, rq->tail);
370 
371 	return cs;
372 }
373 
374 u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
375 {
376 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
377 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
378 
379 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
380 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
381 	*cs++ = rq->fence.seqno;
382 
383 	*cs++ = MI_USER_INTERRUPT;
384 
385 	rq->tail = intel_ring_offset(rq, cs);
386 	assert_ring_tail_valid(rq->ring, rq->tail);
387 
388 	return cs;
389 }
390 
391 #define GEN7_XCS_WA 32
392 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
393 {
394 	int i;
395 
396 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
397 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
398 
399 	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
400 		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
401 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
402 	*cs++ = rq->fence.seqno;
403 
404 	for (i = 0; i < GEN7_XCS_WA; i++) {
405 		*cs++ = MI_STORE_DWORD_INDEX;
406 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
407 		*cs++ = rq->fence.seqno;
408 	}
409 
410 	*cs++ = MI_FLUSH_DW;
411 	*cs++ = 0;
412 	*cs++ = 0;
413 
414 	*cs++ = MI_USER_INTERRUPT;
415 	*cs++ = MI_NOOP;
416 
417 	rq->tail = intel_ring_offset(rq, cs);
418 	assert_ring_tail_valid(rq->ring, rq->tail);
419 
420 	return cs;
421 }
422 #undef GEN7_XCS_WA
423 
424 void gen6_irq_enable(struct intel_engine_cs *engine)
425 {
426 	ENGINE_WRITE(engine, RING_IMR,
427 		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
428 
429 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
430 	ENGINE_POSTING_READ(engine, RING_IMR);
431 
432 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
433 }
434 
435 void gen6_irq_disable(struct intel_engine_cs *engine)
436 {
437 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
438 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
439 }
440 
441 void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
442 {
443 	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
444 
445 	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
446 	ENGINE_POSTING_READ(engine, RING_IMR);
447 
448 	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
449 }
450 
451 void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
452 {
453 	ENGINE_WRITE(engine, RING_IMR, ~0);
454 	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
455 }
456