xref: /openbmc/linux/drivers/gpu/drm/i915/gt/gen2_engine_cs.c (revision 75b1a8f9d62e50f05d0e4e9f3c8bcde32527ffc1)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include "gen2_engine_cs.h"
7 #include "i915_drv.h"
8 #include "intel_engine.h"
9 #include "intel_gpu_commands.h"
10 #include "intel_gt.h"
11 #include "intel_gt_irq.h"
12 #include "intel_ring.h"
13 
14 int gen2_emit_flush(struct i915_request *rq, u32 mode)
15 {
16 	unsigned int num_store_dw = 12;
17 	u32 cmd, *cs;
18 
19 	cmd = MI_FLUSH;
20 	if (mode & EMIT_INVALIDATE)
21 		cmd |= MI_READ_FLUSH;
22 
23 	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
24 	if (IS_ERR(cs))
25 		return PTR_ERR(cs);
26 
27 	*cs++ = cmd;
28 	while (num_store_dw--) {
29 		*cs++ = MI_STORE_DWORD_INDEX;
30 		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
31 		*cs++ = 0;
32 		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
33 	}
34 	*cs++ = cmd;
35 
36 	intel_ring_advance(rq, cs);
37 
38 	return 0;
39 }
40 
41 int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
42 {
43 	u32 cmd, *cs;
44 	int i;
45 
46 	/*
47 	 * read/write caches:
48 	 *
49 	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
50 	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
51 	 * also flushed at 2d versus 3d pipeline switches.
52 	 *
53 	 * read-only caches:
54 	 *
55 	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
56 	 * MI_READ_FLUSH is set, and is always flushed on 965.
57 	 *
58 	 * I915_GEM_DOMAIN_COMMAND may not exist?
59 	 *
60 	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
61 	 * invalidated when MI_EXE_FLUSH is set.
62 	 *
63 	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
64 	 * invalidated with every MI_FLUSH.
65 	 *
66 	 * TLBs:
67 	 *
68 	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
69 	 * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
70 	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
71 	 * are flushed at any MI_FLUSH.
72 	 */
73 
74 	cmd = MI_FLUSH;
75 	if (mode & EMIT_INVALIDATE) {
76 		cmd |= MI_EXE_FLUSH;
77 		if (IS_G4X(rq->engine->i915) || IS_GEN(rq->engine->i915, 5))
78 			cmd |= MI_INVALIDATE_ISP;
79 	}
80 
81 	i = 2;
82 	if (mode & EMIT_INVALIDATE)
83 		i += 20;
84 
85 	cs = intel_ring_begin(rq, i);
86 	if (IS_ERR(cs))
87 		return PTR_ERR(cs);
88 
89 	*cs++ = cmd;
90 
91 	/*
92 	 * A random delay to let the CS invalidate take effect? Without this
93 	 * delay, the GPU relocation path fails as the CS does not see
94 	 * the updated contents. Just as important, if we apply the flushes
95 	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
96 	 * write and before the invalidate on the next batch), the relocations
97 	 * still fail. This implies that is a delay following invalidation
98 	 * that is required to reset the caches as opposed to a delay to
99 	 * ensure the memory is written.
100 	 */
101 	if (mode & EMIT_INVALIDATE) {
102 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
103 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
104 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
105 			PIPE_CONTROL_GLOBAL_GTT;
106 		*cs++ = 0;
107 		*cs++ = 0;
108 
109 		for (i = 0; i < 12; i++)
110 			*cs++ = MI_FLUSH;
111 
112 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
113 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
114 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
115 			PIPE_CONTROL_GLOBAL_GTT;
116 		*cs++ = 0;
117 		*cs++ = 0;
118 	}
119 
120 	*cs++ = cmd;
121 
122 	intel_ring_advance(rq, cs);
123 
124 	return 0;
125 }
126 
127 int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
128 {
129 	u32 *cs;
130 
131 	cs = intel_ring_begin(rq, 2);
132 	if (IS_ERR(cs))
133 		return PTR_ERR(cs);
134 
135 	*cs++ = MI_FLUSH;
136 	*cs++ = MI_NOOP;
137 	intel_ring_advance(rq, cs);
138 
139 	return 0;
140 }
141 
142 static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
143 				   int flush, int post)
144 {
145 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
146 	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
147 
148 	*cs++ = MI_FLUSH;
149 
150 	while (flush--) {
151 		*cs++ = MI_STORE_DWORD_INDEX;
152 		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
153 		*cs++ = rq->fence.seqno;
154 	}
155 
156 	while (post--) {
157 		*cs++ = MI_STORE_DWORD_INDEX;
158 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
159 		*cs++ = rq->fence.seqno;
160 	}
161 
162 	*cs++ = MI_USER_INTERRUPT;
163 
164 	rq->tail = intel_ring_offset(rq, cs);
165 	assert_ring_tail_valid(rq->ring, rq->tail);
166 
167 	return cs;
168 }
169 
170 u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
171 {
172 	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
173 }
174 
175 u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
176 {
177 	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
178 }
179 
180 /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
181 #define I830_BATCH_LIMIT SZ_256K
182 #define I830_TLB_ENTRIES (2)
183 #define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
184 int i830_emit_bb_start(struct i915_request *rq,
185 		       u64 offset, u32 len,
186 		       unsigned int dispatch_flags)
187 {
188 	u32 *cs, cs_offset =
189 		intel_gt_scratch_offset(rq->engine->gt,
190 					INTEL_GT_SCRATCH_FIELD_DEFAULT);
191 
192 	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);
193 
194 	cs = intel_ring_begin(rq, 6);
195 	if (IS_ERR(cs))
196 		return PTR_ERR(cs);
197 
198 	/* Evict the invalid PTE TLBs */
199 	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
200 	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
201 	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
202 	*cs++ = cs_offset;
203 	*cs++ = 0xdeadbeef;
204 	*cs++ = MI_NOOP;
205 	intel_ring_advance(rq, cs);
206 
207 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
208 		if (len > I830_BATCH_LIMIT)
209 			return -ENOSPC;
210 
211 		cs = intel_ring_begin(rq, 6 + 2);
212 		if (IS_ERR(cs))
213 			return PTR_ERR(cs);
214 
215 		/*
216 		 * Blit the batch (which has now all relocs applied) to the
217 		 * stable batch scratch bo area (so that the CS never
218 		 * stumbles over its tlb invalidation bug) ...
219 		 */
220 		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
221 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
222 		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
223 		*cs++ = cs_offset;
224 		*cs++ = 4096;
225 		*cs++ = offset;
226 
227 		*cs++ = MI_FLUSH;
228 		*cs++ = MI_NOOP;
229 		intel_ring_advance(rq, cs);
230 
231 		/* ... and execute it. */
232 		offset = cs_offset;
233 	}
234 
235 	if (!(dispatch_flags & I915_DISPATCH_SECURE))
236 		offset |= MI_BATCH_NON_SECURE;
237 
238 	cs = intel_ring_begin(rq, 2);
239 	if (IS_ERR(cs))
240 		return PTR_ERR(cs);
241 
242 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
243 	*cs++ = offset;
244 	intel_ring_advance(rq, cs);
245 
246 	return 0;
247 }
248 
249 int gen3_emit_bb_start(struct i915_request *rq,
250 		       u64 offset, u32 len,
251 		       unsigned int dispatch_flags)
252 {
253 	u32 *cs;
254 
255 	if (!(dispatch_flags & I915_DISPATCH_SECURE))
256 		offset |= MI_BATCH_NON_SECURE;
257 
258 	cs = intel_ring_begin(rq, 2);
259 	if (IS_ERR(cs))
260 		return PTR_ERR(cs);
261 
262 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
263 	*cs++ = offset;
264 	intel_ring_advance(rq, cs);
265 
266 	return 0;
267 }
268 
269 int gen4_emit_bb_start(struct i915_request *rq,
270 		       u64 offset, u32 length,
271 		       unsigned int dispatch_flags)
272 {
273 	u32 security;
274 	u32 *cs;
275 
276 	security = MI_BATCH_NON_SECURE_I965;
277 	if (dispatch_flags & I915_DISPATCH_SECURE)
278 		security = 0;
279 
280 	cs = intel_ring_begin(rq, 2);
281 	if (IS_ERR(cs))
282 		return PTR_ERR(cs);
283 
284 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
285 	*cs++ = offset;
286 	intel_ring_advance(rq, cs);
287 
288 	return 0;
289 }
290 
291 void gen2_irq_enable(struct intel_engine_cs *engine)
292 {
293 	struct drm_i915_private *i915 = engine->i915;
294 
295 	i915->irq_mask &= ~engine->irq_enable_mask;
296 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
297 	ENGINE_POSTING_READ16(engine, RING_IMR);
298 }
299 
300 void gen2_irq_disable(struct intel_engine_cs *engine)
301 {
302 	struct drm_i915_private *i915 = engine->i915;
303 
304 	i915->irq_mask |= engine->irq_enable_mask;
305 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
306 }
307 
308 void gen3_irq_enable(struct intel_engine_cs *engine)
309 {
310 	engine->i915->irq_mask &= ~engine->irq_enable_mask;
311 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
312 	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
313 }
314 
315 void gen3_irq_disable(struct intel_engine_cs *engine)
316 {
317 	engine->i915->irq_mask |= engine->irq_enable_mask;
318 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
319 }
320 
321 void gen5_irq_enable(struct intel_engine_cs *engine)
322 {
323 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
324 }
325 
326 void gen5_irq_disable(struct intel_engine_cs *engine)
327 {
328 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
329 }
330