// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6. From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it. Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either. Notify enable is IRQs, which aren't
 * really our business. That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

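/*
 * EMIT_FLUSH/EMIT_INVALIDATE handler for the gen6 render ring: the
 * post-sync-nonzero workaround above is applied first, then a single
 * PIPE_CONTROL carries the requested flush/invalidate bits together
 * with a post-sync write into the render-flush scratch slot.
 */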
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything. Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_timeline(rq)->hwsp_offset |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

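/*
 * Batch buffer dispatch for gen6+: by default the batch is marked
 * non-secure (MI_BATCH_NON_SECURE_I965) so it executes unprivileged;
 * only I915_DISPATCH_SECURE clears that bit to run the batch as a
 * privileged (secure) dispatch. The Haswell variant below follows the
 * same pattern, with the PPGTT selector added.
 */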
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything. Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_timeline(rq)->hwsp_offset;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

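/*
 * Breadcrumb for the gen7 non-render (xcs) rings. The MI_FLUSH_DW below
 * writes the seqno into the status page; the GEN7_XCS_WA repetitions of
 * MI_STORE_DWORD_INDEX and the trailing empty MI_FLUSH_DW are a
 * workaround, presumably to ensure the seqno store has landed before
 * MI_USER_INTERRUPT is emitted.
 */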
335 */ 336 gen7_stall_cs(rq); 337 } 338 339 cs = intel_ring_begin(rq, 4); 340 if (IS_ERR(cs)) 341 return PTR_ERR(cs); 342 343 *cs++ = GFX_OP_PIPE_CONTROL(4); 344 *cs++ = flags; 345 *cs++ = scratch_addr; 346 *cs++ = 0; 347 intel_ring_advance(rq, cs); 348 349 return 0; 350 } 351 352 u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) 353 { 354 *cs++ = GFX_OP_PIPE_CONTROL(4); 355 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 356 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 357 PIPE_CONTROL_DC_FLUSH_ENABLE | 358 PIPE_CONTROL_FLUSH_ENABLE | 359 PIPE_CONTROL_QW_WRITE | 360 PIPE_CONTROL_GLOBAL_GTT_IVB | 361 PIPE_CONTROL_CS_STALL); 362 *cs++ = i915_request_active_timeline(rq)->hwsp_offset; 363 *cs++ = rq->fence.seqno; 364 365 *cs++ = MI_USER_INTERRUPT; 366 *cs++ = MI_NOOP; 367 368 rq->tail = intel_ring_offset(rq, cs); 369 assert_ring_tail_valid(rq->ring, rq->tail); 370 371 return cs; 372 } 373 374 u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) 375 { 376 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); 377 GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); 378 379 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; 380 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; 381 *cs++ = rq->fence.seqno; 382 383 *cs++ = MI_USER_INTERRUPT; 384 385 rq->tail = intel_ring_offset(rq, cs); 386 assert_ring_tail_valid(rq->ring, rq->tail); 387 388 return cs; 389 } 390 391 #define GEN7_XCS_WA 32 392 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) 393 { 394 int i; 395 396 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); 397 GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); 398 399 *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB | 400 MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; 401 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; 402 *cs++ = rq->fence.seqno; 403 404 for (i = 0; i < GEN7_XCS_WA; i++) { 405 *cs++ = MI_STORE_DWORD_INDEX; 406 *cs++ = I915_GEM_HWS_SEQNO_ADDR; 407 *cs++ = rq->fence.seqno; 408 } 409 410 *cs++ = MI_FLUSH_DW; 411 *cs++ = 0; 412 *cs++ = 0; 413 414 *cs++ = MI_USER_INTERRUPT; 415 *cs++ = MI_NOOP; 416 417 rq->tail = intel_ring_offset(rq, cs); 418 assert_ring_tail_valid(rq->ring, rq->tail); 419 420 return cs; 421 } 422 #undef GEN7_XCS_WA 423 424 void gen6_irq_enable(struct intel_engine_cs *engine) 425 { 426 ENGINE_WRITE(engine, RING_IMR, 427 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 428 429 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ 430 ENGINE_POSTING_READ(engine, RING_IMR); 431 432 gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); 433 } 434 435 void gen6_irq_disable(struct intel_engine_cs *engine) 436 { 437 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 438 gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); 439 } 440 441 void hsw_irq_enable_vecs(struct intel_engine_cs *engine) 442 { 443 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); 444 445 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ 446 ENGINE_POSTING_READ(engine, RING_IMR); 447 448 gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask); 449 } 450 451 void hsw_irq_disable_vecs(struct intel_engine_cs *engine) 452 { 453 ENGINE_WRITE(engine, RING_IMR, ~0); 454 gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask); 455 } 456