// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * set_offsets() - expand a compact per-gen offset table into a context image.
 * @regs:   start of the context register state to fill
 * @data:   encoded table (one of the gen*_offsets[] arrays below)
 * @engine: engine whose mmio_base the register offsets are relative to
 * @close:  if true, terminate the image with MI_BATCH_BUFFER_END
 *
 * Table encoding (see the macros below):
 *   NOP(x)          - skip x dwords of the image untouched
 *   LRI(n, flags)   - write an MI_LOAD_REGISTER_IMM(n) header; POSTED sets
 *                     MI_LRI_FORCE_POSTED
 *   REG(x)/REG16(x) - a register offset, 7 bits per byte with BIT(7) as a
 *                     continuation flag; only the offset dword of each
 *                     (offset, value) pair is written, the value dword is
 *                     left as-is
 *   END             - zero byte, terminates the table
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (INTEL_GEN(engine->i915) >= 11)
			/* gen11+ decodes LRI offsets relative to the CS engine */
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/* Decode the varint offset, MSB first, 7 bits/byte */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Fill in only the register offset of the pair */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (INTEL_GEN(engine->i915) >= 10)
			*regs |= BIT(0);
	}
}

/*
 * Per-gen, per-class layouts of the HW context image: which registers
 * appear at which dword, mirroring what the HW itself saves/restores.
 * Kept consistent with live_lrc_layout() selftests.
 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

/* Select the context-image layout table for @engine's gen and class. */
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for virtual engine.
	 */
	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

/*
 * The lrc_ring_*() helpers below return the dword index, within the
 * register state, of a particular register's LRI offset slot (the value
 * lives at index + 1), or -1 when that register is not present in the
 * context image for this engine.
 */
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x60;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x74;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x12;
	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	/* INDIRECT_CTX sits one register pair after PER_CTX_BB */
	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	/* INDIRECT_CTX_OFFSET sits one register pair after INDIRECT_CTX */
	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (engine->class != RENDER_CLASS)
		return -1;

	if (INTEL_GEN(engine->i915) >= 12)
		return 0xb6;
	else if (INTEL_GEN(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

/* HW-defined default location (in cachelines) of the indirect ctx BB. */
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 10:
		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

/*
 * Point the context image's INDIRECT_CTX registers at a batch at
 * @ctx_bb_ggtt_addr of @size bytes (must be cacheline aligned; the low
 * bits of the pointer register encode the size in cachelines).
 */
static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

/* Program CTX_CONTEXT_CONTROL and restore the saved context timestamp. */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		/* Skip the (uninitialized) register restore on first run */
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (INTEL_GEN(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->runtime.last;
}

/* Hook up the engine's global per-ctx and indirect-ctx workaround batches. */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

/* Write the PPGTT page-directory pointers into the context image. */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

/* Resolve a vm to its ppgtt: the GGTT uses its aliasing ppgtt. */
static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

/* Clear any STOP_RING left in the context image's RING_MI_MODE. */
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

/* (Re)initialise the register state page of @ce's context image. */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

/* Only clear STOP_RING in the image, leaving the rest untouched. */
void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

/* Fill the debug redzone trailing the context image (DEBUG_GEM only). */
static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

/* Complain (once) if the HW or driver scribbled past the context image. */
static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

/*
 * One-time initialisation of a freshly allocated context image: copy the
 * engine's golden default state if available (otherwise the first restore
 * is inhibited), then set up the register state page.
 */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

/*
 * Allocate the backing store for the context image, preferring device
 * local memory with a shmem fallback. On gen12 an extra page is appended
 * for the per-context indirect workaround batch (ce->wa_bb_page).
 */
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (INTEL_GEN(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

/*
 * Recreate a pinned timeline on @engine from the page encoded in
 * ce->timeline (consumed here) — used for contexts with a fixed HWSP.
 */
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

/*
 * Allocate everything the context needs: state vma, ring and timeline.
 * The context interface stays unpinned; takes ownership of nothing on
 * failure (all partial allocations are released).
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	/* ce->ring here still encodes the requested ring size */
	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

/* Scrub the register state of a pinned context back to known-good values. */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

/* Map the context state object ahead of pinning (may be called under ww). */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

/*
 * Pin the context: record the register state pointer, run the one-time
 * state init if not yet done, and refresh the ring registers/descriptor.
 */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	/* Verify nothing trampled past the end of the context image */
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

/* Release the ring and state allocated by lrc_alloc(). */
void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

/* Final kref release for an intel_context using the LRC backend. */
void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

/*
 * Restore CTX_TIMESTAMP from the context image via GPR0
 * (Wa_1607138340-style timestamp fixup for gen12).
 */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	/*
	 * NOTE(review): the GPR0 -> CTX_TIMESTAMP copy is emitted twice;
	 * presumably the duplication is required by the workaround — confirm
	 * against the bspec before collapsing into one.
	 */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

/* Reload GPR0 from its saved slot in the context image. */
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

/* Re-load CMD_BUF_CCTL from the context image via GPR0 (render only). */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/* Per-context indirect-ctx workaround batch for gen12 render engines. */
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

/* Per-context indirect-ctx workaround batch for gen12 non-render engines. */
static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

/* Byte offset of the per-context wa batch page within the state object. */
static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

/* CPU address of the per-context wa batch page (requires a mapped state). */
static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

/*
 * Emit the per-context indirect batch via @emit, pad it to a cacheline,
 * and point the context image's INDIRECT_CTX registers at it.
 */
static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48:53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(ce->vm->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

/*
 * Refresh the ring registers (start/head/tail/ctl), RPCS and per-context
 * wa batch in the context image, returning the descriptor (with
 * FORCE_RESTORE set) to submit.
 */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

/* Rewrite the register offsets, e.g. after moving to another engine. */
void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

/*
 * Sanity-check (and repair) the critical registers of the context image
 * before/after submission; @when is used only for the warning message.
 */
void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

1207 /* 1208 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 1209 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 1210 * but there is a slight complication as this is applied in WA batch where the 1211 * values are only initialized once so we cannot take register value at the 1212 * beginning and reuse it further; hence we save its value to memory, upload a 1213 * constant value with bit21 set and then we restore it back with the saved value. 1214 * To simplify the WA, a constant value is formed by using the default value 1215 * of this register. This shouldn't be a problem because we are only modifying 1216 * it for a short period and this batch in non-premptible. We can ofcourse 1217 * use additional instructions that read the actual value of the register 1218 * at that time and set our bit of interest but it makes the WA complicated. 1219 * 1220 * This WA is also required for Gen9 so extracting as a function avoids 1221 * code duplication. 1222 */ 1223 static u32 * 1224 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 1225 { 1226 /* NB no one else is allowed to scribble over scratch + 256! 
*/ 1227 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1228 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1229 *batch++ = intel_gt_scratch_offset(engine->gt, 1230 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1231 *batch++ = 0; 1232 1233 *batch++ = MI_LOAD_REGISTER_IMM(1); 1234 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1235 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 1236 1237 batch = gen8_emit_pipe_control(batch, 1238 PIPE_CONTROL_CS_STALL | 1239 PIPE_CONTROL_DC_FLUSH_ENABLE, 1240 0); 1241 1242 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1243 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1244 *batch++ = intel_gt_scratch_offset(engine->gt, 1245 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1246 *batch++ = 0; 1247 1248 return batch; 1249 } 1250 1251 /* 1252 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 1253 * initialized at the beginning and shared across all contexts but this field 1254 * helps us to have multiple batches at different offsets and select them based 1255 * on a criteria. At the moment this batch always start at the beginning of the page 1256 * and at this point we don't have multiple wa_ctx batch buffers. 1257 * 1258 * The number of WA applied are not known at the beginning; we use this field 1259 * to return the no of DWORDS written. 1260 * 1261 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 1262 * so it adds NOOPs as padding to make it cacheline aligned. 1263 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1264 * makes a complete batch buffer. 
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/*
	 * Emit the gen8 (bdw/chv) indirect-context workaround batch:
	 * with arbitration disabled, flush coherent L3 lines (bdw only)
	 * and clear SLM space, then re-enable arbitration and pad out to
	 * a cacheline boundary. Returns the advanced write pointer.
	 */

	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

/* A single register/value pair to be written via MI_LOAD_REGISTER_IMM. */
struct lri {
	i915_reg_t reg;
	u32 value;
};

/*
 * Emit one MI_LOAD_REGISTER_IMM programming @count register/value pairs.
 * @count must be 1..63 to fit the command's dword-count field (enforced
 * by the GEM_BUG_ON below). A trailing MI_NOOP is emitted after the LRI;
 * presumably padding — confirm against the command streamer programming
 * notes before removing.
 */
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

/*
 * Gen9 indirect-context workaround batch: same arbitration-off /
 * flush / arbitration-on framing as gen8, plus a table of masked
 * chicken-register writes and, on pooled-EU parts (bxt/glk), a
 * MEDIA_POOL_STATE command.
 */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000; /* default 3x6 EU pool config, see above */
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

/*
 * Gen10 (cnl) indirect-context workaround batch: a lone CS-stall
 * PIPE_CONTROL (plus NOOP padding) to idle the engine before any
 * 3DSTATE_SAMPLE_PATTERN emitted on context restore.
 */
static u32 *
gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	int i;

	/*
	 * WaPipeControlBefore3DStateSamplePattern: cnl
	 *
	 * Ensure the engine is idle prior to programming a
	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
	 */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL,
				       0);
	/*
	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
	 * confusing. Since gen8_emit_pipe_control() already advances the
	 * batch by 6 dwords, we advance the other 10 here, completing a
	 * cacheline. It's not clear if the workaround requires this padding
	 * before other commands, or if it's just the regular padding we would
	 * already have for the workaround bb, so leave it here for now
 */
	for (i = 0; i < 10; i++)
		*batch++ = MI_NOOP;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

/* One page backs both the indirect_ctx and per_ctx workaround batches. */
#define CTX_WA_BB_SIZE (PAGE_SIZE)

/*
 * Allocate the shmem object and GGTT vma backing the engine's workaround
 * context batches, storing the vma in engine->wa_ctx. Returns 0 on success
 * or a negative errno; on failure nothing is left allocated.
 */
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	/* The object is the only resource held at this point. */
	i915_gem_object_put(obj);
	return err;
}

/* Release the pin and reference taken on the workaround context vma. */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

/* Emitter callback: writes one workaround batch, returns advanced pointer. */
typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

/*
 * Build the per-engine workaround context batch buffers (render class
 * only). Gen8-10 get a gen-specific indirect_ctx batch; gen11+ need
 * none. Failure is non-fatal: the error is logged and the engine simply
 * runs without the workaround batches.
 */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (engine->class != RENDER_CLASS)
		return;

	switch (INTEL_GEN(engine->i915)) {
	case 12:
	case 11:
		return;
	case 10:
		wa_bb_fn[0] = gen10_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * Continue even when the WA batch fails to initialize:
		 * we only expect rare glitches without it, nothing
		 * critical enough to stop us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	/* Lock + pin under a ww context so we can back off on -EDEADLK. */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		/* Each batch must start on a cacheline boundary. */
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		/* Drop the vma (and its object) taken by lrc_create_wa_ctx(). */
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

/* Selftest-only underflow bookkeeping; compiled out in production builds. */
static void
st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

/*
 * Fold the context's latest runtime sample into its running total and
 * EWMA. The delta is computed in u32 and interpreted as s32; a negative
 * result is treated as an underflow (traced and recorded for selftests)
 * rather than accumulated. Barrier contexts are skipped.
 */
void lrc_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = lrc_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif