// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (INTEL_GEN(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (INTEL_GEN(engine->i915) >= 10)
			*regs |= BIT(0);
	}
}
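/*
 * The tables below are an encoded description of the register state written
 * by set_offsets(): NOP(x) skips x dwords in the context image, LRI(count,
 * flags) emits an MI_LOAD_REGISTER_IMM(count) header (POSTED sets
 * MI_LRI_FORCE_POSTED), REG(x) encodes the register offset x as a single
 * byte (x < 0x200), REG16(x) encodes it as two bytes using BIT(7) of the
 * first byte as a continuation marker, and END (0) terminates the table.
 * Only the register offsets are written; the value slots are left untouched
 * here and are filled in from the default state or by the hardware.
 */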
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for a virtual engine.
	 */
	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (INTEL_GEN(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (INTEL_GEN(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}
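/*
 * The lrc_ring_*() helpers below return the dword index of a register's
 * MI_LRI offset slot within the context image (the corresponding value
 * lives at index + 1), or -1 if that register is not present in the
 * context layout for this engine/generation.
 */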
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x60;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x74;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x12;
	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (engine->class != RENDER_CLASS)
		return -1;

	if (INTEL_GEN(engine->i915) >= 12)
		return 0xb6;
	else if (INTEL_GEN(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 10:
		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (INTEL_GEN(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->runtime.last;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}
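/*
 * Note on the encodings above: the "| 0x01" sets the low bit that marks the
 * per-context batch pointer as valid, while lrc_setup_indirect_ctx() packs
 * the cacheline-aligned GGTT address of the indirect batch together with
 * its length expressed in cachelines into a single dword.
 */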
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (INTEL_GEN(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}
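/*
 * Note: __lrc_alloc_state() above sizes the context object as the engine's
 * context image rounded up to a GTT page, plus an extra page for the
 * redzone under CONFIG_DRM_I915_DEBUG_GEM and, on gen12, one more page for
 * the per-context workaround batch (recorded in ce->wa_bb_page). It prefers
 * local memory and falls back to shmem when the LMEM allocation fails.
 */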
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}
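/*
 * The gen12_emit_*_wa() helpers above and below build the per-context
 * indirect workaround batch: each one loads the value saved in the context
 * image into CS GPR0 with MI_LOAD_REGISTER_MEM and then copies it into the
 * live register with MI_LOAD_REGISTER_REG, while gen12_emit_restore_scratch()
 * finally reloads GPR0 itself from its saved slot.
 */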
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}
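/*
 * Note: on gen12 the per-context indirect workaround batch is written into
 * the trailing page of the context object (ce->wa_bb_page, allocated in
 * __lrc_alloc_state()) and padded with MI_NOOPs up to a cacheline boundary,
 * since lrc_setup_indirect_ctx() expresses the batch length in cachelines.
 */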
/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(ce->vm->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication as this is applied in a WA
 * batch where the values are only initialized once, so we cannot read the
 * register value at the beginning and reuse it later; hence we save its
 * value to memory, upload a constant value with bit 21 set and then restore
 * it back with the saved value. To simplify the WA, a constant value is
 * formed by using the default value of this register. This shouldn't be a
 * problem because we are only modifying it for a short period and this
 * batch is non-preemptible. We could of course use additional instructions
 * that read the actual value of the register at that time and set our bit
 * of interest, but that makes the WA more complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on the device type (2x6 or 3x6) and needs to be updated
		 * based on which subslice is disabled, especially for 2x6
		 * devices. However, it is safe to load the default
		 * configuration of a 3x6 device instead of masking off the
		 * corresponding bits, because the HW ignores bits of a
		 * disabled subslice and drops down to the appropriate
		 * config. Please see render_state_setup() in
		 * i915_gem_render_state.c for the possible configurations;
		 * to avoid duplication they are not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

static u32 *
gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	int i;

	/*
	 * WaPipeControlBefore3DStateSamplePattern: cnl
	 *
	 * Ensure the engine is idle prior to programming a
	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
	 */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL,
				       0);
	/*
	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
	 * confusing. Since gen8_emit_pipe_control() already advances the
	 * batch by 6 dwords, we advance the other 10 here, completing a
	 * cacheline. It's not clear if the workaround requires this padding
	 * before other commands, or if it's just the regular padding we would
	 * already have for the workaround bb, so leave it here for now.
	 */
	for (i = 0; i < 10; i++)
		*batch++ = MI_NOOP;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
	if (err)
		goto err;

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);

	/* Called on error unwind, clear all flags to prevent further use */
	memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (engine->class != RENDER_CLASS)
		return;

	switch (INTEL_GEN(engine->i915)) {
	case 12:
	case 11:
		return;
	case 10:
		wa_bb_fn[0] = gen10_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		return;
	}

	err = lrc_setup_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (err || i915_inject_probe_error(engine->i915, -ENODEV))
		lrc_fini_wa_ctx(engine);
}

static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

void lrc_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = lrc_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif