// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}
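
/*
 * The per-platform tables below are the byte-code consumed by set_offsets()
 * above: NOP(n) skips n dwords of the image, LRI(count, flags) emits an
 * MI_LOAD_REGISTER_IMM(count) header, each following entry names a register
 * offset relative to engine->mmio_base, and END terminates the table.
 * A short worked decode, following the loops above:
 *
 *	REG(0x034)   -> single byte 0x0d; offset = 0x0d << 2 = 0x034
 *	REG16(0x244) -> bytes 0x81, 0x11; offset = ((0x1 << 7) | 0x11) << 2
 *			= 0x244
 *
 * Only the register offsets are written here (hence "regs += 2" per
 * register); the value dwords in between come either from the engine's
 * default state image or from the init_*_regs() helpers further below.
 */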

static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for virtual engines.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}
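
/*
 * The lrc_ring_*() helpers above return the dword index of a register's
 * MI_LRI slot within the saved context image, or -1 when the platform has
 * no such slot. set_offsets() writes the register offset at regs[x] and
 * leaves regs[x + 1] for the value, which is why users below always program
 * regs[x + 1]. For example, on Gen12 lrc_ring_wa_bb_per_ctx() returns 0x12,
 * and the indirect-ctx pointer and offset slots follow two and four dwords
 * later, per lrc_ring_indirect_ptr()/lrc_ring_indirect_offset().
 */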

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}
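
/*
 * Note on the encoding above: the indirect-ctx pointer slot packs the GGTT
 * address of the batch together with its length in cachelines (hence the
 * CACHELINE_BYTES alignment requirement), while the companion offset slot
 * is loaded with the per-gen default constant shifted into its field
 * (<< 6). lrc_setup_indirect_ctx() is used both for the engine-wide wa_ctx
 * batch (init_wa_bb_regs()) and for the per-context Gen12 batch
 * (setup_indirect_ctx_bb()); lrc_update_regs() asserts the two are never
 * used together.
 */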

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->runtime.last;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/*
		 * 64b PPGTT (48bit canonical):
		 * PDP0_DESCRIPTOR contains the base address of the PML4;
		 * the other PDP descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batch buffer with
	 * new values (including all the missing MI_LOAD_REGISTER_IMM commands
	 * that we are not initializing here).
	 *
	 * This must be kept consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}
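
/*
 * Rough layout of the context object assembled below: page 0 is the
 * per-process HWSP (cleared in lrc_init_state() above), the register state
 * starts at LRC_STATE_OFFSET, and, where used, a debug redzone page, a
 * Gen12 per-context workaround batch page and a GuC parallel-submission
 * scratch area are appended after engine->context_size.
 */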

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}
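
/*
 * The helpers below build the per-context indirect workaround batch used on
 * Gen12: a register whose value the hardware does not restore correctly is
 * re-read from its saved slot in the context image into CS GPR0 with
 * MI_LOAD_REGISTER_MEM, copied into the live register with
 * MI_LOAD_REGISTER_REG, and finally GPR0 itself is reloaded from the image
 * (gen12_emit_restore_scratch()) so the workaround leaves no side effects.
 */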

static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit  38:       mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * The engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}
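
/*
 * A worked example of the lower dword computed by lrc_descriptor() above:
 * for a context using a 4-level ppGTT, desc starts as
 * INTEL_LEGACY_64B_CONTEXT shifted into the addressing-mode field, then
 * GEN8_CTX_VALID and GEN8_CTX_PRIVILEGE are OR'ed in, and the page-aligned
 * GGTT offset of the state object occupies bits 12 and up (the LRCA). The
 * upper dword (SW context ID, engine class/instance) is filled in by the
 * submission backend, not here; lrc_update_regs() additionally ORs in
 * CTX_DESC_FORCE_RESTORE when it refreshes the cached value.
 */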

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication: this is applied in a WA
 * batch where the values are only initialized once, so we cannot read the
 * register value at the beginning and reuse it later. Hence we save its
 * value to memory, upload a constant value with bit 21 set and then restore
 * it with the saved value. To simplify the WA, a constant value is formed
 * from the default value of this register. This shouldn't be a problem
 * because we are only modifying it for a short period and this batch is
 * non-preemptible. We could of course use additional instructions that read
 * the actual value of the register at that time and set our bit of
 * interest, but that makes the WA more complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and one per_ctx batch buffer,
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and to select them
 * based on some criteria. At the moment the batch always starts at the
 * beginning of the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known up front; we use this field to
 * return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds
 * NOOPs as padding to make it cacheline aligned. MI_BATCH_BUFFER_END will be
 * added to the per-ctx batch, and the two together make a complete batch
 * buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX.
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on the device type (2x6 or 3x6) and needs to be updated
		 * based on which subslice is disabled, especially for 2x6
		 * devices. However, it is safe to load the default
		 * configuration of a 3x6 device instead of masking off the
		 * corresponding bits, because the HW ignores bits of a
		 * disabled subslice and drops down to the appropriate config.
		 * Please see render_state_setup() in i915_gem_render_state.c
		 * for the possible configurations; to avoid duplication they
		 * are not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}
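
/*
 * lrc_init_wa_ctx() below packs the engine's context-switch workaround
 * batches into the single CTX_WA_BB_SIZE page allocated above: the
 * indirect-ctx batch (run by the CS at context restore via the INDIRECT_CTX
 * registers programmed in init_wa_bb_regs()) and an optional per-ctx batch
 * (currently unused, wa_bb_fn[1] stays NULL). Only Gen8/Gen9 render engines
 * install anything here; Gen11 and Gen12 return without doing so, and Gen12
 * instead uses the per-context batch set up in lrc_update_regs().
 */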

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (engine->class != RENDER_CLASS)
		return;

	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to set up the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

void lrc_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = lrc_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif