// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

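/*
 * The per-engine tables below are byte encoded and consumed by set_offsets()
 * above: a byte with BIT(7) set skips that many dwords of the context image
 * (NOP), otherwise the low 6 bits give the number of registers in an
 * MI_LOAD_REGISTER_IMM block and the top 2 bits its flags (POSTED). Each
 * register then follows as a sequence of 7-bit groups (BIT(7) marking a
 * continuation byte) encoding its dword offset from engine->mmio_base; the
 * value slots are left untouched for the default context image to provide.
 */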
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

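/*
 * The lrc_ring_*() helpers below return the dword index of a register's
 * entry within the context image; the value programmed for that register
 * lives at the returned index + 1. A return of -1 means the register has
 * no slot in this engine's context layout.
 */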
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->runtime.last;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

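/*
 * The gen12_emit_*() helpers below fill the per-context indirect context
 * buffer (wa_bb_page): run as part of the indirect context on restore, they
 * reload CTX_TIMESTAMP (and, for the render engine, CMD_BUF_CCTL) from the
 * values saved in the context image, using CS GPR0 as a scratch register,
 * and then restore GPR0 itself from the image.
 */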
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit  38:       mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication as this is applied in a WA
 * batch where the values are only initialized once, so we cannot take the
 * register value at the beginning and reuse it further; hence we save its
 * value to memory, upload a constant value with bit21 set and then restore
 * it back with the saved value. To simplify the WA, a constant value is
 * formed by using the default value of this register. This shouldn't be a
 * problem because we are only modifying it for a short period and this
 * batch is non-preemptible. We could of course use additional instructions
 * that read the actual value of the register at that time and set our bit
 * of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them
 * based on a criteria. At the moment this batch always starts at the
 * beginning of the page and at this point we don't have multiple wa_ctx
 * batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and the two
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX.
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default 3x6 configuration
		 * instead of masking off the corresponding bits, because HW
		 * ignores bits of a disabled subslice and drops down to the
		 * appropriate config. Please see render_state_setup() in
		 * i915_gem_render_state.c for the possible configurations;
		 * to avoid duplication they are not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (engine->class != RENDER_CLASS)
		return;

	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from
	 * the start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to set up the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

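/*
 * Accumulate the runtime recorded by the GPU in the context image
 * (CTX_TIMESTAMP). The delta since the previous sample is computed in
 * unsigned 32bit ticks; an apparently negative delta means the saved
 * timestamp cannot be trusted, so it is discarded and only noted by the
 * selftest counters above.
 */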
void lrc_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = lrc_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif