// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * set_offsets - expand a byte-encoded register table into a register state
 * @regs: destination array of dwords, advanced as the table is decoded
 * @data: table built from the NOP()/LRI()/REG()/REG16() macros below,
 *        terminated by END (a zero byte)
 * @engine: supplies mmio_base, which is added to every register offset
 * @close: if true, terminate the result with MI_BATCH_BUFFER_END
 *
 * Encoding, as decoded by the loop below:
 *  - a byte with bit 7 set skips forward by (byte & 0x7f) dwords in @regs;
 *  - any other non-zero byte is a header: bits 0-5 are the register count,
 *    bits 6-7 are flags (only POSTED is tested). It emits one
 *    MI_LOAD_REGISTER_IMM(count) dword, followed by @count register entries;
 *  - each register entry is one or more bytes carrying 7 bits each, most
 *    significant group first, with bit 7 marking "more bytes follow". The
 *    decoded number is a dword offset (it is shifted left by 2).
 *
 * Only the offset dword of every (offset, value) pair is written; the value
 * dword is stepped over and left as it was.
 *
 * The encoding macros are deliberately defined between the declarator and the
 * function body so that they sit right next to the decoder.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI header byte: count in bits 0-5, flags in bits 6-7 */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/* Accumulate 7 bits per byte until bit 7 is clear */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Write the offset slot, step over the value slot */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

/*
 * Register tables in set_offsets() encoding. reg_offsets() picks one per
 * engine: the *_rcs_* tables for engines flagged
 * I915_ENGINE_HAS_RCS_REG_STATE, the *_xcs_* tables for all other engines.
 */

/* Engines without RCS register state, graphics version below 9 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

/* Engines without RCS register state, graphics version 9 up to (not incl.) 12 */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

/* Engines without RCS register state, graphics version 12 up to (not incl.) 12.55 */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

/* Engines without RCS register state, graphics version 12.55 and later */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

/* Engines with RCS register state, graphics version below 9 */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* Engines with RCS register state, graphics version 9 up to (not incl.) 11 */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

/* Engines with RCS register state, graphics version 11 */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* Engines with RCS register state, graphics version 12 up to (not incl.) 12.50 */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

/* Engines with RCS register state, graphics version 12.50 up to (not incl.) 12.55 */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* Engines with RCS register state, graphics version 12.55 and later */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* The table-building macros are not needed past this point */
#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

/*
 * Select the register table for @engine. Engines flagged
 * I915_ENGINE_HAS_RCS_REG_STATE get one of the *_rcs_offsets tables, all
 * others one of the *_xcs_offsets tables; within each family the newest
 * table whose version threshold is met wins.
 */
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

/*
 * The lrc_ring_*() helpers below return a dword index into the context
 * register state, or -1 when the engine/version combination has no such
 * slot. Callers in this file access the value dword at (index + 1).
 */

/*
 * Index of the slot whose STOP_RING bit __reset_stop_ring() clears
 * (presumably RING_MI_MODE, going by the name), or -1.
 */
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

/* Index of the slot that init_common_regs() zeroes, or -1 */
static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x80;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x64;
	else if (GRAPHICS_VER(engine->i915) >= 8 &&
		 engine->class == RENDER_CLASS)
		return 0xc4;
	else
		return -1;
}

/* Index of the slot gen12_emit_restore_scratch() reloads GPR0 from, or -1 */
static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

/* Index of the per-context workaround batch pointer slot, or -1 */
static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

/*
 * Index of the indirect-context pointer slot: two dwords after the
 * per-context workaround batch slot, or the same negative value if that
 * slot does not exist.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

/*
 * Index of the indirect-context offset slot: two dwords after the
 * indirect-context pointer slot, or the same negative value if that slot
 * does not exist.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

/* Index of the slot gen12_emit_cmd_buf_wa() reads CMD_BUF_CCTL from, or -1 */
static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{

	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

/*
 * Per-version default for the indirect context offset. Versions without an
 * explicit case are reported through MISSING_CASE() and then use the
 * version 12 value.
 */
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

/*
 * Point the context at an indirect context batch.
 *
 * Writes @ctx_bb_ggtt_addr combined with the batch length in cachelines
 * into the indirect pointer slot, and the per-version default offset
 * (shifted left by 6) into the indirect offset slot. @size must be non-zero
 * and a multiple of CACHELINE_BYTES, and the engine must have both slots.
 */
static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

/*
 * Program the register values that do not depend on the address space:
 * the context control word, the saved timestamp (taken from
 * ce->stats.runtime.last) and, where the slot exists, a zero in the
 * lrc_ring_bb_offset() value.
 *
 * @inhibit additionally sets CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT in the
 * control word.
 */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}

/*
 * Hook up the engine-wide workaround batches (engine->wa_ctx), if any were
 * built: the per-context batch pointer and the indirect context batch.
 */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* NOTE(review): bit 0 is presumably the "valid" flag - confirm */
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

/*
 * Write the page-directory registers for @ppgtt: the PML4 base for a
 * 4-level address space, otherwise all four PDP entries.
 */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

/* Resolve @vm to a ppgtt: the GGTT's alias if @vm is the GGTT, else @vm itself */
static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

/*
 * Clear STOP_RING in the saved lrc_ring_mi_mode() value and set the same
 * bit in the upper 16 bits. Does nothing when the engine has no such slot.
 * NOTE(review): the upper half is presumably the write-enable mask of a
 * masked register - confirm.
 */
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

/*
 * Build the register state at @regs for context @ce on @engine.
 *
 * With @inhibit set the first PAGE_SIZE bytes of @regs are zeroed and the
 * result is terminated as a batch (see set_offsets()'s @close).
 */
static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

/* Rebuild the register state of @ce in place (at ce->lrc_reg_state) */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

/* Only clear the STOP_RING state of @ce; see __reset_stop_ring() */
void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

/*
 * Fill the page that follows the context image with CONTEXT_REDZONE.
 * Compiled out unless CONFIG_DRM_I915_DEBUG_GEM is enabled.
 */
static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

/*
 * Verify the redzone written by set_redzone() is intact and log an error
 * (once) if it is not. Compiled out unless CONFIG_DRM_I915_DEBUG_GEM is
 * enabled.
 */
static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

/* Byte offset of the context's workaround batch page within its state object */
static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

/*
 * CPU pointer to the context's workaround batch page, derived from the
 * mapped register state. Only valid for contexts with a wa_bb_page.
 */
static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

/*
 * Initialise a freshly mapped context image at @state.
 *
 * If the engine has a default state it is copied in and the context is
 * marked valid; otherwise the register state is built with restore
 * inhibited. The first page and (if present) the workaround batch page are
 * always cleared after the copy, before the registers are written.
 */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

/* GGTT address of the context's workaround batch page */
u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}

/*
 * Emit, at @cs, the small batch that stores into the
 * DG2_PREDICATE_RESULT_WA dword of the workaround page and disables
 * predication. Returns the pointer just past the last emitted dword.
 */
static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}

/*
 * Allocate the backing object for the context image and wrap it in a GGTT
 * vma.
 *
 * The size is the page-rounded engine context size, plus one page for the
 * redzone on debug builds, plus one page for the workaround batch on
 * graphics version 12 (recorded in ce->wa_bb_page), plus
 * PARENT_SCRATCH_SIZE for GuC parent contexts (recorded in
 * ce->parallel.guc.parent_page). Local memory is tried first, with shmem as
 * the fallback.
 *
 * Returns the vma or an ERR_PTR; the object is released if creating the vma
 * fails.
 */
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

/*
 * Take the value stored in ce->timeline (clearing the field), strip its
 * mask bits and create an engine-backed timeline from what remains.
 * NOTE(review): the stored value is presumably an encoded HWSP offset
 * rather than a real pointer - confirm against the caller that sets it.
 */
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

/*
 * Allocate everything a context needs before it can be pinned: the state
 * vma, the ring and, unless ce->timeline already holds a real timeline, a
 * timeline.
 *
 * Returns 0 on success with ce->ring, ce->state (and possibly ce->timeline)
 * set, or a negative error code with the ring and vma released again.
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

/*
 * Reset a pinned context: rewind its ring to ring->emit, rebuild the
 * register state with restore inhibited and recompute the descriptor.
 */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

/*
 * Map the (already pinned) context state object into the kernel and return
 * the mapping through @vaddr. Returns 0 or the error encoded in the
 * mapping pointer. @engine and @ww are not used here.
 */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

/*
 * Finish pinning: record where the register state lives in the mapping,
 * initialise the image the first time the context is pinned
 * (CONTEXT_INIT_BIT) and refresh the descriptor. Always returns 0.
 */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

/*
 * Drop the reference on ce->parallel.last_rq, if one is held, and check the
 * debug redzone behind the context image.
 */
void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

/* Release the kernel mapping taken in lrc_pre_pin() */
void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

/*
 * Release the ring and state vma of @ce and clear both fields. Does nothing
 * if the context never had state allocated.
 */
void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

/*
 * kref release callback for the context embedding @kref: the context must
 * be idle and unpinned; its resources are released and the context freed.
 */
void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

/*
 * Emit commands that load the saved CTX_TIMESTAMP dword from the context
 * image (via the GGTT) into GPR0 and then copy GPR0 into
 * RING_CTX_TIMESTAMP. The register-to-register copy is emitted twice; the
 * reason is not visible in this file.
 */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

/*
 * Emit a command that reloads GPR0 from its saved slot in the context
 * image, undoing the use of GPR0 as scratch by the workarounds above.
 */
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

/*
 * Emit commands that load the saved CMD_BUF_CCTL value from the context
 * image into GPR0 and then copy GPR0 into RING_CMD_BUF_CCTL.
 */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/*
 * On DG2 during context restore of a preempted context in GPGPU mode,
 * RCS restore hang is detected. This is extremely timing dependent.
 * To address this below sw wabb is implemented for DG2 A steppings.
 */
static u32 *
dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
	*cs++ = 0x21;

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);

	return cs;
}

/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF.  However this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via INDIRECT_CTX
 * batch buffer to ensure the value takes effect properly.  All other bits
 * in this register should remain at 0 (the hardware default).
1271 */ 1272 static u32 * 1273 dg2_emit_draw_watermark_setting(u32 *cs) 1274 { 1275 *cs++ = MI_LOAD_REGISTER_IMM(1); 1276 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK); 1277 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF); 1278 1279 return cs; 1280 } 1281 1282 static u32 * 1283 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 1284 { 1285 cs = gen12_emit_timestamp_wa(ce, cs); 1286 cs = gen12_emit_cmd_buf_wa(ce, cs); 1287 cs = gen12_emit_restore_scratch(ce, cs); 1288 1289 /* Wa_22011450934:dg2 */ 1290 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) || 1291 IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0)) 1292 cs = dg2_emit_rcs_hang_wabb(ce, cs); 1293 1294 /* Wa_16013000631:dg2 */ 1295 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) || 1296 IS_DG2_G11(ce->engine->i915)) 1297 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); 1298 1299 /* hsdes: 1809175790 */ 1300 if (!HAS_FLAT_CCS(ce->engine->i915)) 1301 cs = gen12_emit_aux_table_inv(ce->engine->gt, 1302 cs, GEN12_GFX_CCS_AUX_NV); 1303 1304 /* Wa_16014892111 */ 1305 if (IS_DG2(ce->engine->i915)) 1306 cs = dg2_emit_draw_watermark_setting(cs); 1307 1308 return cs; 1309 } 1310 1311 static u32 * 1312 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 1313 { 1314 cs = gen12_emit_timestamp_wa(ce, cs); 1315 cs = gen12_emit_restore_scratch(ce, cs); 1316 1317 /* Wa_16013000631:dg2 */ 1318 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) || 1319 IS_DG2_G11(ce->engine->i915)) 1320 if (ce->engine->class == COMPUTE_CLASS) 1321 cs = gen8_emit_pipe_control(cs, 1322 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 1323 0); 1324 1325 /* hsdes: 1809175790 */ 1326 if (!HAS_FLAT_CCS(ce->engine->i915)) { 1327 if (ce->engine->class == VIDEO_DECODE_CLASS) 1328 cs = gen12_emit_aux_table_inv(ce->engine->gt, 1329 cs, GEN12_VD0_AUX_NV); 1330 else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS) 1331 cs = 
gen12_emit_aux_table_inv(ce->engine->gt, 1332 cs, GEN12_VE0_AUX_NV); 1333 } 1334 1335 return cs; 1336 } 1337 1338 static void 1339 setup_indirect_ctx_bb(const struct intel_context *ce, 1340 const struct intel_engine_cs *engine, 1341 u32 *(*emit)(const struct intel_context *, u32 *)) 1342 { 1343 u32 * const start = context_indirect_bb(ce); 1344 u32 *cs; 1345 1346 cs = emit(ce, start); 1347 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 1348 while ((unsigned long)cs % CACHELINE_BYTES) 1349 *cs++ = MI_NOOP; 1350 1351 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1352 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1353 1354 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine, 1355 lrc_indirect_bb(ce), 1356 (cs - start) * sizeof(*cs)); 1357 } 1358 1359 /* 1360 * The context descriptor encodes various attributes of a context, 1361 * including its GTT address and some flags. Because it's fairly 1362 * expensive to calculate, we'll just do it once and cache the result, 1363 * which remains valid until the context is unpinned. 
*
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * Engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
1392 */ 1393 static u32 lrc_descriptor(const struct intel_context *ce) 1394 { 1395 u32 desc; 1396 1397 desc = INTEL_LEGACY_32B_CONTEXT; 1398 if (i915_vm_is_4lvl(ce->vm)) 1399 desc = INTEL_LEGACY_64B_CONTEXT; 1400 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 1401 1402 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 1403 if (GRAPHICS_VER(ce->vm->i915) == 8) 1404 desc |= GEN8_CTX_L3LLC_COHERENT; 1405 1406 return i915_ggtt_offset(ce->state) | desc; 1407 } 1408 1409 u32 lrc_update_regs(const struct intel_context *ce, 1410 const struct intel_engine_cs *engine, 1411 u32 head) 1412 { 1413 struct intel_ring *ring = ce->ring; 1414 u32 *regs = ce->lrc_reg_state; 1415 1416 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 1417 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 1418 1419 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1420 regs[CTX_RING_HEAD] = head; 1421 regs[CTX_RING_TAIL] = ring->tail; 1422 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1423 1424 /* RPCS */ 1425 if (engine->class == RENDER_CLASS) { 1426 regs[CTX_R_PWR_CLK_STATE] = 1427 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 1428 1429 i915_oa_init_reg_state(ce, engine); 1430 } 1431 1432 if (ce->wa_bb_page) { 1433 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 1434 1435 fn = gen12_emit_indirect_ctx_xcs; 1436 if (ce->engine->class == RENDER_CLASS) 1437 fn = gen12_emit_indirect_ctx_rcs; 1438 1439 /* Mutually exclusive wrt to global indirect bb */ 1440 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 1441 setup_indirect_ctx_bb(ce, engine, fn); 1442 } 1443 1444 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE; 1445 } 1446 1447 void lrc_update_offsets(struct intel_context *ce, 1448 struct intel_engine_cs *engine) 1449 { 1450 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false); 1451 } 1452 1453 void lrc_check_regs(const struct intel_context *ce, 1454 const struct intel_engine_cs *engine, 1455 const char *when) 1456 { 1457 const struct intel_ring *ring = ce->ring; 1458 u32 
*regs = ce->lrc_reg_state; 1459 bool valid = true; 1460 int x; 1461 1462 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1463 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1464 engine->name, 1465 regs[CTX_RING_START], 1466 i915_ggtt_offset(ring->vma)); 1467 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1468 valid = false; 1469 } 1470 1471 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1472 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1473 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1474 engine->name, 1475 regs[CTX_RING_CTL], 1476 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1477 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1478 valid = false; 1479 } 1480 1481 x = lrc_ring_mi_mode(engine); 1482 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1483 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1484 engine->name, regs[x + 1]); 1485 regs[x + 1] &= ~STOP_RING; 1486 regs[x + 1] |= STOP_RING << 16; 1487 valid = false; 1488 } 1489 1490 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1491 } 1492 1493 /* 1494 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 1495 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 1496 * but there is a slight complication as this is applied in WA batch where the 1497 * values are only initialized once so we cannot take register value at the 1498 * beginning and reuse it further; hence we save its value to memory, upload a 1499 * constant value with bit21 set and then we restore it back with the saved value. 1500 * To simplify the WA, a constant value is formed by using the default value 1501 * of this register. This shouldn't be a problem because we are only modifying 1502 * it for a short period and this batch in non-premptible. 
We can ofcourse 1503 * use additional instructions that read the actual value of the register 1504 * at that time and set our bit of interest but it makes the WA complicated. 1505 * 1506 * This WA is also required for Gen9 so extracting as a function avoids 1507 * code duplication. 1508 */ 1509 static u32 * 1510 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 1511 { 1512 /* NB no one else is allowed to scribble over scratch + 256! */ 1513 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1514 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1515 *batch++ = intel_gt_scratch_offset(engine->gt, 1516 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1517 *batch++ = 0; 1518 1519 *batch++ = MI_LOAD_REGISTER_IMM(1); 1520 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1521 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 1522 1523 batch = gen8_emit_pipe_control(batch, 1524 PIPE_CONTROL_CS_STALL | 1525 PIPE_CONTROL_DC_FLUSH_ENABLE, 1526 0); 1527 1528 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1529 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1530 *batch++ = intel_gt_scratch_offset(engine->gt, 1531 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1532 *batch++ = 0; 1533 1534 return batch; 1535 } 1536 1537 /* 1538 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 1539 * initialized at the beginning and shared across all contexts but this field 1540 * helps us to have multiple batches at different offsets and select them based 1541 * on a criteria. At the moment this batch always start at the beginning of the page 1542 * and at this point we don't have multiple wa_ctx batch buffers. 1543 * 1544 * The number of WA applied are not known at the beginning; we use this field 1545 * to return the no of DWORDS written. 1546 * 1547 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 1548 * so it adds NOOPs as padding to make it cacheline aligned. 
1549 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1550 * makes a complete batch buffer. 1551 */ 1552 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 1553 { 1554 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1555 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 1556 1557 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 1558 if (IS_BROADWELL(engine->i915)) 1559 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 1560 1561 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 1562 /* Actual scratch location is at 128 bytes offset */ 1563 batch = gen8_emit_pipe_control(batch, 1564 PIPE_CONTROL_FLUSH_L3 | 1565 PIPE_CONTROL_STORE_DATA_INDEX | 1566 PIPE_CONTROL_CS_STALL | 1567 PIPE_CONTROL_QW_WRITE, 1568 LRC_PPHWSP_SCRATCH_ADDR); 1569 1570 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1571 1572 /* Pad to end of cacheline */ 1573 while ((unsigned long)batch % CACHELINE_BYTES) 1574 *batch++ = MI_NOOP; 1575 1576 /* 1577 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 1578 * execution depends on the length specified in terms of cache lines 1579 * in the register CTX_RCS_INDIRECT_CTX 1580 */ 1581 1582 return batch; 1583 } 1584 1585 struct lri { 1586 i915_reg_t reg; 1587 u32 value; 1588 }; 1589 1590 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 1591 { 1592 GEM_BUG_ON(!count || count > 63); 1593 1594 *batch++ = MI_LOAD_REGISTER_IMM(count); 1595 do { 1596 *batch++ = i915_mmio_reg_offset(lri->reg); 1597 *batch++ = lri->value; 1598 } while (lri++, --count); 1599 *batch++ = MI_NOOP; 1600 1601 return batch; 1602 } 1603 1604 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 1605 { 1606 static const struct lri lri[] = { 1607 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 1608 { 1609 COMMON_SLICE_CHICKEN2, 1610 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 1611 0), 1612 }, 1613 1614 /* BSpec: 11391 */ 1615 { 1616 
FF_SLICE_CHICKEN, 1617 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 1618 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 1619 }, 1620 1621 /* BSpec: 11299 */ 1622 { 1623 _3D_CHICKEN3, 1624 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 1625 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 1626 } 1627 }; 1628 1629 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 1630 1631 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 1632 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 1633 1634 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 1635 batch = gen8_emit_pipe_control(batch, 1636 PIPE_CONTROL_FLUSH_L3 | 1637 PIPE_CONTROL_STORE_DATA_INDEX | 1638 PIPE_CONTROL_CS_STALL | 1639 PIPE_CONTROL_QW_WRITE, 1640 LRC_PPHWSP_SCRATCH_ADDR); 1641 1642 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 1643 1644 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 1645 if (HAS_POOLED_EU(engine->i915)) { 1646 /* 1647 * EU pool configuration is setup along with golden context 1648 * during context initialization. This value depends on 1649 * device type (2x6 or 3x6) and needs to be updated based 1650 * on which subslice is disabled especially for 2x6 1651 * devices, however it is safe to load default 1652 * configuration of 3x6 device instead of masking off 1653 * corresponding bits because HW ignores bits of a disabled 1654 * subslice and drops down to appropriate config. Please 1655 * see render_state_setup() in i915_gem_render_state.c for 1656 * possible configurations, to avoid duplication they are 1657 * not shown here again. 
1658 */ 1659 *batch++ = GEN9_MEDIA_POOL_STATE; 1660 *batch++ = GEN9_MEDIA_POOL_ENABLE; 1661 *batch++ = 0x00777000; 1662 *batch++ = 0; 1663 *batch++ = 0; 1664 *batch++ = 0; 1665 } 1666 1667 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1668 1669 /* Pad to end of cacheline */ 1670 while ((unsigned long)batch % CACHELINE_BYTES) 1671 *batch++ = MI_NOOP; 1672 1673 return batch; 1674 } 1675 1676 #define CTX_WA_BB_SIZE (PAGE_SIZE) 1677 1678 static int lrc_create_wa_ctx(struct intel_engine_cs *engine) 1679 { 1680 struct drm_i915_gem_object *obj; 1681 struct i915_vma *vma; 1682 int err; 1683 1684 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE); 1685 if (IS_ERR(obj)) 1686 return PTR_ERR(obj); 1687 1688 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 1689 if (IS_ERR(vma)) { 1690 err = PTR_ERR(vma); 1691 goto err; 1692 } 1693 1694 engine->wa_ctx.vma = vma; 1695 return 0; 1696 1697 err: 1698 i915_gem_object_put(obj); 1699 return err; 1700 } 1701 1702 void lrc_fini_wa_ctx(struct intel_engine_cs *engine) 1703 { 1704 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 1705 } 1706 1707 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 1708 1709 void lrc_init_wa_ctx(struct intel_engine_cs *engine) 1710 { 1711 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 1712 struct i915_wa_ctx_bb *wa_bb[] = { 1713 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx 1714 }; 1715 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)]; 1716 struct i915_gem_ww_ctx ww; 1717 void *batch, *batch_ptr; 1718 unsigned int i; 1719 int err; 1720 1721 if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)) 1722 return; 1723 1724 switch (GRAPHICS_VER(engine->i915)) { 1725 case 12: 1726 case 11: 1727 return; 1728 case 9: 1729 wa_bb_fn[0] = gen9_init_indirectctx_bb; 1730 wa_bb_fn[1] = NULL; 1731 break; 1732 case 8: 1733 wa_bb_fn[0] = gen8_init_indirectctx_bb; 1734 wa_bb_fn[1] = NULL; 1735 break; 1736 default: 1737 MISSING_CASE(GRAPHICS_VER(engine->i915)); 1738 return; 1739 } 1740 1741 
err = lrc_create_wa_ctx(engine); 1742 if (err) { 1743 /* 1744 * We continue even if we fail to initialize WA batch 1745 * because we only expect rare glitches but nothing 1746 * critical to prevent us from using GPU 1747 */ 1748 drm_err(&engine->i915->drm, 1749 "Ignoring context switch w/a allocation error:%d\n", 1750 err); 1751 return; 1752 } 1753 1754 if (!engine->wa_ctx.vma) 1755 return; 1756 1757 i915_gem_ww_ctx_init(&ww, true); 1758 retry: 1759 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww); 1760 if (!err) 1761 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH); 1762 if (err) 1763 goto err; 1764 1765 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 1766 if (IS_ERR(batch)) { 1767 err = PTR_ERR(batch); 1768 goto err_unpin; 1769 } 1770 1771 /* 1772 * Emit the two workaround batch buffers, recording the offset from the 1773 * start of the workaround batch buffer object for each and their 1774 * respective sizes. 1775 */ 1776 batch_ptr = batch; 1777 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 1778 wa_bb[i]->offset = batch_ptr - batch; 1779 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 1780 CACHELINE_BYTES))) { 1781 err = -EINVAL; 1782 break; 1783 } 1784 if (wa_bb_fn[i]) 1785 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 1786 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 1787 } 1788 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE); 1789 1790 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 1791 __i915_gem_object_release_map(wa_ctx->vma->obj); 1792 1793 /* Verify that we can handle failure to setup the wa_ctx */ 1794 if (!err) 1795 err = i915_inject_probe_error(engine->i915, -ENODEV); 1796 1797 err_unpin: 1798 if (err) 1799 i915_vma_unpin(wa_ctx->vma); 1800 err: 1801 if (err == -EDEADLK) { 1802 err = i915_gem_ww_ctx_backoff(&ww); 1803 if (!err) 1804 goto retry; 1805 } 1806 i915_gem_ww_ctx_fini(&ww); 1807 1808 if (err) { 1809 i915_vma_put(engine->wa_ctx.vma); 1810 1811 /* Clear all flags to prevent further use */ 
1812 memset(wa_ctx, 0, sizeof(*wa_ctx)); 1813 } 1814 } 1815 1816 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt) 1817 { 1818 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1819 stats->runtime.num_underflow++; 1820 stats->runtime.max_underflow = 1821 max_t(u32, stats->runtime.max_underflow, -dt); 1822 #endif 1823 } 1824 1825 static u32 lrc_get_runtime(const struct intel_context *ce) 1826 { 1827 /* 1828 * We can use either ppHWSP[16] which is recorded before the context 1829 * switch (and so excludes the cost of context switches) or use the 1830 * value from the context image itself, which is saved/restored earlier 1831 * and so includes the cost of the save. 1832 */ 1833 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 1834 } 1835 1836 void lrc_update_runtime(struct intel_context *ce) 1837 { 1838 struct intel_context_stats *stats = &ce->stats; 1839 u32 old; 1840 s32 dt; 1841 1842 old = stats->runtime.last; 1843 stats->runtime.last = lrc_get_runtime(ce); 1844 dt = stats->runtime.last - old; 1845 if (!dt) 1846 return; 1847 1848 if (unlikely(dt < 0)) { 1849 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1850 old, stats->runtime.last, dt); 1851 st_runtime_underflow(stats, dt); 1852 return; 1853 } 1854 1855 ewma_runtime_add(&stats->runtime.avg, dt); 1856 stats->runtime.total += dt; 1857 } 1858 1859 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1860 #include "selftest_lrc.c" 1861 #endif 1862