// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * set_offsets() - expand a compact offset table into a context image.
 * @regs: start of the context register state to fill in
 * @data: one of the *_offsets[] byte-coded tables below
 * @engine: engine whose mmio_base the register offsets are relative to
 * @close: if true, terminate the image with MI_BATCH_BUFFER_END
 *
 * The tables below describe, per platform, which registers appear at
 * which dword of the logical ring context image.  Table encoding (see
 * the macros directly below, which are deliberately defined between the
 * function signature and its body so they sit next to both the consumer
 * and the tables):
 *
 *  NOP(x)       - skip x dwords of the image
 *  LRI(n, f)    - emit an MI_LOAD_REGISTER_IMM header for n registers,
 *                 f may include POSTED (force-posted writes)
 *  REG(x)       - a register at byte offset x (< 0x200) from mmio_base
 *  REG16(x)     - a register at byte offset x (< 0x10000), encoded as a
 *                 two-byte 7-bit-per-byte sequence (high bit = continue)
 *  END          - 0 terminator
 *
 * Only the register *offsets* are written; the value slot of each
 * (reg, value) pair is left untouched (regs advances by 2 per register).
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/* Decode the 7-bits-per-byte dword offset */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

/*
 * Per-platform context image layouts.  Entry order and counts mirror the
 * hardware's context save/restore format exactly - do not reorder.
 * xcs = non-render engines, rcs = render (see reg_offsets()).
 */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 mtl_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	NOP(4),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

/*
 * Select the context image layout table matching this engine's class and
 * graphics IP version.  Checked newest-first, so the GRAPHICS_VER_FULL
 * sub-version tests must precede the plain GRAPHICS_VER tests.
 */
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_xcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

/*
 * The lrc_ring_*() helpers below return the dword index of a given field
 * within the context register state for this platform/engine, or -1 when
 * the field does not exist in that image.
 */
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x80;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x64;
	else if (GRAPHICS_VER(engine->i915) >= 8 &&
		 engine->class == RENDER_CLASS)
		return 0xc4;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

/* INDIRECT_CTX pointer sits two dwords after the per-ctx wa bb slot */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

/* INDIRECT_CTX_OFFSET sits two dwords after the INDIRECT_CTX pointer */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{

	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

/* Hardware default for the INDIRECT_CTX_OFFSET field, per graphics version */
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

/*
 * Point the context's INDIRECT_CTX registers at a batch buffer of @size
 * bytes at @ctx_bb_ggtt_addr.  The size is encoded in cachelines in the
 * low bits of the pointer, hence the alignment requirement.
 */
static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

/*
 * Program CTX_CONTEXT_CONTROL, restore the saved context timestamp and
 * clear the batch-buffer offset slot (where present) in the image.
 */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}

/*
 * Wire the engine's workaround batch buffers (per-context and indirect
 * context) into the image, if the engine has them.
 */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

/* Fill in the PDP/PML4 descriptors for the context's page tables */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

/* Resolve a GGTT address space to its aliasing ppgtt, or cast a real ppgtt */
static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

/* Clear STOP_RING in the image's MI_MODE slot (masked-bit write) */
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

/*
 * Debug-build guard page: fill the page past the context image with a
 * known pattern so overruns can be detected in check_redzone().
 */
static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

/* Byte offset of the indirect wa batch page within the context object */
static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

/* CPU pointer to the start of the context's indirect wa batch page */
static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

/*
 * Initialise the whole context image: copy the engine's default state (if
 * any), scrub the ppHWSP and wa pages, then program the register state.
 * Restore is inhibited unless a valid default state was loaded.
 */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

/* GGTT address of the context's indirect wa batch page */
u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}

/*
 * Emit the DG2 predicate-disable workaround tail at the end of the wa bb
 * page; the MI_SET_PREDICATE dance re-enables execution after a
 * predicated batch terminates.
 */
static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}

/*
 * Allocate the backing object + vma for a context image, preferring lmem
 * and falling back to shmem.  Extra pages are appended for the redzone
 * (debug), the gen12 wa bb, and the GuC parallel-submit scratch.
 */
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

/*
 * Allocate all per-context state: the context image vma, the ring, and
 * (if not already provided) a timeline.  On success ownership of all
 * three is transferred to @ce; on failure everything is released.
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

/* Scrub a pinned context back to a known-good register state */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

/*
 * Map the context image into the CPU's address space ahead of pinning;
 * the mapping persists until lrc_post_unpin().
 */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

/* One-time image initialisation on first pin, then refresh the lrca */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

1236 void lrc_fini(struct intel_context *ce) 1237 { 1238 if (!ce->state) 1239 return; 1240 1241 intel_ring_put(fetch_and_zero(&ce->ring)); 1242 i915_vma_put(fetch_and_zero(&ce->state)); 1243 } 1244 1245 void lrc_destroy(struct kref *kref) 1246 { 1247 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 1248 1249 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 1250 GEM_BUG_ON(intel_context_is_pinned(ce)); 1251 1252 lrc_fini(ce); 1253 1254 intel_context_fini(ce); 1255 intel_context_free(ce); 1256 } 1257 1258 static u32 * 1259 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 1260 { 1261 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1262 MI_SRM_LRM_GLOBAL_GTT | 1263 MI_LRI_LRM_CS_MMIO; 1264 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1265 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1266 CTX_TIMESTAMP * sizeof(u32); 1267 *cs++ = 0; 1268 1269 *cs++ = MI_LOAD_REGISTER_REG | 1270 MI_LRR_SOURCE_CS_MMIO | 1271 MI_LRI_LRM_CS_MMIO; 1272 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1273 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 1274 1275 *cs++ = MI_LOAD_REGISTER_REG | 1276 MI_LRR_SOURCE_CS_MMIO | 1277 MI_LRI_LRM_CS_MMIO; 1278 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1279 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 1280 1281 return cs; 1282 } 1283 1284 static u32 * 1285 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 1286 { 1287 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 1288 1289 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1290 MI_SRM_LRM_GLOBAL_GTT | 1291 MI_LRI_LRM_CS_MMIO; 1292 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1293 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1294 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 1295 *cs++ = 0; 1296 1297 return cs; 1298 } 1299 1300 static u32 * 1301 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 1302 { 1303 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 1304 1305 *cs++ = 
MI_LOAD_REGISTER_MEM_GEN8 | 1306 MI_SRM_LRM_GLOBAL_GTT | 1307 MI_LRI_LRM_CS_MMIO; 1308 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1309 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1310 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 1311 *cs++ = 0; 1312 1313 *cs++ = MI_LOAD_REGISTER_REG | 1314 MI_LRR_SOURCE_CS_MMIO | 1315 MI_LRI_LRM_CS_MMIO; 1316 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1317 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 1318 1319 return cs; 1320 } 1321 1322 /* 1323 * On DG2 during context restore of a preempted context in GPGPU mode, 1324 * RCS restore hang is detected. This is extremely timing dependent. 1325 * To address this below sw wabb is implemented for DG2 A steppings. 1326 */ 1327 static u32 * 1328 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs) 1329 { 1330 *cs++ = MI_LOAD_REGISTER_IMM(1); 1331 *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG); 1332 *cs++ = 0x21; 1333 1334 *cs++ = MI_LOAD_REGISTER_REG; 1335 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base)); 1336 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1); 1337 1338 *cs++ = MI_LOAD_REGISTER_REG; 1339 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base)); 1340 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2); 1341 1342 return cs; 1343 } 1344 1345 /* 1346 * The bspec's tuning guide asks us to program a vertical watermark value of 1347 * 0x3FF. However this register is not saved/restored properly by the 1348 * hardware, so we're required to apply the desired value via INDIRECT_CTX 1349 * batch buffer to ensure the value takes effect properly. All other bits 1350 * in this register should remain at 0 (the hardware default). 
1351 */ 1352 static u32 * 1353 dg2_emit_draw_watermark_setting(u32 *cs) 1354 { 1355 *cs++ = MI_LOAD_REGISTER_IMM(1); 1356 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK); 1357 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF); 1358 1359 return cs; 1360 } 1361 1362 static u32 * 1363 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 1364 { 1365 cs = gen12_emit_timestamp_wa(ce, cs); 1366 cs = gen12_emit_cmd_buf_wa(ce, cs); 1367 cs = gen12_emit_restore_scratch(ce, cs); 1368 1369 /* Wa_22011450934:dg2 */ 1370 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) || 1371 IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0)) 1372 cs = dg2_emit_rcs_hang_wabb(ce, cs); 1373 1374 /* Wa_16013000631:dg2 */ 1375 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) || 1376 IS_DG2_G11(ce->engine->i915)) 1377 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); 1378 1379 /* hsdes: 1809175790 */ 1380 if (!HAS_FLAT_CCS(ce->engine->i915)) 1381 cs = gen12_emit_aux_table_inv(ce->engine->gt, 1382 cs, GEN12_GFX_CCS_AUX_NV); 1383 1384 /* Wa_16014892111 */ 1385 if (IS_DG2(ce->engine->i915)) 1386 cs = dg2_emit_draw_watermark_setting(cs); 1387 1388 return cs; 1389 } 1390 1391 static u32 * 1392 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 1393 { 1394 cs = gen12_emit_timestamp_wa(ce, cs); 1395 cs = gen12_emit_restore_scratch(ce, cs); 1396 1397 /* Wa_16013000631:dg2 */ 1398 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) || 1399 IS_DG2_G11(ce->engine->i915)) 1400 if (ce->engine->class == COMPUTE_CLASS) 1401 cs = gen8_emit_pipe_control(cs, 1402 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 1403 0); 1404 1405 /* hsdes: 1809175790 */ 1406 if (!HAS_FLAT_CCS(ce->engine->i915)) { 1407 if (ce->engine->class == VIDEO_DECODE_CLASS) 1408 cs = gen12_emit_aux_table_inv(ce->engine->gt, 1409 cs, GEN12_VD0_AUX_NV); 1410 else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS) 1411 cs = 
gen12_emit_aux_table_inv(ce->engine->gt, 1412 cs, GEN12_VE0_AUX_NV); 1413 } 1414 1415 return cs; 1416 } 1417 1418 static void 1419 setup_indirect_ctx_bb(const struct intel_context *ce, 1420 const struct intel_engine_cs *engine, 1421 u32 *(*emit)(const struct intel_context *, u32 *)) 1422 { 1423 u32 * const start = context_indirect_bb(ce); 1424 u32 *cs; 1425 1426 cs = emit(ce, start); 1427 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 1428 while ((unsigned long)cs % CACHELINE_BYTES) 1429 *cs++ = MI_NOOP; 1430 1431 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1432 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1433 1434 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine, 1435 lrc_indirect_bb(ce), 1436 (cs - start) * sizeof(*cs)); 1437 } 1438 1439 /* 1440 * The context descriptor encodes various attributes of a context, 1441 * including its GTT address and some flags. Because it's fairly 1442 * expensive to calculate, we'll just do it once and cache the result, 1443 * which remains valid until the context is unpinned. 
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit  38:       mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
/*
 * Compute the context descriptor dword for @ce: the GGTT offset of the
 * context state object ORed with the GEN8_CTX_* flag bits described above.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	/* Addressing mode follows the ppGTT depth (3- vs 4-level). */
	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

/*
 * Refresh the ring registers (RING_START/HEAD/TAIL/CTL) in the context
 * image, plus the render power-clock state and OA configuration where
 * applicable, and (re)build the per-context indirect wa batch if this
 * context carries one. Returns the context descriptor with
 * CTX_DESC_FORCE_RESTORE set so the next submission reloads the image.
 */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

/*
 * Re-apply the engine's register offset table to the context image,
 * without closing the state with MI_BATCH_BUFFER_END.
 */
void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

/*
 * Sanity-check the context image around submission (@when describes the
 * point of the check for the warning). Any corrupted ring register is
 * logged *and repaired in place* so a bad image cannot take the engine
 * down; WARN_ONCE fires if anything had to be fixed up.
 */
void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	/* RING_WAIT bits are set by hardware; mask them out of the check. */
	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	/* Masked register: bit is live only if its enable (bit+16) is set. */
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in WA batch where the
 * values are only initialized once so we cannot take register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest but it makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	/* Save the current L3SQCREG4 value to the GT scratch page. */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	/* Upload the default value with the flush bit set. */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	/* Restore the saved value from scratch. */
	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always start at the beginning of the page
 * and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WA applied are not known at the beginning; we use this field
 * to return the no of DWORDS written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
 * so it adds NOOPs as padding to make it cacheline aligned.
1629 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1630 * makes a complete batch buffer. 1631 */ 1632 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 1633 { 1634 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1635 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 1636 1637 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 1638 if (IS_BROADWELL(engine->i915)) 1639 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 1640 1641 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 1642 /* Actual scratch location is at 128 bytes offset */ 1643 batch = gen8_emit_pipe_control(batch, 1644 PIPE_CONTROL_FLUSH_L3 | 1645 PIPE_CONTROL_STORE_DATA_INDEX | 1646 PIPE_CONTROL_CS_STALL | 1647 PIPE_CONTROL_QW_WRITE, 1648 LRC_PPHWSP_SCRATCH_ADDR); 1649 1650 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1651 1652 /* Pad to end of cacheline */ 1653 while ((unsigned long)batch % CACHELINE_BYTES) 1654 *batch++ = MI_NOOP; 1655 1656 /* 1657 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 1658 * execution depends on the length specified in terms of cache lines 1659 * in the register CTX_RCS_INDIRECT_CTX 1660 */ 1661 1662 return batch; 1663 } 1664 1665 struct lri { 1666 i915_reg_t reg; 1667 u32 value; 1668 }; 1669 1670 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 1671 { 1672 GEM_BUG_ON(!count || count > 63); 1673 1674 *batch++ = MI_LOAD_REGISTER_IMM(count); 1675 do { 1676 *batch++ = i915_mmio_reg_offset(lri->reg); 1677 *batch++ = lri->value; 1678 } while (lri++, --count); 1679 *batch++ = MI_NOOP; 1680 1681 return batch; 1682 } 1683 1684 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 1685 { 1686 static const struct lri lri[] = { 1687 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 1688 { 1689 COMMON_SLICE_CHICKEN2, 1690 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 1691 0), 1692 }, 1693 1694 /* BSpec: 11391 */ 1695 { 1696 
FF_SLICE_CHICKEN, 1697 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 1698 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 1699 }, 1700 1701 /* BSpec: 11299 */ 1702 { 1703 _3D_CHICKEN3, 1704 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 1705 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 1706 } 1707 }; 1708 1709 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 1710 1711 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 1712 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 1713 1714 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 1715 batch = gen8_emit_pipe_control(batch, 1716 PIPE_CONTROL_FLUSH_L3 | 1717 PIPE_CONTROL_STORE_DATA_INDEX | 1718 PIPE_CONTROL_CS_STALL | 1719 PIPE_CONTROL_QW_WRITE, 1720 LRC_PPHWSP_SCRATCH_ADDR); 1721 1722 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 1723 1724 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 1725 if (HAS_POOLED_EU(engine->i915)) { 1726 /* 1727 * EU pool configuration is setup along with golden context 1728 * during context initialization. This value depends on 1729 * device type (2x6 or 3x6) and needs to be updated based 1730 * on which subslice is disabled especially for 2x6 1731 * devices, however it is safe to load default 1732 * configuration of 3x6 device instead of masking off 1733 * corresponding bits because HW ignores bits of a disabled 1734 * subslice and drops down to appropriate config. Please 1735 * see render_state_setup() in i915_gem_render_state.c for 1736 * possible configurations, to avoid duplication they are 1737 * not shown here again. 
1738 */ 1739 *batch++ = GEN9_MEDIA_POOL_STATE; 1740 *batch++ = GEN9_MEDIA_POOL_ENABLE; 1741 *batch++ = 0x00777000; 1742 *batch++ = 0; 1743 *batch++ = 0; 1744 *batch++ = 0; 1745 } 1746 1747 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1748 1749 /* Pad to end of cacheline */ 1750 while ((unsigned long)batch % CACHELINE_BYTES) 1751 *batch++ = MI_NOOP; 1752 1753 return batch; 1754 } 1755 1756 #define CTX_WA_BB_SIZE (PAGE_SIZE) 1757 1758 static int lrc_create_wa_ctx(struct intel_engine_cs *engine) 1759 { 1760 struct drm_i915_gem_object *obj; 1761 struct i915_vma *vma; 1762 int err; 1763 1764 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE); 1765 if (IS_ERR(obj)) 1766 return PTR_ERR(obj); 1767 1768 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 1769 if (IS_ERR(vma)) { 1770 err = PTR_ERR(vma); 1771 goto err; 1772 } 1773 1774 engine->wa_ctx.vma = vma; 1775 return 0; 1776 1777 err: 1778 i915_gem_object_put(obj); 1779 return err; 1780 } 1781 1782 void lrc_fini_wa_ctx(struct intel_engine_cs *engine) 1783 { 1784 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 1785 } 1786 1787 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 1788 1789 void lrc_init_wa_ctx(struct intel_engine_cs *engine) 1790 { 1791 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 1792 struct i915_wa_ctx_bb *wa_bb[] = { 1793 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx 1794 }; 1795 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)]; 1796 struct i915_gem_ww_ctx ww; 1797 void *batch, *batch_ptr; 1798 unsigned int i; 1799 int err; 1800 1801 if (GRAPHICS_VER(engine->i915) >= 11 || 1802 !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)) 1803 return; 1804 1805 if (GRAPHICS_VER(engine->i915) == 9) { 1806 wa_bb_fn[0] = gen9_init_indirectctx_bb; 1807 wa_bb_fn[1] = NULL; 1808 } else if (GRAPHICS_VER(engine->i915) == 8) { 1809 wa_bb_fn[0] = gen8_init_indirectctx_bb; 1810 wa_bb_fn[1] = NULL; 1811 } 1812 1813 err = lrc_create_wa_ctx(engine); 1814 if (err) { 1815 /* 1816 * We 
continue even if we fail to initialize WA batch 1817 * because we only expect rare glitches but nothing 1818 * critical to prevent us from using GPU 1819 */ 1820 drm_err(&engine->i915->drm, 1821 "Ignoring context switch w/a allocation error:%d\n", 1822 err); 1823 return; 1824 } 1825 1826 if (!engine->wa_ctx.vma) 1827 return; 1828 1829 i915_gem_ww_ctx_init(&ww, true); 1830 retry: 1831 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww); 1832 if (!err) 1833 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH); 1834 if (err) 1835 goto err; 1836 1837 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 1838 if (IS_ERR(batch)) { 1839 err = PTR_ERR(batch); 1840 goto err_unpin; 1841 } 1842 1843 /* 1844 * Emit the two workaround batch buffers, recording the offset from the 1845 * start of the workaround batch buffer object for each and their 1846 * respective sizes. 1847 */ 1848 batch_ptr = batch; 1849 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 1850 wa_bb[i]->offset = batch_ptr - batch; 1851 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 1852 CACHELINE_BYTES))) { 1853 err = -EINVAL; 1854 break; 1855 } 1856 if (wa_bb_fn[i]) 1857 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 1858 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 1859 } 1860 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE); 1861 1862 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 1863 __i915_gem_object_release_map(wa_ctx->vma->obj); 1864 1865 /* Verify that we can handle failure to setup the wa_ctx */ 1866 if (!err) 1867 err = i915_inject_probe_error(engine->i915, -ENODEV); 1868 1869 err_unpin: 1870 if (err) 1871 i915_vma_unpin(wa_ctx->vma); 1872 err: 1873 if (err == -EDEADLK) { 1874 err = i915_gem_ww_ctx_backoff(&ww); 1875 if (!err) 1876 goto retry; 1877 } 1878 i915_gem_ww_ctx_fini(&ww); 1879 1880 if (err) { 1881 i915_vma_put(engine->wa_ctx.vma); 1882 1883 /* Clear all flags to prevent further use */ 1884 memset(wa_ctx, 0, sizeof(*wa_ctx)); 1885 } 1886 } 1887 1888 
static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt) 1889 { 1890 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1891 stats->runtime.num_underflow++; 1892 stats->runtime.max_underflow = 1893 max_t(u32, stats->runtime.max_underflow, -dt); 1894 #endif 1895 } 1896 1897 static u32 lrc_get_runtime(const struct intel_context *ce) 1898 { 1899 /* 1900 * We can use either ppHWSP[16] which is recorded before the context 1901 * switch (and so excludes the cost of context switches) or use the 1902 * value from the context image itself, which is saved/restored earlier 1903 * and so includes the cost of the save. 1904 */ 1905 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 1906 } 1907 1908 void lrc_update_runtime(struct intel_context *ce) 1909 { 1910 struct intel_context_stats *stats = &ce->stats; 1911 u32 old; 1912 s32 dt; 1913 1914 old = stats->runtime.last; 1915 stats->runtime.last = lrc_get_runtime(ce); 1916 dt = stats->runtime.last - old; 1917 if (!dt) 1918 return; 1919 1920 if (unlikely(dt < 0)) { 1921 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1922 old, stats->runtime.last, dt); 1923 st_runtime_underflow(stats, dt); 1924 return; 1925 } 1926 1927 ewma_runtime_add(&stats->runtime.avg, dt); 1928 stats->runtime.total += dt; 1929 } 1930 1931 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1932 #include "selftest_lrc.c" 1933 #endif 1934