// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}
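/*
 * For illustration only (decoded by set_offsets() above): in the tables
 * below, a fragment such as
 *
 *	LRI(2, POSTED),
 *	REG16(0x244),
 *	REG(0x034),
 *
 * expands in the context image to
 *
 *	MI_LOAD_REGISTER_IMM(2) | MI_LRI_FORCE_POSTED
 *	engine->mmio_base + 0x244, <value restored by HW>
 *	engine->mmio_base + 0x034, <value restored by HW>
 *
 * (plus MI_LRI_LRM_CS_MMIO on gen11+), while NOP(n) skips n dwords of the
 * image and END terminates the table.
 */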
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 mtl_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	NOP(4),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state across the
	 * physical engines of a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_xcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}
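/*
 * The lrc_ring_*() helpers below return the dword offset of a register's
 * MI_LRI entry within the context image, or -1 when the register is not
 * present on this engine/gen; the register's value lives one dword later.
 * A typical use, as in __reset_stop_ring() further down:
 *
 *	x = lrc_ring_mi_mode(engine);
 *	if (x != -1) {
 *		regs[x + 1] &= ~STOP_RING;
 *		regs[x + 1] |= STOP_RING << 16;
 *	}
 */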
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x80;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x64;
	else if (GRAPHICS_VER(engine->i915) >= 8 &&
		 engine->class == RENDER_CLASS)
		return 0xc4;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 8)
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;

	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);

	return 0;
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* The low bit flags the per-ctx batch pointer as valid */
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/*
		 * 64b PPGTT (48bit canonical):
		 * PDP0_DESCRIPTOR contains the base address of the PML4; the
		 * other PDP descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batch buffer with
	 * new values (including all the missing MI_LOAD_REGISTER_IMM commands
	 * that we are not initializing here).
	 *
	 * This must be kept consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}

static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}
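/*
 * A sketch of the state object layout that __lrc_alloc_state() below
 * produces, in page order (optional pages are simply omitted on
 * configurations that do not use them):
 *
 *	[ context image (engine->context_size)          ]
 *	[ redzone page (CONFIG_DRM_I915_DEBUG_GEM only) ]
 *	[ wa_bb page (gen12+ only)                      ]
 *	[ parent scratch (GuC parallel parent only)     ]
 */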
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) >= 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}
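/*
 * The gen12_emit_*() helpers below build the per-context INDIRECT_CTX
 * workaround batch: MI_LRM/MI_LRR sequences executed on context restore
 * that, e.g., reload CTX_TIMESTAMP and CMD_BUF_CCTL from the saved context
 * image via GPR0 and then scrub GPR0 itself.
 */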
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/*
 * On DG2, a hang can be detected during the RCS restore of a preempted
 * context in GPGPU mode. The failure is extremely timing dependent. To
 * address it, the software workaround batch below is applied on DG2 A
 * steppings.
 */
static u32 *
dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
	*cs++ = 0x21;

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);

	return cs;
}

/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF. However, this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via an INDIRECT_CTX
 * batch buffer to ensure the value takes effect properly. All other bits
 * in this register should remain at 0 (the hardware default).
 */
static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_22011450934:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
		cs = dg2_emit_rcs_hang_wabb(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915))
		cs = gen12_emit_aux_table_inv(ce->engine->gt,
					      cs, GEN12_GFX_CCS_AUX_NV);

	/* Wa_16014892111 */
	if (IS_DG2(ce->engine->i915))
		cs = dg2_emit_draw_watermark_setting(cs);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		if (ce->engine->class == COMPUTE_CLASS)
			cs = gen8_emit_pipe_control(cs,
						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
						    0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915)) {
		if (ce->engine->class == VIDEO_DECODE_CLASS)
			cs = gen12_emit_aux_table_inv(ce->engine->gt,
						      cs, GEN12_VD0_AUX_NV);
		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
			cs = gen12_emit_aux_table_inv(ce->engine->gt,
						      cs, GEN12_VE0_AUX_NV);
	}

	return cs;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       lrc_indirect_bb(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}
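/*
 * Worked example of the encoding above: for a context using a 4-level vm,
 * lrc_descriptor() returns
 *
 *	i915_ggtt_offset(ce->state) |
 *	INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT |
 *	GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE
 *
 * i.e. the page-aligned LRCA with the addressing-mode and valid/privilege
 * flags folded into its low bits.
 */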
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication: the WA is applied in a
 * batch where values are only initialized once, so we cannot read the
 * register at the start and reuse its value later. Instead we save the
 * value to memory, upload a constant with bit 21 set, and afterwards
 * restore the saved value.
 *
 * To simplify the WA, the uploaded constant is formed from the default
 * value of the register. This shouldn't be a problem because we only
 * modify it for a short period and this batch is non-preemptible. We
 * could of course use additional instructions that read the actual value
 * of the register at that time and set our bit of interest, but that
 * would complicate the WA.
 *
 * This WA is also required for Gen9, so extracting it as a function
 * avoids code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and one per_ctx batch buffer,
 * initialized at engine setup and shared across all contexts, but the offset
 * field allows multiple batches at different offsets to be selected based on
 * some criterion. At the moment the batch always starts at the beginning of
 * the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known up front; the size field is used to
 * return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END; instead it is
 * padded with NOOPs to make it cacheline aligned. MI_BATCH_BUFFER_END is
 * added to the per-ctx batch, and together the two form a complete batch
 * buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX.
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default configuration of a
		 * 3x6 device instead of masking off the corresponding bits,
		 * because the HW ignores bits of a disabled subslice and
		 * drops down to the appropriate config. Please see
		 * render_state_setup() in i915_gem_render_state.c for
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}
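/*
 * Both init_indirectctx_bb() variants above are padded to a cacheline
 * because the hardware executes the indirect ctx batch for a length given
 * in cachelines via CTX_RCS_INDIRECT_CTX; together with the (possibly
 * empty) per-ctx batch they must fit in the single CTX_WA_BB_SIZE page
 * allocated below.
 */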
#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to set up the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif