// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for virtual engines.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{

	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/*
 * On DG2, during context restore of a preempted context in GPGPU mode,
 * an RCS restore hang is detected. This is extremely timing dependent.
 * To address this, the SW WA batch buffer below is applied on DG2 A steppings.
 */
static u32 *
dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
	*cs++ = 0x21;

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_22011450934:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
		cs = dg2_emit_rcs_hang_wabb(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915))
		cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		if (ce->engine->class == COMPUTE_CLASS)
			cs = gen8_emit_pipe_control(cs,
						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
						    0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915)) {
		if (ce->engine->class == VIDEO_DECODE_CLASS)
			cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
			cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
	}

	return cs;
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       i915_ggtt_offset(ce->state) +
			       context_wa_bb_offset(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31: LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54: mbz, reserved for use by hardware
 *      bits 55-63: group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36: reserved
 *      bits 37-47: SW context ID
 *      bits 48-53: engine instance
 *      bit 54:     mbz, reserved for use by hardware
 *      bits 55-60: SW counter
 *      bits 61-63: engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37: virtual function number
 *      bit 38:     mbz, reserved for use by hardware
 *      bits 39-54: SW context ID
 *      bits 55-57: reserved
 *      bits 58-63: SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
 * but there is a slight complication as this is applied in a WA batch where the
 * values are only initialized once, so we cannot take the register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We could of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together
 * make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif