// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * Expand a compact per-gen offset table (see gen8_xcs_offsets etc. below)
 * into MI_LOAD_REGISTER_IMM command + register-offset pairs inside the
 * context image at @regs.
 *
 * Table encoding (one byte stream, terminated by a 0/END byte):
 *  - NOP(x):  byte with BIT(7) set; low 7 bits = number of dwords to skip.
 *  - LRI(count, flags): header byte, low 6 bits = register count,
 *    top 2 bits = flags (POSTED forces MI_LRI_FORCE_POSTED).
 *  - each register offset follows as a big-endian base-128 varint
 *    (BIT(7) = continuation), shifted left 2 to rebuild the byte offset,
 *    relative to engine->mmio_base.
 *
 * When @close is set, an MI_BATCH_BUFFER_END terminator is written after
 * the last expanded command (used mainly by live_lrc_layout()).
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				/* base-128 varint: BIT(7) marks a continuation byte */
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* write the register address; value slot is left untouched */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

/*
 * The tables below describe the register layout of the context image for
 * each engine class / graphics version combination; reg_offsets() picks
 * the right one. Offsets are relative to the engine's mmio_base.
 */

/* Gen8, non-render (xCS) engines */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

/* Gen9, non-render (xCS) engines */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

/* Gen12, non-render (xCS) engines */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

/* DG2, non-render (xCS) engines */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

/* Gen8 render (RCS) engine */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* Gen9 render (RCS) engine */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

/* Gen11 render (RCS) engine */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* Gen12 render (RCS) engine */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

/* Xe_HP render (RCS) engine */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

/* DG2 render (RCS) engine */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

/*
 * Select the offset table matching this engine's class and graphics
 * version. Returns a pointer to one of the static tables above.
 */
static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatic fixup the register state between the
	 * physical engines for virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->class == RENDER_CLASS) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

/*
 * The lrc_ring_*() helpers below return the dword index of a given slot
 * inside the context image (callers use regs[x + 1] for the value slot),
 * or -1 when the slot does not exist for this engine/gen.
 */

/* Dword index of the RING_MI_MODE LRI slot, or -1 */
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

/* Dword index of the CS_GPR0 LRI slot, or -1 */
static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >=
	    IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

/* Dword index of the per-context WA batch-buffer slot, or -1 */
static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

/*
 * Dword index of the INDIRECT_CTX pointer slot: fixed two dwords past
 * the per-context WA slot. Propagates -1 when the latter is absent.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

/*
 * Dword index of the INDIRECT_CTX_OFFSET slot: fixed two dwords past
 * the INDIRECT_CTX pointer. Propagates -1 when the latter is absent.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

/* Dword index of the CMD_BUF_CCTL slot, or -1 */
static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{

	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

/* Per-gen default value programmed into the INDIRECT_CTX_OFFSET slot */
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

/*
 * Point the context image's INDIRECT_CTX slots at an indirect-context
 * batch buffer at @ctx_bb_ggtt_addr of @size bytes. @size must be a
 * non-zero multiple of a cacheline, as the hardware pointer encodes the
 * length in cachelines in its low bits.
 */
static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

/*
 * Program CTX_CONTEXT_CONTROL and CTX_TIMESTAMP in the context image.
 * @inhibit suppresses the initial context restore (used for the very
 * first load of an uninitialised image).
 */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	/* resume the context runtime from where it last stopped */
	regs[CTX_TIMESTAMP] = ce->runtime.last;
}

/*
 * Hook up the engine's per-context and indirect-context workaround
 * batch buffers (if any) into the context image.
 */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* low bit set marks the per-ctx pointer as valid */
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

/* Write the PPGTT page-directory pointers into the context image */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

/* Resolve a GGTT address space to its aliasing PPGTT, else the PPGTT itself */
static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

/*
 * Clear a stale STOP_RING request in the context image's RING_MI_MODE
 * slot (masked-write register: high half is the write mask).
 */
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

/*
 * (Re)initialise the register-state page of a context image for @engine.
 * With @inhibit, the page is first zeroed and the initial context
 * restore is suppressed.
 */
static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

/* Public wrapper: initialise @ce's register state for @engine */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

/* Clear any stale STOP_RING left in @ce's register state after a reset */
void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

/* Fill the debug redzone page past the context image (DEBUG_GEM only) */
static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

/* Complain (once) if the debug redzone was overwritten (DEBUG_GEM only) */
static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

/*
 * Populate a freshly-allocated context image: copy the engine's default
 * state if available (otherwise inhibit the first restore), clear the
 * ppHWSP page and initialise the register-state page.
 */
void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

/*
 * Allocate the backing object + vma for a context image, preferring
 * local memory and falling back to shmem. Also reserves the extra
 * pages for the redzone (DEBUG_GEM), the gen12 WA batch-buffer page
 * and the GuC parent-context scratch area where applicable.
 */
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		/* extra page for the gen12 indirect-ctx WA batch buffer */
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

/* Replace a pinned placeholder timeline with one derived from @engine */
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

/*
 * Allocate everything a context needs to run on @engine: state vma,
 * ring and (if not already provided) a timeline. Returns 0 or a
 * negative errno, releasing partial allocations on failure.
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

/* Scrub a pinned context back to a clean register state after a reset */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

/*
 * First phase of pinning: map the (already pinned) state object,
 * returning the kernel address via @vaddr. May sleep / allocate under
 * the ww context.
 */
int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

/*
 * Second phase of pinning: record the register-state pointer, perform
 * one-time state initialisation and refresh the ring registers/lrca.
 */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

/* Unpin hook: verify the debug redzone is still intact */
void lrc_unpin(struct intel_context *ce)
{
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

/* Drop the kernel mapping taken in lrc_pre_pin() */
void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

/* Release the ring and state vma owned by @ce (idempotent) */
void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

/* Final kref release: tear down and free the context */
void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

/*
 * Gen12 WA: restore RING_CTX_TIMESTAMP from the value saved in the
 * context image, bouncing it through CS_GPR(0). The register-register
 * move is issued twice — NOTE(review): presumably deliberate for the
 * workaround; confirm against the corresponding Wa entry before
 * changing.
 */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

/* Reload CS_GPR(0) from its saved slot in the context image */
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

/*
 * Gen12 WA: restore RING_CMD_BUF_CCTL from its context-image slot,
 * again bounced through CS_GPR(0).
 */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/* Indirect-context WA batch contents for gen12 render engines */
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	return cs;
}

/* Indirect-context WA batch contents for gen12 non-render engines */
static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	return cs;
}

/* Byte offset of the per-context WA batch-buffer page within the state object */
static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

/* CPU pointer to the per-context WA batch-buffer page */
static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

/*
 * Emit the per-context indirect batch via @emit, pad it to a cacheline
 * and wire it into the context image's INDIRECT_CTX slots.
 */
static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

lrc_setup_indirect_ctx(ce->lrc_reg_state, engine, 1221 i915_ggtt_offset(ce->state) + 1222 context_wa_bb_offset(ce), 1223 (cs - start) * sizeof(*cs)); 1224 } 1225 1226 /* 1227 * The context descriptor encodes various attributes of a context, 1228 * including its GTT address and some flags. Because it's fairly 1229 * expensive to calculate, we'll just do it once and cache the result, 1230 * which remains valid until the context is unpinned. 1231 * 1232 * This is what a descriptor looks like, from LSB to MSB:: 1233 * 1234 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 1235 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 1236 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 1237 * bits 53-54: mbz, reserved for use by hardware 1238 * bits 55-63: group ID, currently unused and set to 0 1239 * 1240 * Starting from Gen11, the upper dword of the descriptor has a new format: 1241 * 1242 * bits 32-36: reserved 1243 * bits 37-47: SW context ID 1244 * bits 48:53: engine instance 1245 * bit 54: mbz, reserved for use by hardware 1246 * bits 55-60: SW counter 1247 * bits 61-63: engine class 1248 * 1249 * On Xe_HP, the upper dword of the descriptor has a new format: 1250 * 1251 * bits 32-37: virtual function number 1252 * bit 38: mbz, reserved for use by hardware 1253 * bits 39-54: SW context ID 1254 * bits 55-57: reserved 1255 * bits 58-63: SW counter 1256 * 1257 * engine info, SW context ID and SW counter need to form a unique number 1258 * (Context ID) per lrc. 
1259 */ 1260 static u32 lrc_descriptor(const struct intel_context *ce) 1261 { 1262 u32 desc; 1263 1264 desc = INTEL_LEGACY_32B_CONTEXT; 1265 if (i915_vm_is_4lvl(ce->vm)) 1266 desc = INTEL_LEGACY_64B_CONTEXT; 1267 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 1268 1269 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 1270 if (GRAPHICS_VER(ce->vm->i915) == 8) 1271 desc |= GEN8_CTX_L3LLC_COHERENT; 1272 1273 return i915_ggtt_offset(ce->state) | desc; 1274 } 1275 1276 u32 lrc_update_regs(const struct intel_context *ce, 1277 const struct intel_engine_cs *engine, 1278 u32 head) 1279 { 1280 struct intel_ring *ring = ce->ring; 1281 u32 *regs = ce->lrc_reg_state; 1282 1283 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 1284 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 1285 1286 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1287 regs[CTX_RING_HEAD] = head; 1288 regs[CTX_RING_TAIL] = ring->tail; 1289 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1290 1291 /* RPCS */ 1292 if (engine->class == RENDER_CLASS) { 1293 regs[CTX_R_PWR_CLK_STATE] = 1294 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 1295 1296 i915_oa_init_reg_state(ce, engine); 1297 } 1298 1299 if (ce->wa_bb_page) { 1300 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 1301 1302 fn = gen12_emit_indirect_ctx_xcs; 1303 if (ce->engine->class == RENDER_CLASS) 1304 fn = gen12_emit_indirect_ctx_rcs; 1305 1306 /* Mutually exclusive wrt to global indirect bb */ 1307 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 1308 setup_indirect_ctx_bb(ce, engine, fn); 1309 } 1310 1311 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE; 1312 } 1313 1314 void lrc_update_offsets(struct intel_context *ce, 1315 struct intel_engine_cs *engine) 1316 { 1317 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false); 1318 } 1319 1320 void lrc_check_regs(const struct intel_context *ce, 1321 const struct intel_engine_cs *engine, 1322 const char *when) 1323 { 1324 const struct intel_ring *ring = ce->ring; 1325 u32 
*regs = ce->lrc_reg_state; 1326 bool valid = true; 1327 int x; 1328 1329 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1330 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1331 engine->name, 1332 regs[CTX_RING_START], 1333 i915_ggtt_offset(ring->vma)); 1334 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1335 valid = false; 1336 } 1337 1338 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1339 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1340 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1341 engine->name, 1342 regs[CTX_RING_CTL], 1343 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1344 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1345 valid = false; 1346 } 1347 1348 x = lrc_ring_mi_mode(engine); 1349 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1350 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1351 engine->name, regs[x + 1]); 1352 regs[x + 1] &= ~STOP_RING; 1353 regs[x + 1] |= STOP_RING << 16; 1354 valid = false; 1355 } 1356 1357 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1358 } 1359 1360 /* 1361 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 1362 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 1363 * but there is a slight complication as this is applied in WA batch where the 1364 * values are only initialized once so we cannot take register value at the 1365 * beginning and reuse it further; hence we save its value to memory, upload a 1366 * constant value with bit21 set and then we restore it back with the saved value. 1367 * To simplify the WA, a constant value is formed by using the default value 1368 * of this register. This shouldn't be a problem because we are only modifying 1369 * it for a short period and this batch in non-premptible. 
 * We can of course use additional instructions that read the actual value of
 * the register at that time and set our bit of interest but it makes the WA
 * complicated.
 *
 * This WA is also required for Gen9 so extracting as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	/* Save the current L3SQCREG4 value to the scratch page */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	/* Load the default value with the coherent-lines-flush bit set */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	/* Stall + DC flush so the L3 flush actually takes effect */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	/* Restore the saved L3SQCREG4 value from the scratch page */
	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always start at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WA applied are not known at the beginning; we use this field
 * to return the no of DWORDS written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
 * together make a complete batch buffer.
 */
/* Gen8 (bdw/chv) indirect-ctx workaround batch: arb off, L3 WA, SLM clear */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

/* One register/value pair for emit_lri() */
struct lri {
	i915_reg_t reg;
	u32 value;
};

/*
 * Emit a single MI_LOAD_REGISTER_IMM programming @count register/value pairs,
 * followed by a NOOP for qword alignment. @count must be 1..63 (LRI limit).
 */
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

/* Gen9 indirect-ctx workaround batch: L3 WA, SLM clear, chicken LRIs, EU pool */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

/* One page holds both the indirect-ctx and per-ctx workaround batches */
#define CTX_WA_BB_SIZE (PAGE_SIZE)

/*
 * Allocate the backing object + GGTT vma for the context workaround batches.
 * On success the vma is stored in engine->wa_ctx.vma (not yet pinned).
 */
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

/* Release the workaround-batch vma (unpin + drop reference) */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

/* Emitter for one workaround batch; returns the advanced write pointer */
typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

/*
 * Create, pin and populate the per-engine context workaround batches
 * (render class only; gen8/gen9 have emitters, gen11/gen12 need none).
 * Allocation failure is deliberately non-fatal: the WA batches only paper
 * over rare glitches, so we log and carry on without them.
 */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (engine->class != RENDER_CLASS)
		return;

	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	/* NOTE(review): vma was just assigned on success — looks like a
	 * defensive check only; confirm against lrc_create_wa_ctx. */
	if (!engine->wa_ctx.vma)
		return;

	/* ww transaction: lock the object, pin it in GGTT, then map it */
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	/* ww deadlock: back off and retry the whole lock/pin sequence */
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

/* Selftest-only bookkeeping for negative runtime deltas */
static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	ce->runtime.num_underflow++;
	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
#endif
}

/* Read the context's saved timestamp from its register image */
static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

/*
 * Accumulate the context's busy time from the delta of its saved timestamp.
 * A negative delta (timestamp went backwards) is traced and recorded but
 * not added to the totals.
 */
void lrc_update_runtime(struct intel_context *ce)
{
	u32 old;
	s32 dt;

	if (intel_context_is_barrier(ce))
		return;

	old = ce->runtime.last;
	ce->runtime.last = lrc_get_runtime(ce);
	dt = ce->runtime.last - old;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, ce->runtime.last, dt);
		st_update_runtime_underflow(ce, dt);
		return;
	}

	ewma_runtime_add(&ce->runtime.avg, dt);
	ce->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif