1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2020 Intel Corporation 4 */ 5 6 #include "i915_drv.h" 7 #include "intel_context.h" 8 #include "intel_gpu_commands.h" 9 #include "intel_gt.h" 10 #include "intel_gtt.h" 11 #include "intel_migrate.h" 12 #include "intel_ring.h" 13 14 struct insert_pte_data { 15 u64 offset; 16 }; 17 18 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */ 19 20 static bool engine_supports_migration(struct intel_engine_cs *engine) 21 { 22 if (!engine) 23 return false; 24 25 /* 26 * We need the ability to prevent aribtration (MI_ARB_ON_OFF), 27 * the ability to write PTE using inline data (MI_STORE_DATA) 28 * and of course the ability to do the block transfer (blits). 29 */ 30 GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS); 31 32 return true; 33 } 34 35 static void insert_pte(struct i915_address_space *vm, 36 struct i915_page_table *pt, 37 void *data) 38 { 39 struct insert_pte_data *d = data; 40 41 vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE, 42 i915_gem_object_is_lmem(pt->base) ? PTE_LM : 0); 43 d->offset += PAGE_SIZE; 44 } 45 46 static struct i915_address_space *migrate_vm(struct intel_gt *gt) 47 { 48 struct i915_vm_pt_stash stash = {}; 49 struct i915_ppgtt *vm; 50 int err; 51 int i; 52 53 /* 54 * We construct a very special VM for use by all migration contexts, 55 * it is kept pinned so that it can be used at any time. As we need 56 * to pre-allocate the page directories for the migration VM, this 57 * limits us to only using a small number of prepared vma. 58 * 59 * To be able to pipeline and reschedule migration operations while 60 * avoiding unnecessary contention on the vm itself, the PTE updates 61 * are inline with the blits. All the blits use the same fixed 62 * addresses, with the backing store redirection being updated on the 63 * fly. Only 2 implicit vma are used for all migration operations. 64 * 65 * We lay the ppGTT out as: 66 * 67 * [0, CHUNK_SZ) -> first object 68 * [CHUNK_SZ, 2 * CHUNK_SZ) -> second object 69 * [2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE 70 * 71 * By exposing the dma addresses of the page directories themselves 72 * within the ppGTT, we are then able to rewrite the PTE prior to use. 73 * But the PTE update and subsequent migration operation must be atomic, 74 * i.e. within the same non-preemptible window so that we do not switch 75 * to another migration context that overwrites the PTE. 76 * 77 * TODO: Add support for huge LMEM PTEs 78 */ 79 80 vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY); 81 if (IS_ERR(vm)) 82 return ERR_CAST(vm); 83 84 if (!vm->vm.allocate_va_range || !vm->vm.foreach) { 85 err = -ENODEV; 86 goto err_vm; 87 } 88 89 /* 90 * Each engine instance is assigned its own chunk in the VM, so 91 * that we can run multiple instances concurrently 92 */ 93 for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { 94 struct intel_engine_cs *engine; 95 u64 base = (u64)i << 32; 96 struct insert_pte_data d = {}; 97 struct i915_gem_ww_ctx ww; 98 u64 sz; 99 100 engine = gt->engine_class[COPY_ENGINE_CLASS][i]; 101 if (!engine_supports_migration(engine)) 102 continue; 103 104 /* 105 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need 106 * 4x2 page directories for source/destination. 107 */ 108 sz = 2 * CHUNK_SZ; 109 d.offset = base + sz; 110 111 /* 112 * We need another page directory setup so that we can write 113 * the 8x512 PTE in each chunk. 114 */ 115 sz += (sz >> 12) * sizeof(u64); 116 117 err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz); 118 if (err) 119 goto err_vm; 120 121 for_i915_gem_ww(&ww, err, true) { 122 err = i915_vm_lock_objects(&vm->vm, &ww); 123 if (err) 124 continue; 125 err = i915_vm_map_pt_stash(&vm->vm, &stash); 126 if (err) 127 continue; 128 129 vm->vm.allocate_va_range(&vm->vm, &stash, base, sz); 130 } 131 i915_vm_free_pt_stash(&vm->vm, &stash); 132 if (err) 133 goto err_vm; 134 135 /* Now allow the GPU to rewrite the PTE via its own ppGTT */ 136 vm->vm.foreach(&vm->vm, base, d.offset - base, insert_pte, &d); 137 } 138 139 return &vm->vm; 140 141 err_vm: 142 i915_vm_put(&vm->vm); 143 return ERR_PTR(err); 144 } 145 146 static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt) 147 { 148 struct intel_engine_cs *engine; 149 int i; 150 151 for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { 152 engine = gt->engine_class[COPY_ENGINE_CLASS][i]; 153 if (engine_supports_migration(engine)) 154 return engine; 155 } 156 157 return NULL; 158 } 159 160 static struct intel_context *pinned_context(struct intel_gt *gt) 161 { 162 static struct lock_class_key key; 163 struct intel_engine_cs *engine; 164 struct i915_address_space *vm; 165 struct intel_context *ce; 166 167 engine = first_copy_engine(gt); 168 if (!engine) 169 return ERR_PTR(-ENODEV); 170 171 vm = migrate_vm(gt); 172 if (IS_ERR(vm)) 173 return ERR_CAST(vm); 174 175 ce = intel_engine_create_pinned_context(engine, vm, SZ_512K, 176 I915_GEM_HWS_MIGRATE, 177 &key, "migrate"); 178 i915_vm_put(vm); 179 return ce; 180 } 181 182 int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt) 183 { 184 struct intel_context *ce; 185 186 memset(m, 0, sizeof(*m)); 187 188 ce = pinned_context(gt); 189 if (IS_ERR(ce)) 190 return PTR_ERR(ce); 191 192 m->context = ce; 193 return 0; 194 } 195 196 static int random_index(unsigned int max) 197 { 198 return upper_32_bits(mul_u32_u32(get_random_u32(), max)); 199 } 200 201 static struct intel_context *__migrate_engines(struct intel_gt *gt) 202 { 203 struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE]; 204 struct intel_engine_cs *engine; 205 unsigned int count, i; 206 207 count = 0; 208 for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { 209 engine = gt->engine_class[COPY_ENGINE_CLASS][i]; 210 if (engine_supports_migration(engine)) 211 engines[count++] = engine; 212 } 213 214 return intel_context_create(engines[random_index(count)]); 215 } 216 217 struct intel_context *intel_migrate_create_context(struct intel_migrate *m) 218 { 219 struct intel_context *ce; 220 221 /* 222 * We randomly distribute contexts across the engines upon constrction, 223 * as they all share the same pinned vm, and so in order to allow 224 * multiple blits to run in parallel, we must construct each blit 225 * to use a different range of the vm for its GTT. This has to be 226 * known at construction, so we can not use the late greedy load 227 * balancing of the virtual-engine. 228 */ 229 ce = __migrate_engines(m->context->engine->gt); 230 if (IS_ERR(ce)) 231 return ce; 232 233 ce->ring = NULL; 234 ce->ring_size = SZ_256K; 235 236 i915_vm_put(ce->vm); 237 ce->vm = i915_vm_get(m->context->vm); 238 239 return ce; 240 } 241 242 static inline struct sgt_dma sg_sgt(struct scatterlist *sg) 243 { 244 dma_addr_t addr = sg_dma_address(sg); 245 246 return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) }; 247 } 248 249 static int emit_no_arbitration(struct i915_request *rq) 250 { 251 u32 *cs; 252 253 cs = intel_ring_begin(rq, 2); 254 if (IS_ERR(cs)) 255 return PTR_ERR(cs); 256 257 /* Explicitly disable preemption for this request. */ 258 *cs++ = MI_ARB_ON_OFF; 259 *cs++ = MI_NOOP; 260 intel_ring_advance(rq, cs); 261 262 return 0; 263 } 264 265 static int emit_pte(struct i915_request *rq, 266 struct sgt_dma *it, 267 enum i915_cache_level cache_level, 268 bool is_lmem, 269 u64 offset, 270 int length) 271 { 272 const u64 encode = rq->context->vm->pte_encode(0, cache_level, 273 is_lmem ? PTE_LM : 0); 274 struct intel_ring *ring = rq->ring; 275 int total = 0; 276 u32 *hdr, *cs; 277 int pkt; 278 279 GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8); 280 281 /* Compute the page directory offset for the target address range */ 282 offset >>= 12; 283 offset *= sizeof(u64); 284 offset += 2 * CHUNK_SZ; 285 offset += (u64)rq->engine->instance << 32; 286 287 cs = intel_ring_begin(rq, 6); 288 if (IS_ERR(cs)) 289 return PTR_ERR(cs); 290 291 /* Pack as many PTE updates as possible into a single MI command */ 292 pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5); 293 pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5); 294 295 hdr = cs; 296 *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */ 297 *cs++ = lower_32_bits(offset); 298 *cs++ = upper_32_bits(offset); 299 300 do { 301 if (cs - hdr >= pkt) { 302 *hdr += cs - hdr - 2; 303 *cs++ = MI_NOOP; 304 305 ring->emit = (void *)cs - ring->vaddr; 306 intel_ring_advance(rq, cs); 307 intel_ring_update_space(ring); 308 309 cs = intel_ring_begin(rq, 6); 310 if (IS_ERR(cs)) 311 return PTR_ERR(cs); 312 313 pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5); 314 pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5); 315 316 hdr = cs; 317 *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); 318 *cs++ = lower_32_bits(offset); 319 *cs++ = upper_32_bits(offset); 320 } 321 322 *cs++ = lower_32_bits(encode | it->dma); 323 *cs++ = upper_32_bits(encode | it->dma); 324 325 offset += 8; 326 total += I915_GTT_PAGE_SIZE; 327 328 it->dma += I915_GTT_PAGE_SIZE; 329 if (it->dma >= it->max) { 330 it->sg = __sg_next(it->sg); 331 if (!it->sg || sg_dma_len(it->sg) == 0) 332 break; 333 334 it->dma = sg_dma_address(it->sg); 335 it->max = it->dma + sg_dma_len(it->sg); 336 } 337 } while (total < length); 338 339 *hdr += cs - hdr - 2; 340 *cs++ = MI_NOOP; 341 342 ring->emit = (void *)cs - ring->vaddr; 343 intel_ring_advance(rq, cs); 344 intel_ring_update_space(ring); 345 346 return total; 347 } 348 349 static bool wa_1209644611_applies(int ver, u32 size) 350 { 351 u32 height = size >> PAGE_SHIFT; 352 353 if (ver != 11) 354 return false; 355 356 return height % 4 == 3 && height <= 8; 357 } 358 359 static int emit_copy(struct i915_request *rq, int size) 360 { 361 const int ver = GRAPHICS_VER(rq->engine->i915); 362 u32 instance = rq->engine->instance; 363 u32 *cs; 364 365 cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6); 366 if (IS_ERR(cs)) 367 return PTR_ERR(cs); 368 369 if (ver >= 9 && !wa_1209644611_applies(ver, size)) { 370 *cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2); 371 *cs++ = BLT_DEPTH_32 | PAGE_SIZE; 372 *cs++ = 0; 373 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; 374 *cs++ = CHUNK_SZ; /* dst offset */ 375 *cs++ = instance; 376 *cs++ = 0; 377 *cs++ = PAGE_SIZE; 378 *cs++ = 0; /* src offset */ 379 *cs++ = instance; 380 } else if (ver >= 8) { 381 *cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2); 382 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE; 383 *cs++ = 0; 384 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; 385 *cs++ = CHUNK_SZ; /* dst offset */ 386 *cs++ = instance; 387 *cs++ = 0; 388 *cs++ = PAGE_SIZE; 389 *cs++ = 0; /* src offset */ 390 *cs++ = instance; 391 } else { 392 GEM_BUG_ON(instance); 393 *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2); 394 *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE; 395 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE; 396 *cs++ = CHUNK_SZ; /* dst offset */ 397 *cs++ = PAGE_SIZE; 398 *cs++ = 0; /* src offset */ 399 } 400 401 intel_ring_advance(rq, cs); 402 return 0; 403 } 404 405 int 406 intel_context_migrate_copy(struct intel_context *ce, 407 const struct i915_deps *deps, 408 struct scatterlist *src, 409 enum i915_cache_level src_cache_level, 410 bool src_is_lmem, 411 struct scatterlist *dst, 412 enum i915_cache_level dst_cache_level, 413 bool dst_is_lmem, 414 struct i915_request **out) 415 { 416 struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst); 417 struct i915_request *rq; 418 int err; 419 420 GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm); 421 *out = NULL; 422 423 GEM_BUG_ON(ce->ring->size < SZ_64K); 424 425 do { 426 int len; 427 428 rq = i915_request_create(ce); 429 if (IS_ERR(rq)) { 430 err = PTR_ERR(rq); 431 goto out_ce; 432 } 433 434 if (deps) { 435 err = i915_request_await_deps(rq, deps); 436 if (err) 437 goto out_rq; 438 439 if (rq->engine->emit_init_breadcrumb) { 440 err = rq->engine->emit_init_breadcrumb(rq); 441 if (err) 442 goto out_rq; 443 } 444 445 deps = NULL; 446 } 447 448 /* The PTE updates + copy must not be interrupted. */ 449 err = emit_no_arbitration(rq); 450 if (err) 451 goto out_rq; 452 453 len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, 0, 454 CHUNK_SZ); 455 if (len <= 0) { 456 err = len; 457 goto out_rq; 458 } 459 460 err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem, 461 CHUNK_SZ, len); 462 if (err < 0) 463 goto out_rq; 464 if (err < len) { 465 err = -EINVAL; 466 goto out_rq; 467 } 468 469 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); 470 if (err) 471 goto out_rq; 472 473 err = emit_copy(rq, len); 474 475 /* Arbitration is re-enabled between requests. */ 476 out_rq: 477 if (*out) 478 i915_request_put(*out); 479 *out = i915_request_get(rq); 480 i915_request_add(rq); 481 if (err || !it_src.sg || !sg_dma_len(it_src.sg)) 482 break; 483 484 cond_resched(); 485 } while (1); 486 487 out_ce: 488 return err; 489 } 490 491 static int emit_clear(struct i915_request *rq, int size, u32 value) 492 { 493 const int ver = GRAPHICS_VER(rq->engine->i915); 494 u32 instance = rq->engine->instance; 495 u32 *cs; 496 497 GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX); 498 499 cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6); 500 if (IS_ERR(cs)) 501 return PTR_ERR(cs); 502 503 if (ver >= 8) { 504 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2); 505 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE; 506 *cs++ = 0; 507 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; 508 *cs++ = 0; /* offset */ 509 *cs++ = instance; 510 *cs++ = value; 511 *cs++ = MI_NOOP; 512 } else { 513 GEM_BUG_ON(instance); 514 *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2); 515 *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE; 516 *cs++ = 0; 517 *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; 518 *cs++ = 0; 519 *cs++ = value; 520 } 521 522 intel_ring_advance(rq, cs); 523 return 0; 524 } 525 526 int 527 intel_context_migrate_clear(struct intel_context *ce, 528 const struct i915_deps *deps, 529 struct scatterlist *sg, 530 enum i915_cache_level cache_level, 531 bool is_lmem, 532 u32 value, 533 struct i915_request **out) 534 { 535 struct sgt_dma it = sg_sgt(sg); 536 struct i915_request *rq; 537 int err; 538 539 GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm); 540 *out = NULL; 541 542 GEM_BUG_ON(ce->ring->size < SZ_64K); 543 544 do { 545 int len; 546 547 rq = i915_request_create(ce); 548 if (IS_ERR(rq)) { 549 err = PTR_ERR(rq); 550 goto out_ce; 551 } 552 553 if (deps) { 554 err = i915_request_await_deps(rq, deps); 555 if (err) 556 goto out_rq; 557 558 if (rq->engine->emit_init_breadcrumb) { 559 err = rq->engine->emit_init_breadcrumb(rq); 560 if (err) 561 goto out_rq; 562 } 563 564 deps = NULL; 565 } 566 567 /* The PTE updates + clear must not be interrupted. */ 568 err = emit_no_arbitration(rq); 569 if (err) 570 goto out_rq; 571 572 len = emit_pte(rq, &it, cache_level, is_lmem, 0, CHUNK_SZ); 573 if (len <= 0) { 574 err = len; 575 goto out_rq; 576 } 577 578 err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); 579 if (err) 580 goto out_rq; 581 582 err = emit_clear(rq, len, value); 583 584 /* Arbitration is re-enabled between requests. */ 585 out_rq: 586 if (*out) 587 i915_request_put(*out); 588 *out = i915_request_get(rq); 589 i915_request_add(rq); 590 if (err || !it.sg || !sg_dma_len(it.sg)) 591 break; 592 593 cond_resched(); 594 } while (1); 595 596 out_ce: 597 return err; 598 } 599 600 int intel_migrate_copy(struct intel_migrate *m, 601 struct i915_gem_ww_ctx *ww, 602 const struct i915_deps *deps, 603 struct scatterlist *src, 604 enum i915_cache_level src_cache_level, 605 bool src_is_lmem, 606 struct scatterlist *dst, 607 enum i915_cache_level dst_cache_level, 608 bool dst_is_lmem, 609 struct i915_request **out) 610 { 611 struct intel_context *ce; 612 int err; 613 614 *out = NULL; 615 if (!m->context) 616 return -ENODEV; 617 618 ce = intel_migrate_create_context(m); 619 if (IS_ERR(ce)) 620 ce = intel_context_get(m->context); 621 GEM_BUG_ON(IS_ERR(ce)); 622 623 err = intel_context_pin_ww(ce, ww); 624 if (err) 625 goto out; 626 627 err = intel_context_migrate_copy(ce, deps, 628 src, src_cache_level, src_is_lmem, 629 dst, dst_cache_level, dst_is_lmem, 630 out); 631 632 intel_context_unpin(ce); 633 out: 634 intel_context_put(ce); 635 return err; 636 } 637 638 int 639 intel_migrate_clear(struct intel_migrate *m, 640 struct i915_gem_ww_ctx *ww, 641 const struct i915_deps *deps, 642 struct scatterlist *sg, 643 enum i915_cache_level cache_level, 644 bool is_lmem, 645 u32 value, 646 struct i915_request **out) 647 { 648 struct intel_context *ce; 649 int err; 650 651 *out = NULL; 652 if (!m->context) 653 return -ENODEV; 654 655 ce = intel_migrate_create_context(m); 656 if (IS_ERR(ce)) 657 ce = intel_context_get(m->context); 658 GEM_BUG_ON(IS_ERR(ce)); 659 660 err = intel_context_pin_ww(ce, ww); 661 if (err) 662 goto out; 663 664 err = intel_context_migrate_clear(ce, deps, sg, cache_level, 665 is_lmem, value, out); 666 667 intel_context_unpin(ce); 668 out: 669 intel_context_put(ce); 670 return err; 671 } 672 673 void intel_migrate_fini(struct intel_migrate *m) 674 { 675 struct intel_context *ce; 676 677 ce = fetch_and_zero(&m->context); 678 if (!ce) 679 return; 680 681 intel_engine_destroy_pinned_context(ce); 682 } 683 684 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 685 #include "selftest_migrate.c" 686 #endif 687