// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "i915_drv.h"
#include "intel_context.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gtt.h"
#include "intel_migrate.h"
#include "intel_ring.h"

struct insert_pte_data {
	u64 offset;
	bool is_lmem;
};

#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */

static bool engine_supports_migration(struct intel_engine_cs *engine)
{
	if (!engine)
		return false;

	/*
	 * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
	 * the ability to write PTE using inline data (MI_STORE_DATA_IMM)
	 * and of course the ability to do the block transfer (blits).
	 */
	GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);

	return true;
}

static void insert_pte(struct i915_address_space *vm,
		       struct i915_page_table *pt,
		       void *data)
{
	struct insert_pte_data *d = data;

	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
			d->is_lmem ? PTE_LM : 0);
	d->offset += PAGE_SIZE;
}

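/*
 * Worked example of the layout built by migrate_vm() below, assuming 4KiB
 * GTT pages and CHUNK_SZ = 8MiB: copy-engine instance i owns the VA window
 * starting at i << 32. Within that window, [0, CHUNK_SZ) maps the source
 * pages, [CHUNK_SZ, 2 * CHUNK_SZ) maps the destination pages, and from
 * 2 * CHUNK_SZ onwards the window's own page tables are mapped, so the PTE
 * for window offset v can be rewritten by the GPU at window offset
 * 2 * CHUNK_SZ + (v >> 12) * sizeof(u64).
 */
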
static struct i915_address_space *migrate_vm(struct intel_gt *gt)
{
	struct i915_vm_pt_stash stash = {};
	struct i915_ppgtt *vm;
	int err;
	int i;

	/*
	 * We construct a very special VM for use by all migration contexts,
	 * it is kept pinned so that it can be used at any time. As we need
	 * to pre-allocate the page directories for the migration VM, this
	 * limits us to only using a small number of prepared vma.
	 *
	 * To be able to pipeline and reschedule migration operations while
	 * avoiding unnecessary contention on the vm itself, the PTE updates
	 * are inline with the blits. All the blits use the same fixed
	 * addresses, with the backing store redirection being updated on the
	 * fly. Only 2 implicit vma are used for all migration operations.
	 *
	 * We lay the ppGTT out as:
	 *
	 *	[0, CHUNK_SZ) -> first object
	 *	[CHUNK_SZ, 2 * CHUNK_SZ) -> second object
	 *	[2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
	 *
	 * By exposing the dma addresses of the page directories themselves
	 * within the ppGTT, we are then able to rewrite the PTE prior to use.
	 * But the PTE update and subsequent migration operation must be atomic,
	 * i.e. within the same non-preemptible window so that we do not switch
	 * to another migration context that overwrites the PTE.
	 *
	 * TODO: Add support for huge LMEM PTEs
	 */

	vm = i915_ppgtt_create(gt);
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
		err = -ENODEV;
		goto err_vm;
	}

	/*
	 * Each engine instance is assigned its own chunk in the VM, so
	 * that we can run multiple instances concurrently
	 */
	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
		struct intel_engine_cs *engine;
		u64 base = (u64)i << 32;
		struct insert_pte_data d = {};
		struct i915_gem_ww_ctx ww;
		u64 sz;

		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
		if (!engine_supports_migration(engine))
			continue;

		/*
		 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
		 * 4x2 page directories for source/destination.
		 */
		sz = 2 * CHUNK_SZ;
		d.offset = base + sz;

		/*
		 * We need another page directory setup so that we can write
		 * the 8x512 PTE covering the two chunks.
		 */
		sz += (sz >> 12) * sizeof(u64);

		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
		if (err)
			goto err_vm;

		for_i915_gem_ww(&ww, err, true) {
			err = i915_vm_lock_objects(&vm->vm, &ww);
			if (err)
				continue;
			err = i915_vm_map_pt_stash(&vm->vm, &stash);
			if (err)
				continue;

			vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
		}
		i915_vm_free_pt_stash(&vm->vm, &stash);
		if (err)
			goto err_vm;

		/* Now allow the GPU to rewrite the PTE via its own ppGTT */
		d.is_lmem = i915_gem_object_is_lmem(vm->vm.scratch[0]);
		vm->vm.foreach(&vm->vm, base, base + sz, insert_pte, &d);
	}

	return &vm->vm;

err_vm:
	i915_vm_put(&vm->vm);
	return ERR_PTR(err);
}

static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	int i;

	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
		if (engine_supports_migration(engine))
			return engine;
	}

	return NULL;
}

static struct intel_context *pinned_context(struct intel_gt *gt)
{
	static struct lock_class_key key;
	struct intel_engine_cs *engine;
	struct i915_address_space *vm;
	struct intel_context *ce;

	engine = first_copy_engine(gt);
	if (!engine)
		return ERR_PTR(-ENODEV);

	vm = migrate_vm(gt);
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	ce = intel_engine_create_pinned_context(engine, vm, SZ_512K,
						I915_GEM_HWS_MIGRATE,
						&key, "migrate");
	i915_vm_put(vm);
	return ce;
}

int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
{
	struct intel_context *ce;

	memset(m, 0, sizeof(*m));

	ce = pinned_context(gt);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	m->context = ce;
	return 0;
}

static int random_index(unsigned int max)
{
	return upper_32_bits(mul_u32_u32(get_random_u32(), max));
}

static struct intel_context *__migrate_engines(struct intel_gt *gt)
{
	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
	struct intel_engine_cs *engine;
	unsigned int count, i;

	count = 0;
	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
		if (engine_supports_migration(engine))
			engines[count++] = engine;
	}

	return intel_context_create(engines[random_index(count)]);
}

struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
{
	struct intel_context *ce;

	/*
	 * We randomly distribute contexts across the engines upon construction,
	 * as they all share the same pinned vm, and so in order to allow
	 * multiple blits to run in parallel, we must construct each blit
	 * to use a different range of the vm for its GTT. This has to be
	 * known at construction, so we can not use the late greedy load
	 * balancing of the virtual-engine.
	 */
	ce = __migrate_engines(m->context->engine->gt);
	if (IS_ERR(ce))
		return ce;

	ce->ring = NULL;
	ce->ring_size = SZ_256K;

	i915_vm_put(ce->vm);
	ce->vm = i915_vm_get(m->context->vm);

	return ce;
}

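/*
 * struct sgt_dma is a cursor over the dma-mapped scatterlist: @sg is the
 * current entry, @dma the next dma address to emit and @max the end of the
 * current entry. emit_pte() advances it one GTT page at a time.
 */
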
static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
{
	dma_addr_t addr = sg_dma_address(sg);

	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
}

static int emit_no_arbitration(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Explicitly disable preemption for this request. */
	*cs++ = MI_ARB_ON_OFF;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

static int emit_pte(struct i915_request *rq,
		    struct sgt_dma *it,
		    enum i915_cache_level cache_level,
		    bool is_lmem,
		    u64 offset,
		    int length)
{
	const u64 encode = rq->context->vm->pte_encode(0, cache_level,
						       is_lmem ? PTE_LM : 0);
	struct intel_ring *ring = rq->ring;
	int total = 0;
	u32 *hdr, *cs;
	int pkt;

	GEM_BUG_ON(GRAPHICS_VER(rq->engine->i915) < 8);

	/* Compute the page directory offset for the target address range */
	offset >>= 12;
	offset *= sizeof(u64);
	offset += 2 * CHUNK_SZ;
	offset += (u64)rq->engine->instance << 32;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Pack as many PTE updates as possible into a single MI command */
	pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
	pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);

	hdr = cs;
	*cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	do {
		if (cs - hdr >= pkt) {
			*hdr += cs - hdr - 2;
			*cs++ = MI_NOOP;

			ring->emit = (void *)cs - ring->vaddr;
			intel_ring_advance(rq, cs);
			intel_ring_update_space(ring);

			cs = intel_ring_begin(rq, 6);
			if (IS_ERR(cs))
				return PTR_ERR(cs);

			pkt = min_t(int, 0x400, ring->space / sizeof(u32) + 5);
			pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5);

			hdr = cs;
			*cs++ = MI_STORE_DATA_IMM | REG_BIT(21);
			*cs++ = lower_32_bits(offset);
			*cs++ = upper_32_bits(offset);
		}

		*cs++ = lower_32_bits(encode | it->dma);
		*cs++ = upper_32_bits(encode | it->dma);

		offset += 8;
		total += I915_GTT_PAGE_SIZE;

		it->dma += I915_GTT_PAGE_SIZE;
		if (it->dma >= it->max) {
			it->sg = __sg_next(it->sg);
			if (!it->sg || sg_dma_len(it->sg) == 0)
				break;

			it->dma = sg_dma_address(it->sg);
			it->max = it->dma + sg_dma_len(it->sg);
		}
	} while (total < length);

	*hdr += cs - hdr - 2;
	*cs++ = MI_NOOP;

	ring->emit = (void *)cs - ring->vaddr;
	intel_ring_advance(rq, cs);
	intel_ring_update_space(ring);

	return total;
}

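/*
 * The PTE updates above are streamed as MI_STORE_DATA_IMM packets: a header
 * (bit 21 selects qword writes), the 64b GTT offset of the first PTE to
 * write, then lower/upper dword pairs for each PTE value. The header's
 * length field is patched once it is known how many PTEs fit in the current
 * packet (bounded by 0x400 dwords and the available ring space), and each
 * packet is closed with an MI_NOOP to keep the emitted length qword aligned.
 */
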
static bool wa_1209644611_applies(int ver, u32 size)
{
	u32 height = size >> PAGE_SHIFT;

	if (ver != 11)
		return false;

	return height % 4 == 3 && height <= 8;
}

static int emit_copy(struct i915_request *rq, int size)
{
	const int ver = GRAPHICS_VER(rq->engine->i915);
	u32 instance = rq->engine->instance;
	u32 *cs;

	cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (ver >= 9 && !wa_1209644611_applies(ver, size)) {
		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = CHUNK_SZ; /* dst offset */
		*cs++ = instance;
		*cs++ = 0;
		*cs++ = PAGE_SIZE;
		*cs++ = 0; /* src offset */
		*cs++ = instance;
	} else if (ver >= 8) {
		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = CHUNK_SZ; /* dst offset */
		*cs++ = instance;
		*cs++ = 0;
		*cs++ = PAGE_SIZE;
		*cs++ = 0; /* src offset */
		*cs++ = instance;
	} else {
		GEM_BUG_ON(instance);
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
		*cs++ = CHUNK_SZ; /* dst offset */
		*cs++ = PAGE_SIZE;
		*cs++ = 0; /* src offset */
	}

	intel_ring_advance(rq, cs);
	return 0;
}

int
intel_context_migrate_copy(struct intel_context *ce,
			   struct dma_fence *await,
			   struct scatterlist *src,
			   enum i915_cache_level src_cache_level,
			   bool src_is_lmem,
			   struct scatterlist *dst,
			   enum i915_cache_level dst_cache_level,
			   bool dst_is_lmem,
			   struct i915_request **out)
{
	struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
	struct i915_request *rq;
	int err;

	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
	*out = NULL;

	GEM_BUG_ON(ce->ring->size < SZ_64K);

	do {
		int len;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out_ce;
		}

		if (await) {
			err = i915_request_await_dma_fence(rq, await);
			if (err)
				goto out_rq;

			if (rq->engine->emit_init_breadcrumb) {
				err = rq->engine->emit_init_breadcrumb(rq);
				if (err)
					goto out_rq;
			}

			await = NULL;
		}

		/* The PTE updates + copy must not be interrupted. */
		err = emit_no_arbitration(rq);
		if (err)
			goto out_rq;

		len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, 0,
			       CHUNK_SZ);
		if (len <= 0) {
			err = len;
			goto out_rq;
		}

		err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
			       CHUNK_SZ, len);
		if (err < 0)
			goto out_rq;
		if (err < len) {
			err = -EINVAL;
			goto out_rq;
		}

		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
		if (err)
			goto out_rq;

		err = emit_copy(rq, len);

		/* Arbitration is re-enabled between requests. */
out_rq:
		if (*out)
			i915_request_put(*out);
		*out = i915_request_get(rq);
		i915_request_add(rq);
		if (err || !it_src.sg || !sg_dma_len(it_src.sg))
			break;

		cond_resched();
	} while (1);

out_ce:
	return err;
}

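/*
 * intel_context_migrate_copy() above (and _clear() below) split the work
 * into CHUNK_SZ pieces, each emitted as its own request with arbitration
 * disabled so that the inline PTE writes and the blit consuming them cannot
 * be separated by a preemption; arbitration is naturally re-enabled at
 * request boundaries. Only the last request of the chain is returned via
 * @out, holding a reference that the caller must put once it has waited on
 * (or otherwise tracked) the migration.
 */
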
static int emit_clear(struct i915_request *rq, int size, u32 value)
{
	const int ver = GRAPHICS_VER(rq->engine->i915);
	u32 instance = rq->engine->instance;
	u32 *cs;

	GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);

	cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (ver >= 8) {
		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = 0; /* offset */
		*cs++ = instance;
		*cs++ = value;
		*cs++ = MI_NOOP;
	} else {
		GEM_BUG_ON(instance);
		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = 0;
		*cs++ = value;
	}

	intel_ring_advance(rq, cs);
	return 0;
}

int
intel_context_migrate_clear(struct intel_context *ce,
			    struct dma_fence *await,
			    struct scatterlist *sg,
			    enum i915_cache_level cache_level,
			    bool is_lmem,
			    u32 value,
			    struct i915_request **out)
{
	struct sgt_dma it = sg_sgt(sg);
	struct i915_request *rq;
	int err;

	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
	*out = NULL;

	GEM_BUG_ON(ce->ring->size < SZ_64K);

	do {
		int len;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out_ce;
		}

		if (await) {
			err = i915_request_await_dma_fence(rq, await);
			if (err)
				goto out_rq;

			if (rq->engine->emit_init_breadcrumb) {
				err = rq->engine->emit_init_breadcrumb(rq);
				if (err)
					goto out_rq;
			}

			await = NULL;
		}

		/* The PTE updates + clear must not be interrupted. */
		err = emit_no_arbitration(rq);
		if (err)
			goto out_rq;

		len = emit_pte(rq, &it, cache_level, is_lmem, 0, CHUNK_SZ);
		if (len <= 0) {
			err = len;
			goto out_rq;
		}

		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
		if (err)
			goto out_rq;

		err = emit_clear(rq, len, value);

		/* Arbitration is re-enabled between requests. */
out_rq:
		if (*out)
			i915_request_put(*out);
		*out = i915_request_get(rq);
		i915_request_add(rq);
		if (err || !it.sg || !sg_dma_len(it.sg))
			break;

		cond_resched();
	} while (1);

out_ce:
	return err;
}

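/*
 * The intel_migrate_copy()/intel_migrate_clear() wrappers below pick a
 * fresh migration context (falling back to the pinned one) and run the
 * transfer under the caller's ww context. A minimal usage sketch, with
 * illustrative names for the caller's scatterlists and request pointer:
 *
 *	struct i915_request *rq = NULL;
 *	struct i915_gem_ww_ctx ww;
 *	int err;
 *
 *	for_i915_gem_ww(&ww, err, true)
 *		err = intel_migrate_copy(&gt->migrate, &ww, NULL,
 *					 src_sg, src_level, src_is_lmem,
 *					 dst_sg, dst_level, dst_is_lmem,
 *					 &rq);
 *	if (rq) {
 *		i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
 *		i915_request_put(rq);
 *	}
 */
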
int intel_migrate_copy(struct intel_migrate *m,
		       struct i915_gem_ww_ctx *ww,
		       struct dma_fence *await,
		       struct scatterlist *src,
		       enum i915_cache_level src_cache_level,
		       bool src_is_lmem,
		       struct scatterlist *dst,
		       enum i915_cache_level dst_cache_level,
		       bool dst_is_lmem,
		       struct i915_request **out)
{
	struct intel_context *ce;
	int err;

	*out = NULL;
	if (!m->context)
		return -ENODEV;

	ce = intel_migrate_create_context(m);
	if (IS_ERR(ce))
		ce = intel_context_get(m->context);
	GEM_BUG_ON(IS_ERR(ce));

	err = intel_context_pin_ww(ce, ww);
	if (err)
		goto out;

	err = intel_context_migrate_copy(ce, await,
					 src, src_cache_level, src_is_lmem,
					 dst, dst_cache_level, dst_is_lmem,
					 out);

	intel_context_unpin(ce);
out:
	intel_context_put(ce);
	return err;
}

int
intel_migrate_clear(struct intel_migrate *m,
		    struct i915_gem_ww_ctx *ww,
		    struct dma_fence *await,
		    struct scatterlist *sg,
		    enum i915_cache_level cache_level,
		    bool is_lmem,
		    u32 value,
		    struct i915_request **out)
{
	struct intel_context *ce;
	int err;

	*out = NULL;
	if (!m->context)
		return -ENODEV;

	ce = intel_migrate_create_context(m);
	if (IS_ERR(ce))
		ce = intel_context_get(m->context);
	GEM_BUG_ON(IS_ERR(ce));

	err = intel_context_pin_ww(ce, ww);
	if (err)
		goto out;

	err = intel_context_migrate_clear(ce, await, sg, cache_level,
					  is_lmem, value, out);

	intel_context_unpin(ce);
out:
	intel_context_put(ce);
	return err;
}

void intel_migrate_fini(struct intel_migrate *m)
{
	struct intel_context *ce;

	ce = fetch_and_zero(&m->context);
	if (!ce)
		return;

	intel_engine_destroy_pinned_context(ce);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_migrate.c"
#endif