/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2017-2018 Intel Corporation
 */

#include <linux/prime_numbers.h>

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_ring.h"
#include "selftest_engine_heartbeat.h"

#include "../selftests/i915_random.h"
#include "../i915_selftest.h"

#include "selftests/igt_flush_test.h"
#include "selftests/lib_sw_fence.h"
#include "selftests/mock_gem_device.h"
#include "selftests/mock_timeline.h"

static struct page *hwsp_page(struct intel_timeline *tl)
{
	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
	return sg_page(obj->mm.pages->sgl);
}

static unsigned long hwsp_cacheline(struct intel_timeline *tl)
{
	unsigned long address = (unsigned long)page_address(hwsp_page(tl));

	return (address + tl->hwsp_offset) / CACHELINE_BYTES;
}

#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)

struct mock_hwsp_freelist {
	struct intel_gt *gt;
	struct radix_tree_root cachelines;
	struct intel_timeline **history;
	unsigned long count, max;
	struct rnd_state prng;
};

enum {
	SHUFFLE = BIT(0),
};

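/*
 * Replace the history slot at @idx with @tl (which may be NULL), releasing
 * whichever timeline previously occupied the slot along with its cacheline
 * bookkeeping in the radix tree.
 */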
static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
			       unsigned int idx,
			       struct intel_timeline *tl)
{
	tl = xchg(&state->history[idx], tl);
	if (tl) {
		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
		intel_timeline_put(tl);
	}
}

static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
				unsigned int count,
				unsigned int flags)
{
	struct intel_timeline *tl;
	unsigned int idx;

	while (count--) {
		unsigned long cacheline;
		int err;

		tl = intel_timeline_create(state->gt);
		if (IS_ERR(tl))
			return PTR_ERR(tl);

		cacheline = hwsp_cacheline(tl);
		err = radix_tree_insert(&state->cachelines, cacheline, tl);
		if (err) {
			if (err == -EEXIST) {
				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
				       cacheline);
			}
			intel_timeline_put(tl);
			return err;
		}

		idx = state->count++ % state->max;
		__mock_hwsp_record(state, idx, tl);
	}

	if (flags & SHUFFLE)
		i915_prandom_shuffle(state->history,
				     sizeof(*state->history),
				     min(state->count, state->max),
				     &state->prng);

	count = i915_prandom_u32_max_state(min(state->count, state->max),
					   &state->prng);
	while (count--) {
		idx = --state->count % state->max;
		__mock_hwsp_record(state, idx, NULL);
	}

	return 0;
}

static int mock_hwsp_freelist(void *arg)
{
	struct mock_hwsp_freelist state;
	struct drm_i915_private *i915;
	const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "linear", 0 },
		{ "shuffled", SHUFFLE },
		{ },
	}, *p;
	unsigned int na;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);

	state.gt = &i915->gt;

	/*
	 * Create a bunch of timelines and check that their HWSP do not overlap.
	 * Free some, and try again.
	 */

	state.max = PAGE_SIZE / sizeof(*state.history);
	state.count = 0;
	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
	if (!state.history) {
		err = -ENOMEM;
		goto err_put;
	}

	for (p = phases; p->name; p++) {
		pr_debug("%s(%s)\n", __func__, p->name);
		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
			err = __mock_hwsp_timeline(&state, na, p->flags);
			if (err)
				goto out;
		}
	}

out:
	for (na = 0; na < state.max; na++)
		__mock_hwsp_record(&state, na, NULL);
	kfree(state.history);
err_put:
	mock_destroy_device(i915);
	return err;
}

struct __igt_sync {
	const char *name;
	u32 seqno;
	bool expected;
	bool set;
};

static int __igt_sync(struct intel_timeline *tl,
		      u64 ctx,
		      const struct __igt_sync *p,
		      const char *name)
{
	int ret;

	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
		       name, p->name, ctx, p->seqno, yesno(p->expected));
		return -EINVAL;
	}

	if (p->set) {
		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
		if (ret)
			return ret;
	}

	return 0;
}

static int igt_sync(void *arg)
{
	const struct __igt_sync pass[] = {
		{ "unset", 0, false, false },
		{ "new", 0, false, true },
		{ "0a", 0, true, true },
		{ "1a", 1, false, true },
		{ "1b", 1, true, true },
		{ "0b", 0, true, false },
		{ "2a", 2, false, true },
		{ "4", 4, false, true },
		{ "INT_MAX", INT_MAX, false, true },
		{ "INT_MAX-1", INT_MAX-1, true, false },
		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
		{ "INT_MAX", INT_MAX, true, false },
		{ "UINT_MAX", UINT_MAX, false, true },
		{ "wrap", 0, false, true },
		{ "unwrap", UINT_MAX, true, false },
		{},
	}, *p;
	struct intel_timeline tl;
	int order, offset;
	int ret = -ENODEV;

	mock_timeline_init(&tl, 0);
	for (p = pass; p->name; p++) {
		for (order = 1; order < 64; order++) {
			for (offset = -1; offset <= (order > 1); offset++) {
				u64 ctx = BIT_ULL(order) + offset;

				ret = __igt_sync(&tl, ctx, p, "1");
				if (ret)
					goto out;
			}
		}
	}
	mock_timeline_fini(&tl);

	mock_timeline_init(&tl, 0);
	for (order = 1; order < 64; order++) {
		for (offset = -1; offset <= (order > 1); offset++) {
			u64 ctx = BIT_ULL(order) + offset;

			for (p = pass; p->name; p++) {
				ret = __igt_sync(&tl, ctx, p, "2");
				if (ret)
					goto out;
			}
		}
	}

out:
	mock_timeline_fini(&tl);
	return ret;
}

static unsigned int random_engine(struct rnd_state *rnd)
{
	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
}

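/*
 * Rough micro-benchmarks of the timeline sync map: random and in-order
 * insertions, lookups of previously set (context, seqno) pairs, and a mixed
 * lookup/insert loop. The timings are reported via pr_info() and are purely
 * informational; the test only fails if a lookup of a value we just set
 * cannot be found.
 */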
static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself becomes a significant factor in the per-iteration
	 * timings. We try to compensate the results by measuring the overhead
	 * of the prng and subtract it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();

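	/*
	 * Advance the benchmarked order along the Fibonacci sequence
	 * (1, 2, 3, 5, 8, ...) so that a spread of orders is sampled
	 * without having to test every value up to 32.
	 */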
	/* Benchmark searching for a known context id and changing the seqno */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes
			 * (if any)!
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}

int intel_timeline_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(mock_hwsp_freelist),
		SUBTEST(igt_sync),
		SUBTEST(bench_sync),
	};

	return i915_subtests(tests, NULL);
}

static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (INTEL_GEN(rq->engine->i915) >= 8) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = addr;
		*cs++ = 0;
		*cs++ = value;
	} else if (INTEL_GEN(rq->engine->i915) >= 4) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = 0;
		*cs++ = addr;
		*cs++ = value;
	} else {
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = addr;
		*cs++ = value;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(rq, cs);

	return 0;
}

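/*
 * Submit a request on @engine that writes @value into @tl's HWSP slot via a
 * GGTT store; the timeline only needs to be pinned while the request is
 * constructed. Returns the request with a reference held, or an ERR_PTR.
 */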
static struct i915_request *
tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
{
	struct i915_request *rq;
	int err;

	err = intel_timeline_pin(tl, NULL);
	if (err) {
		rq = ERR_PTR(err);
		goto out;
	}

	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq))
		goto out_unpin;

	i915_request_get(rq);

	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
	i915_request_add(rq);
	if (err) {
		i915_request_put(rq);
		rq = ERR_PTR(err);
	}

out_unpin:
	intel_timeline_unpin(tl);
out:
	if (IS_ERR(rq))
		pr_err("Failed to write to timeline!\n");
	return rq;
}

static struct intel_timeline *
checked_intel_timeline_create(struct intel_gt *gt)
{
	struct intel_timeline *tl;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return tl;

	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
		       *tl->hwsp_seqno, tl->seqno);
		intel_timeline_put(tl);
		return ERR_PTR(-EINVAL);
	}

	return tl;
}

static int live_hwsp_engine(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for_each_engine(engine, gt, id) {
		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		for (n = 0; n < NUM_TIMELINES; n++) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_alternate(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots with adjacent
	 * engines.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for (n = 0; n < NUM_TIMELINES; n++) {
		for_each_engine(engine, gt, id) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			if (!intel_engine_can_store_dword(engine))
				continue;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				goto out;
			}

			intel_engine_pm_get(engine);
			rq = tl_write(tl, engine, count);
			intel_engine_pm_put(engine);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				goto out;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
		goto out_free;

	err = intel_timeline_pin(tl, NULL);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}

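/*
 * Record a (seqno, HWSP) pair into the watcher buffer: first store @seqno at
 * *@addr, then copy the live HWSP value at @hwsp into the next dword using a
 * LRM/SRM round trip through a CS general-purpose register. *@addr is
 * advanced past the pair.
 */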
static int emit_read_hwsp(struct i915_request *rq,
			  u32 seqno, u32 hwsp,
			  u32 *addr)
{
	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
	u32 *cs;

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = *addr;
	*cs++ = 0;
	*cs++ = seqno;
	*addr += 4;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = hwsp;
	*cs++ = 0;

	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = *addr;
	*cs++ = 0;
	*addr += 4;

	intel_ring_advance(rq, cs);

	return 0;
}

struct hwsp_watcher {
	struct i915_vma *vma;
	struct i915_request *rq;
	u32 addr;
	u32 *map;
};

static bool cmp_lt(u32 a, u32 b)
{
	return a < b;
}

static bool cmp_gte(u32 a, u32 b)
{
	return a >= b;
}

static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;

	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	w->map = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(w->map)) {
		i915_gem_object_put(obj);
		return PTR_ERR(w->map);
	}

	vma = i915_gem_object_ggtt_pin_ww(obj, NULL, NULL, 0, 0, 0);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return PTR_ERR(vma);
	}

	w->vma = vma;
	w->addr = i915_ggtt_offset(vma);
	return 0;
}

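/*
 * Open a long-lived request on a fresh context with a @ringsz byte ring.
 * The request is deliberately left unsubmitted and its timeline mutex is
 * dropped, so that the test can keep emitting commands into it while
 * building other requests; check_watcher() reacquires the lock, submits it
 * and verifies the values it recorded.
 */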
static int create_watcher(struct hwsp_watcher *w,
			  struct intel_engine_cs *engine,
			  int ringsz)
{
	struct intel_context *ce;
	struct intel_timeline *tl;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	ce->ring = __intel_context_ring_size(ringsz);
	w->rq = intel_context_create_request(ce);
	intel_context_put(ce);
	if (IS_ERR(w->rq))
		return PTR_ERR(w->rq);

	w->addr = i915_ggtt_offset(w->vma);
	tl = w->rq->context->timeline;

	/* some light mutex juggling required; think co-routines */
	lockdep_unpin_lock(&tl->mutex, w->rq->cookie);
	mutex_unlock(&tl->mutex);

	return 0;
}

static int check_watcher(struct hwsp_watcher *w, const char *name,
			 bool (*op)(u32 hwsp, u32 seqno))
{
	struct i915_request *rq = fetch_and_zero(&w->rq);
	struct intel_timeline *tl = rq->context->timeline;
	u32 offset, end;
	int err;

	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);

	i915_request_get(rq);
	mutex_lock(&tl->mutex);
	rq->cookie = lockdep_pin_lock(&tl->mutex);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		err = -ETIME;
		goto out;
	}

	err = 0;
	offset = 0;
	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
	while (offset < end) {
		if (!op(w->map[offset + 1], w->map[offset])) {
			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
			       name, w->map[offset + 1], w->map[offset]);
			err = -EINVAL;
		}

		offset += 2;
	}

out:
	i915_request_put(rq);
	return err;
}

static void cleanup_watcher(struct hwsp_watcher *w)
{
	if (w->rq) {
		struct intel_timeline *tl = w->rq->context->timeline;

		mutex_lock(&tl->mutex);
		w->rq->cookie = lockdep_pin_lock(&tl->mutex);

		i915_request_add(w->rq);
	}

	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
}

static bool retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	mutex_lock(&tl->mutex);
	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
	mutex_unlock(&tl->mutex);

	return !i915_active_fence_isset(&tl->last_request);
}

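/*
 * Keep submitting empty requests on @rq's context until the timeline seqno
 * wraps back past @rq's own seqno, then return (with a reference held) the
 * first request issued after the wrap.
 */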
static struct i915_request *wrap_timeline(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	struct intel_timeline *tl = ce->timeline;
	u32 seqno = rq->fence.seqno;

	while (tl->seqno >= seqno) { /* Cause a wrap */
		i915_request_put(rq);
		rq = intel_context_create_request(ce);
		if (IS_ERR(rq))
			return rq;

		i915_request_get(rq);
		i915_request_add(rq);
	}

	i915_request_put(rq);
	rq = intel_context_create_request(ce);
	if (IS_ERR(rq))
		return rq;

	i915_request_get(rq);
	i915_request_add(rq);

	return rq;
}

static int live_hwsp_read(void *arg)
{
	struct intel_gt *gt = arg;
	struct hwsp_watcher watcher[2] = {};
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;
	int i;

	/*
	 * If we take a reference to the HWSP for reading on the GPU, that
	 * read may be arbitrarily delayed (either by foreign fence or
	 * priority saturation) and a wrap can happen within 30 minutes.
	 * When the GPU read is finally submitted it should be correct,
	 * even across multiple wraps.
	 */

	if (INTEL_GEN(gt->i915) < 8) /* CS convenience [SRM/LRM] */
		return 0;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->hwsp_cacheline)
		goto out_free;

	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
		err = setup_watcher(&watcher[i], gt);
		if (err)
			goto out;
	}

	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		/* Create a request we can use for remote reading of the HWSP */
		err = create_watcher(&watcher[1], engine, SZ_512K);
		if (err)
			goto out;

		do {
			struct i915_sw_fence *submit;
			struct i915_request *rq;
			u32 hwsp;

			submit = heap_fence_create(GFP_KERNEL);
			if (!submit) {
				err = -ENOMEM;
				goto out;
			}

			err = create_watcher(&watcher[0], engine, SZ_4K);
			if (err)
				goto out;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				goto out;
			}

			/* Skip to the end, saving 30 minutes of nops */
			tl->seqno = -10u + 2 * (count & 3);
			WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
			ce->timeline = intel_timeline_get(tl);

			rq = intel_context_create_request(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				intel_context_put(ce);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&rq->submit,
							    &watcher[0].rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			mutex_lock(&watcher[0].rq->context->timeline->mutex);
			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[0].rq, /* before */
						     rq->fence.seqno, hwsp,
						     &watcher[0].addr);
			mutex_unlock(&watcher[0].rq->context->timeline->mutex);
			if (err) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			mutex_lock(&watcher[1].rq->context->timeline->mutex);
			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[1].rq, /* after */
						     rq->fence.seqno, hwsp,
						     &watcher[1].addr);
			mutex_unlock(&watcher[1].rq->context->timeline->mutex);
			if (err) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			rq = wrap_timeline(rq);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
							    &rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_put(rq);
				goto out;
			}

			err = check_watcher(&watcher[0], "before", cmp_lt);
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			if (err) {
				i915_request_put(rq);
				goto out;
			}
			count++;

			/* Flush the timeline before manually wrapping again */
			if (i915_request_wait(rq,
					      I915_WAIT_INTERRUPTIBLE,
					      HZ) < 0) {
				err = -ETIME;
				i915_request_put(rq);
				goto out;
			}
			retire_requests(tl);
			i915_request_put(rq);

			/* Single requests are limited to half a ring at most */
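			/* (stop at ~3/8 of the ring to leave some headroom) */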
			if (8 * watcher[1].rq->ring->emit >
			    3 * watcher[1].rq->ring->size)
				break;

		} while (!__igt_timeout(end_time, NULL));
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, 0xdeadbeef);

		pr_info("%s: simulated %lu wraps\n", engine->name, count);
		err = check_watcher(&watcher[1], "after", cmp_gte);
		if (err)
			goto out;
	}

out:
	for (i = 0; i < ARRAY_SIZE(watcher); i++)
		cleanup_watcher(&watcher[i]);

	if (igt_flush_test(gt->i915))
		err = -EIO;

out_free:
	intel_timeline_put(tl);
	return err;
}

static int live_hwsp_rollover_kernel(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Run the host for long enough, and even the kernel context will
	 * see a seqno rollover.
	 */

	for_each_engine(engine, gt, id) {
		struct intel_context *ce = engine->kernel_context;
		struct intel_timeline *tl = ce->timeline;
		struct i915_request *rq[3] = {};
		int i;

		st_engine_heartbeat_disable(engine);
		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
			err = -EIO;
			goto out;
		}

		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
		tl->seqno = 0;
		timeline_rollback(tl);
		timeline_rollback(tl);
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = i915_request_create(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		st_engine_heartbeat_enable(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

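/*
 * As above, but exercise the rollover on a freshly created context (with
 * its full logical state allocated) rather than the engine's kernel context.
 */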
static int live_hwsp_rollover_user(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Simulate a long running user context, and force the seqno wrap
	 * on the user's timeline.
	 */

	for_each_engine(engine, gt, id) {
		struct i915_request *rq[3] = {};
		struct intel_timeline *tl;
		struct intel_context *ce;
		int i;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		err = intel_context_alloc_state(ce);
		if (err)
			goto out;

		tl = ce->timeline;
		if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
			goto out;

		timeline_rollback(tl);
		timeline_rollback(tl);
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = intel_context_create_request(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		intel_context_put(ce);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_recycle(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count;
	int err = 0;

	/*
	 * Check seqno writes into one timeline at a time. We expect to
	 * recycle the breadcrumb slot between iterations and neither
	 * want to confuse ourselves or the GPU.
	 */

	count = 0;
	for_each_engine(engine, gt, id) {
		IGT_TIMEOUT(end_time);

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		do {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
				pr_err("Wait for timeline writes timed out!\n");
				i915_request_put(rq);
				intel_timeline_put(tl);
				err = -EIO;
				break;
			}

			if (READ_ONCE(*tl->hwsp_seqno) != count) {
				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
					      count, tl->fence_context,
					      tl->hwsp_offset, *tl->hwsp_seqno);
				GEM_TRACE_DUMP();
				err = -EINVAL;
			}

			i915_request_put(rq);
			intel_timeline_put(tl);
			count++;

			if (err)
				break;
		} while (!__igt_timeout(end_time, NULL));

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	return err;
}

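/*
 * Entry point for the live (hardware-backed) timeline selftests; the whole
 * suite is skipped if the GT is already wedged.
 */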
int intel_timeline_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_hwsp_recycle),
		SUBTEST(live_hwsp_engine),
		SUBTEST(live_hwsp_alternate),
		SUBTEST(live_hwsp_wrap),
		SUBTEST(live_hwsp_read),
		SUBTEST(live_hwsp_rollover_kernel),
		SUBTEST(live_hwsp_rollover_user),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return intel_gt_live_subtests(tests, &i915->gt);
}