1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2017-2018 Intel Corporation 4 */ 5 6 #include <linux/prime_numbers.h> 7 8 #include "intel_context.h" 9 #include "intel_engine_heartbeat.h" 10 #include "intel_engine_pm.h" 11 #include "intel_gpu_commands.h" 12 #include "intel_gt.h" 13 #include "intel_gt_requests.h" 14 #include "intel_ring.h" 15 #include "selftest_engine_heartbeat.h" 16 17 #include "../selftests/i915_random.h" 18 #include "../i915_selftest.h" 19 20 #include "selftests/igt_flush_test.h" 21 #include "selftests/lib_sw_fence.h" 22 #include "selftests/mock_gem_device.h" 23 #include "selftests/mock_timeline.h" 24 25 static struct page *hwsp_page(struct intel_timeline *tl) 26 { 27 struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj; 28 29 GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); 30 return sg_page(obj->mm.pages->sgl); 31 } 32 33 static unsigned long hwsp_cacheline(struct intel_timeline *tl) 34 { 35 unsigned long address = (unsigned long)page_address(hwsp_page(tl)); 36 37 return (address + offset_in_page(tl->hwsp_offset)) / TIMELINE_SEQNO_BYTES; 38 } 39 40 static int selftest_tl_pin(struct intel_timeline *tl) 41 { 42 struct i915_gem_ww_ctx ww; 43 int err; 44 45 i915_gem_ww_ctx_init(&ww, false); 46 retry: 47 err = i915_gem_object_lock(tl->hwsp_ggtt->obj, &ww); 48 if (!err) 49 err = intel_timeline_pin(tl, &ww); 50 51 if (err == -EDEADLK) { 52 err = i915_gem_ww_ctx_backoff(&ww); 53 if (!err) 54 goto retry; 55 } 56 i915_gem_ww_ctx_fini(&ww); 57 return err; 58 } 59 60 /* Only half of seqno's are usable, see __intel_timeline_get_seqno() */ 61 #define CACHELINES_PER_PAGE (PAGE_SIZE / TIMELINE_SEQNO_BYTES / 2) 62 63 struct mock_hwsp_freelist { 64 struct intel_gt *gt; 65 struct radix_tree_root cachelines; 66 struct intel_timeline **history; 67 unsigned long count, max; 68 struct rnd_state prng; 69 }; 70 71 enum { 72 SHUFFLE = BIT(0), 73 }; 74 75 static void __mock_hwsp_record(struct mock_hwsp_freelist *state, 76 unsigned int idx, 77 struct intel_timeline *tl) 78 { 79 tl = xchg(&state->history[idx], tl); 80 if (tl) { 81 radix_tree_delete(&state->cachelines, hwsp_cacheline(tl)); 82 intel_timeline_unpin(tl); 83 intel_timeline_put(tl); 84 } 85 } 86 87 static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state, 88 unsigned int count, 89 unsigned int flags) 90 { 91 struct intel_timeline *tl; 92 unsigned int idx; 93 94 while (count--) { 95 unsigned long cacheline; 96 int err; 97 98 tl = intel_timeline_create(state->gt); 99 if (IS_ERR(tl)) 100 return PTR_ERR(tl); 101 102 err = selftest_tl_pin(tl); 103 if (err) { 104 intel_timeline_put(tl); 105 return err; 106 } 107 108 cacheline = hwsp_cacheline(tl); 109 err = radix_tree_insert(&state->cachelines, cacheline, tl); 110 if (err) { 111 if (err == -EEXIST) { 112 pr_err("HWSP cacheline %lu already used; duplicate allocation!\n", 113 cacheline); 114 } 115 intel_timeline_unpin(tl); 116 intel_timeline_put(tl); 117 return err; 118 } 119 120 idx = state->count++ % state->max; 121 __mock_hwsp_record(state, idx, tl); 122 } 123 124 if (flags & SHUFFLE) 125 i915_prandom_shuffle(state->history, 126 sizeof(*state->history), 127 min(state->count, state->max), 128 &state->prng); 129 130 count = i915_prandom_u32_max_state(min(state->count, state->max), 131 &state->prng); 132 while (count--) { 133 idx = --state->count % state->max; 134 __mock_hwsp_record(state, idx, NULL); 135 } 136 137 return 0; 138 } 139 140 static int mock_hwsp_freelist(void *arg) 141 { 142 struct mock_hwsp_freelist state; 143 struct drm_i915_private *i915; 144 const struct { 145 const char *name; 146 unsigned int flags; 147 } phases[] = { 148 { "linear", 0 }, 149 { "shuffled", SHUFFLE }, 150 { }, 151 }, *p; 152 unsigned int na; 153 int err = 0; 154 155 i915 = mock_gem_device(); 156 if (!i915) 157 return -ENOMEM; 158 159 INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL); 160 state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed); 161 162 state.gt = &i915->gt; 163 164 /* 165 * Create a bunch of timelines and check that their HWSP do not overlap. 166 * Free some, and try again. 167 */ 168 169 state.max = PAGE_SIZE / sizeof(*state.history); 170 state.count = 0; 171 state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL); 172 if (!state.history) { 173 err = -ENOMEM; 174 goto err_put; 175 } 176 177 for (p = phases; p->name; p++) { 178 pr_debug("%s(%s)\n", __func__, p->name); 179 for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) { 180 err = __mock_hwsp_timeline(&state, na, p->flags); 181 if (err) 182 goto out; 183 } 184 } 185 186 out: 187 for (na = 0; na < state.max; na++) 188 __mock_hwsp_record(&state, na, NULL); 189 kfree(state.history); 190 err_put: 191 mock_destroy_device(i915); 192 return err; 193 } 194 195 struct __igt_sync { 196 const char *name; 197 u32 seqno; 198 bool expected; 199 bool set; 200 }; 201 202 static int __igt_sync(struct intel_timeline *tl, 203 u64 ctx, 204 const struct __igt_sync *p, 205 const char *name) 206 { 207 int ret; 208 209 if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) { 210 pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n", 211 name, p->name, ctx, p->seqno, yesno(p->expected)); 212 return -EINVAL; 213 } 214 215 if (p->set) { 216 ret = __intel_timeline_sync_set(tl, ctx, p->seqno); 217 if (ret) 218 return ret; 219 } 220 221 return 0; 222 } 223 224 static int igt_sync(void *arg) 225 { 226 const struct __igt_sync pass[] = { 227 { "unset", 0, false, false }, 228 { "new", 0, false, true }, 229 { "0a", 0, true, true }, 230 { "1a", 1, false, true }, 231 { "1b", 1, true, true }, 232 { "0b", 0, true, false }, 233 { "2a", 2, false, true }, 234 { "4", 4, false, true }, 235 { "INT_MAX", INT_MAX, false, true }, 236 { "INT_MAX-1", INT_MAX-1, true, false }, 237 { "INT_MAX+1", (u32)INT_MAX+1, false, true }, 238 { "INT_MAX", INT_MAX, true, false }, 239 { "UINT_MAX", UINT_MAX, false, true }, 240 { "wrap", 0, false, true }, 241 { "unwrap", UINT_MAX, true, false }, 242 {}, 243 }, *p; 244 struct intel_timeline tl; 245 int order, offset; 246 int ret = -ENODEV; 247 248 mock_timeline_init(&tl, 0); 249 for (p = pass; p->name; p++) { 250 for (order = 1; order < 64; order++) { 251 for (offset = -1; offset <= (order > 1); offset++) { 252 u64 ctx = BIT_ULL(order) + offset; 253 254 ret = __igt_sync(&tl, ctx, p, "1"); 255 if (ret) 256 goto out; 257 } 258 } 259 } 260 mock_timeline_fini(&tl); 261 262 mock_timeline_init(&tl, 0); 263 for (order = 1; order < 64; order++) { 264 for (offset = -1; offset <= (order > 1); offset++) { 265 u64 ctx = BIT_ULL(order) + offset; 266 267 for (p = pass; p->name; p++) { 268 ret = __igt_sync(&tl, ctx, p, "2"); 269 if (ret) 270 goto out; 271 } 272 } 273 } 274 275 out: 276 mock_timeline_fini(&tl); 277 return ret; 278 } 279 280 static unsigned int random_engine(struct rnd_state *rnd) 281 { 282 return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd); 283 } 284 285 static int bench_sync(void *arg) 286 { 287 struct rnd_state prng; 288 struct intel_timeline tl; 289 unsigned long end_time, count; 290 u64 prng32_1M; 291 ktime_t kt; 292 int order, last_order; 293 294 mock_timeline_init(&tl, 0); 295 296 /* Lookups from cache are very fast and so the random number generation 297 * and the loop itself becomes a significant factor in the per-iteration 298 * timings. We try to compensate the results by measuring the overhead 299 * of the prng and subtract it from the reported results. 300 */ 301 prandom_seed_state(&prng, i915_selftest.random_seed); 302 count = 0; 303 kt = ktime_get(); 304 end_time = jiffies + HZ/10; 305 do { 306 u32 x; 307 308 /* Make sure the compiler doesn't optimise away the prng call */ 309 WRITE_ONCE(x, prandom_u32_state(&prng)); 310 311 count++; 312 } while (!time_after(jiffies, end_time)); 313 kt = ktime_sub(ktime_get(), kt); 314 pr_debug("%s: %lu random evaluations, %lluns/prng\n", 315 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); 316 prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count); 317 318 /* Benchmark (only) setting random context ids */ 319 prandom_seed_state(&prng, i915_selftest.random_seed); 320 count = 0; 321 kt = ktime_get(); 322 end_time = jiffies + HZ/10; 323 do { 324 u64 id = i915_prandom_u64_state(&prng); 325 326 __intel_timeline_sync_set(&tl, id, 0); 327 count++; 328 } while (!time_after(jiffies, end_time)); 329 kt = ktime_sub(ktime_get(), kt); 330 kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20); 331 pr_info("%s: %lu random insertions, %lluns/insert\n", 332 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); 333 334 /* Benchmark looking up the exact same context ids as we just set */ 335 prandom_seed_state(&prng, i915_selftest.random_seed); 336 end_time = count; 337 kt = ktime_get(); 338 while (end_time--) { 339 u64 id = i915_prandom_u64_state(&prng); 340 341 if (!__intel_timeline_sync_is_later(&tl, id, 0)) { 342 mock_timeline_fini(&tl); 343 pr_err("Lookup of %llu failed\n", id); 344 return -EINVAL; 345 } 346 } 347 kt = ktime_sub(ktime_get(), kt); 348 kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20); 349 pr_info("%s: %lu random lookups, %lluns/lookup\n", 350 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); 351 352 mock_timeline_fini(&tl); 353 cond_resched(); 354 355 mock_timeline_init(&tl, 0); 356 357 /* Benchmark setting the first N (in order) contexts */ 358 count = 0; 359 kt = ktime_get(); 360 end_time = jiffies + HZ/10; 361 do { 362 __intel_timeline_sync_set(&tl, count++, 0); 363 } while (!time_after(jiffies, end_time)); 364 kt = ktime_sub(ktime_get(), kt); 365 pr_info("%s: %lu in-order insertions, %lluns/insert\n", 366 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); 367 368 /* Benchmark looking up the exact same context ids as we just set */ 369 end_time = count; 370 kt = ktime_get(); 371 while (end_time--) { 372 if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) { 373 pr_err("Lookup of %lu failed\n", end_time); 374 mock_timeline_fini(&tl); 375 return -EINVAL; 376 } 377 } 378 kt = ktime_sub(ktime_get(), kt); 379 pr_info("%s: %lu in-order lookups, %lluns/lookup\n", 380 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); 381 382 mock_timeline_fini(&tl); 383 cond_resched(); 384 385 mock_timeline_init(&tl, 0); 386 387 /* Benchmark searching for a random context id and maybe changing it */ 388 prandom_seed_state(&prng, i915_selftest.random_seed); 389 count = 0; 390 kt = ktime_get(); 391 end_time = jiffies + HZ/10; 392 do { 393 u32 id = random_engine(&prng); 394 u32 seqno = prandom_u32_state(&prng); 395 396 if (!__intel_timeline_sync_is_later(&tl, id, seqno)) 397 __intel_timeline_sync_set(&tl, id, seqno); 398 399 count++; 400 } while (!time_after(jiffies, end_time)); 401 kt = ktime_sub(ktime_get(), kt); 402 kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20); 403 pr_info("%s: %lu repeated insert/lookups, %lluns/op\n", 404 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count)); 405 mock_timeline_fini(&tl); 406 cond_resched(); 407 408 /* Benchmark searching for a known context id and changing the seqno */ 409 for (last_order = 1, order = 1; order < 32; 410 ({ int tmp = last_order; last_order = order; order += tmp; })) { 411 unsigned int mask = BIT(order) - 1; 412 413 mock_timeline_init(&tl, 0); 414 415 count = 0; 416 kt = ktime_get(); 417 end_time = jiffies + HZ/10; 418 do { 419 /* Without assuming too many details of the underlying 420 * implementation, try to identify its phase-changes 421 * (if any)! 422 */ 423 u64 id = (u64)(count & mask) << order; 424 425 __intel_timeline_sync_is_later(&tl, id, 0); 426 __intel_timeline_sync_set(&tl, id, 0); 427 428 count++; 429 } while (!time_after(jiffies, end_time)); 430 kt = ktime_sub(ktime_get(), kt); 431 pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n", 432 __func__, count, order, 433 (long long)div64_ul(ktime_to_ns(kt), count)); 434 mock_timeline_fini(&tl); 435 cond_resched(); 436 } 437 438 return 0; 439 } 440 441 int intel_timeline_mock_selftests(void) 442 { 443 static const struct i915_subtest tests[] = { 444 SUBTEST(mock_hwsp_freelist), 445 SUBTEST(igt_sync), 446 SUBTEST(bench_sync), 447 }; 448 449 return i915_subtests(tests, NULL); 450 } 451 452 static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value) 453 { 454 u32 *cs; 455 456 cs = intel_ring_begin(rq, 4); 457 if (IS_ERR(cs)) 458 return PTR_ERR(cs); 459 460 if (GRAPHICS_VER(rq->engine->i915) >= 8) { 461 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 462 *cs++ = addr; 463 *cs++ = 0; 464 *cs++ = value; 465 } else if (GRAPHICS_VER(rq->engine->i915) >= 4) { 466 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 467 *cs++ = 0; 468 *cs++ = addr; 469 *cs++ = value; 470 } else { 471 *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 472 *cs++ = addr; 473 *cs++ = value; 474 *cs++ = MI_NOOP; 475 } 476 477 intel_ring_advance(rq, cs); 478 479 return 0; 480 } 481 482 static struct i915_request * 483 checked_tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value) 484 { 485 struct i915_request *rq; 486 int err; 487 488 err = selftest_tl_pin(tl); 489 if (err) { 490 rq = ERR_PTR(err); 491 goto out; 492 } 493 494 if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) { 495 pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n", 496 *tl->hwsp_seqno, tl->seqno); 497 intel_timeline_unpin(tl); 498 return ERR_PTR(-EINVAL); 499 } 500 501 rq = intel_engine_create_kernel_request(engine); 502 if (IS_ERR(rq)) 503 goto out_unpin; 504 505 i915_request_get(rq); 506 507 err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value); 508 i915_request_add(rq); 509 if (err) { 510 i915_request_put(rq); 511 rq = ERR_PTR(err); 512 } 513 514 out_unpin: 515 intel_timeline_unpin(tl); 516 out: 517 if (IS_ERR(rq)) 518 pr_err("Failed to write to timeline!\n"); 519 return rq; 520 } 521 522 static int live_hwsp_engine(void *arg) 523 { 524 #define NUM_TIMELINES 4096 525 struct intel_gt *gt = arg; 526 struct intel_timeline **timelines; 527 struct intel_engine_cs *engine; 528 enum intel_engine_id id; 529 unsigned long count, n; 530 int err = 0; 531 532 /* 533 * Create a bunch of timelines and check we can write 534 * independently to each of their breadcrumb slots. 535 */ 536 537 timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES, 538 sizeof(*timelines), 539 GFP_KERNEL); 540 if (!timelines) 541 return -ENOMEM; 542 543 count = 0; 544 for_each_engine(engine, gt, id) { 545 if (!intel_engine_can_store_dword(engine)) 546 continue; 547 548 intel_engine_pm_get(engine); 549 550 for (n = 0; n < NUM_TIMELINES; n++) { 551 struct intel_timeline *tl; 552 struct i915_request *rq; 553 554 tl = intel_timeline_create(gt); 555 if (IS_ERR(tl)) { 556 err = PTR_ERR(tl); 557 break; 558 } 559 560 rq = checked_tl_write(tl, engine, count); 561 if (IS_ERR(rq)) { 562 intel_timeline_put(tl); 563 err = PTR_ERR(rq); 564 break; 565 } 566 567 timelines[count++] = tl; 568 i915_request_put(rq); 569 } 570 571 intel_engine_pm_put(engine); 572 if (err) 573 break; 574 } 575 576 if (igt_flush_test(gt->i915)) 577 err = -EIO; 578 579 for (n = 0; n < count; n++) { 580 struct intel_timeline *tl = timelines[n]; 581 582 if (!err && READ_ONCE(*tl->hwsp_seqno) != n) { 583 GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n", 584 n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno); 585 GEM_TRACE_DUMP(); 586 err = -EINVAL; 587 } 588 intel_timeline_put(tl); 589 } 590 591 kvfree(timelines); 592 return err; 593 #undef NUM_TIMELINES 594 } 595 596 static int live_hwsp_alternate(void *arg) 597 { 598 #define NUM_TIMELINES 4096 599 struct intel_gt *gt = arg; 600 struct intel_timeline **timelines; 601 struct intel_engine_cs *engine; 602 enum intel_engine_id id; 603 unsigned long count, n; 604 int err = 0; 605 606 /* 607 * Create a bunch of timelines and check we can write 608 * independently to each of their breadcrumb slots with adjacent 609 * engines. 610 */ 611 612 timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES, 613 sizeof(*timelines), 614 GFP_KERNEL); 615 if (!timelines) 616 return -ENOMEM; 617 618 count = 0; 619 for (n = 0; n < NUM_TIMELINES; n++) { 620 for_each_engine(engine, gt, id) { 621 struct intel_timeline *tl; 622 struct i915_request *rq; 623 624 if (!intel_engine_can_store_dword(engine)) 625 continue; 626 627 tl = intel_timeline_create(gt); 628 if (IS_ERR(tl)) { 629 err = PTR_ERR(tl); 630 goto out; 631 } 632 633 intel_engine_pm_get(engine); 634 rq = checked_tl_write(tl, engine, count); 635 intel_engine_pm_put(engine); 636 if (IS_ERR(rq)) { 637 intel_timeline_put(tl); 638 err = PTR_ERR(rq); 639 goto out; 640 } 641 642 timelines[count++] = tl; 643 i915_request_put(rq); 644 } 645 } 646 647 out: 648 if (igt_flush_test(gt->i915)) 649 err = -EIO; 650 651 for (n = 0; n < count; n++) { 652 struct intel_timeline *tl = timelines[n]; 653 654 if (!err && READ_ONCE(*tl->hwsp_seqno) != n) { 655 GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n", 656 n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno); 657 GEM_TRACE_DUMP(); 658 err = -EINVAL; 659 } 660 intel_timeline_put(tl); 661 } 662 663 kvfree(timelines); 664 return err; 665 #undef NUM_TIMELINES 666 } 667 668 static int live_hwsp_wrap(void *arg) 669 { 670 struct intel_gt *gt = arg; 671 struct intel_engine_cs *engine; 672 struct intel_timeline *tl; 673 enum intel_engine_id id; 674 int err = 0; 675 676 /* 677 * Across a seqno wrap, we need to keep the old cacheline alive for 678 * foreign GPU references. 679 */ 680 681 tl = intel_timeline_create(gt); 682 if (IS_ERR(tl)) 683 return PTR_ERR(tl); 684 685 if (!tl->has_initial_breadcrumb) 686 goto out_free; 687 688 err = selftest_tl_pin(tl); 689 if (err) 690 goto out_free; 691 692 for_each_engine(engine, gt, id) { 693 const u32 *hwsp_seqno[2]; 694 struct i915_request *rq; 695 u32 seqno[2]; 696 697 if (!intel_engine_can_store_dword(engine)) 698 continue; 699 700 rq = intel_engine_create_kernel_request(engine); 701 if (IS_ERR(rq)) { 702 err = PTR_ERR(rq); 703 goto out; 704 } 705 706 tl->seqno = -4u; 707 708 mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING); 709 err = intel_timeline_get_seqno(tl, rq, &seqno[0]); 710 mutex_unlock(&tl->mutex); 711 if (err) { 712 i915_request_add(rq); 713 goto out; 714 } 715 pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n", 716 seqno[0], tl->hwsp_offset); 717 718 err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]); 719 if (err) { 720 i915_request_add(rq); 721 goto out; 722 } 723 hwsp_seqno[0] = tl->hwsp_seqno; 724 725 mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING); 726 err = intel_timeline_get_seqno(tl, rq, &seqno[1]); 727 mutex_unlock(&tl->mutex); 728 if (err) { 729 i915_request_add(rq); 730 goto out; 731 } 732 pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n", 733 seqno[1], tl->hwsp_offset); 734 735 err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]); 736 if (err) { 737 i915_request_add(rq); 738 goto out; 739 } 740 hwsp_seqno[1] = tl->hwsp_seqno; 741 742 /* With wrap should come a new hwsp */ 743 GEM_BUG_ON(seqno[1] >= seqno[0]); 744 GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]); 745 746 i915_request_add(rq); 747 748 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 749 pr_err("Wait for timeline writes timed out!\n"); 750 err = -EIO; 751 goto out; 752 } 753 754 if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] || 755 READ_ONCE(*hwsp_seqno[1]) != seqno[1]) { 756 pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n", 757 *hwsp_seqno[0], *hwsp_seqno[1], 758 seqno[0], seqno[1]); 759 err = -EINVAL; 760 goto out; 761 } 762 763 intel_gt_retire_requests(gt); /* recycle HWSP */ 764 } 765 766 out: 767 if (igt_flush_test(gt->i915)) 768 err = -EIO; 769 770 intel_timeline_unpin(tl); 771 out_free: 772 intel_timeline_put(tl); 773 return err; 774 } 775 776 static int emit_read_hwsp(struct i915_request *rq, 777 u32 seqno, u32 hwsp, 778 u32 *addr) 779 { 780 const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0)); 781 u32 *cs; 782 783 cs = intel_ring_begin(rq, 12); 784 if (IS_ERR(cs)) 785 return PTR_ERR(cs); 786 787 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 788 *cs++ = *addr; 789 *cs++ = 0; 790 *cs++ = seqno; 791 *addr += 4; 792 793 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT; 794 *cs++ = gpr; 795 *cs++ = hwsp; 796 *cs++ = 0; 797 798 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 799 *cs++ = gpr; 800 *cs++ = *addr; 801 *cs++ = 0; 802 *addr += 4; 803 804 intel_ring_advance(rq, cs); 805 806 return 0; 807 } 808 809 struct hwsp_watcher { 810 struct i915_vma *vma; 811 struct i915_request *rq; 812 u32 addr; 813 u32 *map; 814 }; 815 816 static bool cmp_lt(u32 a, u32 b) 817 { 818 return a < b; 819 } 820 821 static bool cmp_gte(u32 a, u32 b) 822 { 823 return a >= b; 824 } 825 826 static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt) 827 { 828 struct drm_i915_gem_object *obj; 829 struct i915_vma *vma; 830 831 obj = i915_gem_object_create_internal(gt->i915, SZ_2M); 832 if (IS_ERR(obj)) 833 return PTR_ERR(obj); 834 835 w->map = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB); 836 if (IS_ERR(w->map)) { 837 i915_gem_object_put(obj); 838 return PTR_ERR(w->map); 839 } 840 841 vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0); 842 if (IS_ERR(vma)) { 843 i915_gem_object_put(obj); 844 return PTR_ERR(vma); 845 } 846 847 w->vma = vma; 848 w->addr = i915_ggtt_offset(vma); 849 return 0; 850 } 851 852 static void switch_tl_lock(struct i915_request *from, struct i915_request *to) 853 { 854 /* some light mutex juggling required; think co-routines */ 855 856 if (from) { 857 lockdep_unpin_lock(&from->context->timeline->mutex, from->cookie); 858 mutex_unlock(&from->context->timeline->mutex); 859 } 860 861 if (to) { 862 mutex_lock(&to->context->timeline->mutex); 863 to->cookie = lockdep_pin_lock(&to->context->timeline->mutex); 864 } 865 } 866 867 static int create_watcher(struct hwsp_watcher *w, 868 struct intel_engine_cs *engine, 869 int ringsz) 870 { 871 struct intel_context *ce; 872 873 ce = intel_context_create(engine); 874 if (IS_ERR(ce)) 875 return PTR_ERR(ce); 876 877 ce->ring = __intel_context_ring_size(ringsz); 878 w->rq = intel_context_create_request(ce); 879 intel_context_put(ce); 880 if (IS_ERR(w->rq)) 881 return PTR_ERR(w->rq); 882 883 w->addr = i915_ggtt_offset(w->vma); 884 885 switch_tl_lock(w->rq, NULL); 886 887 return 0; 888 } 889 890 static int check_watcher(struct hwsp_watcher *w, const char *name, 891 bool (*op)(u32 hwsp, u32 seqno)) 892 { 893 struct i915_request *rq = fetch_and_zero(&w->rq); 894 u32 offset, end; 895 int err; 896 897 GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size); 898 899 i915_request_get(rq); 900 switch_tl_lock(NULL, rq); 901 i915_request_add(rq); 902 903 if (i915_request_wait(rq, 0, HZ) < 0) { 904 err = -ETIME; 905 goto out; 906 } 907 908 err = 0; 909 offset = 0; 910 end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map); 911 while (offset < end) { 912 if (!op(w->map[offset + 1], w->map[offset])) { 913 pr_err("Watcher '%s' found HWSP value %x for seqno %x\n", 914 name, w->map[offset + 1], w->map[offset]); 915 err = -EINVAL; 916 } 917 918 offset += 2; 919 } 920 921 out: 922 i915_request_put(rq); 923 return err; 924 } 925 926 static void cleanup_watcher(struct hwsp_watcher *w) 927 { 928 if (w->rq) { 929 switch_tl_lock(NULL, w->rq); 930 931 i915_request_add(w->rq); 932 } 933 934 i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP); 935 } 936 937 static bool retire_requests(struct intel_timeline *tl) 938 { 939 struct i915_request *rq, *rn; 940 941 mutex_lock(&tl->mutex); 942 list_for_each_entry_safe(rq, rn, &tl->requests, link) 943 if (!i915_request_retire(rq)) 944 break; 945 mutex_unlock(&tl->mutex); 946 947 return !i915_active_fence_isset(&tl->last_request); 948 } 949 950 static struct i915_request *wrap_timeline(struct i915_request *rq) 951 { 952 struct intel_context *ce = rq->context; 953 struct intel_timeline *tl = ce->timeline; 954 u32 seqno = rq->fence.seqno; 955 956 while (tl->seqno >= seqno) { /* Cause a wrap */ 957 i915_request_put(rq); 958 rq = intel_context_create_request(ce); 959 if (IS_ERR(rq)) 960 return rq; 961 962 i915_request_get(rq); 963 i915_request_add(rq); 964 } 965 966 i915_request_put(rq); 967 rq = i915_request_create(ce); 968 if (IS_ERR(rq)) 969 return rq; 970 971 i915_request_get(rq); 972 i915_request_add(rq); 973 974 return rq; 975 } 976 977 static int live_hwsp_read(void *arg) 978 { 979 struct intel_gt *gt = arg; 980 struct hwsp_watcher watcher[2] = {}; 981 struct intel_engine_cs *engine; 982 struct intel_timeline *tl; 983 enum intel_engine_id id; 984 int err = 0; 985 int i; 986 987 /* 988 * If we take a reference to the HWSP for reading on the GPU, that 989 * read may be arbitrarily delayed (either by foreign fence or 990 * priority saturation) and a wrap can happen within 30 minutes. 991 * When the GPU read is finally submitted it should be correct, 992 * even across multiple wraps. 993 */ 994 995 if (GRAPHICS_VER(gt->i915) < 8) /* CS convenience [SRM/LRM] */ 996 return 0; 997 998 tl = intel_timeline_create(gt); 999 if (IS_ERR(tl)) 1000 return PTR_ERR(tl); 1001 1002 if (!tl->has_initial_breadcrumb) 1003 goto out_free; 1004 1005 for (i = 0; i < ARRAY_SIZE(watcher); i++) { 1006 err = setup_watcher(&watcher[i], gt); 1007 if (err) 1008 goto out; 1009 } 1010 1011 for_each_engine(engine, gt, id) { 1012 struct intel_context *ce; 1013 unsigned long count = 0; 1014 IGT_TIMEOUT(end_time); 1015 1016 /* Create a request we can use for remote reading of the HWSP */ 1017 err = create_watcher(&watcher[1], engine, SZ_512K); 1018 if (err) 1019 goto out; 1020 1021 do { 1022 struct i915_sw_fence *submit; 1023 struct i915_request *rq; 1024 u32 hwsp, dummy; 1025 1026 submit = heap_fence_create(GFP_KERNEL); 1027 if (!submit) { 1028 err = -ENOMEM; 1029 goto out; 1030 } 1031 1032 err = create_watcher(&watcher[0], engine, SZ_4K); 1033 if (err) 1034 goto out; 1035 1036 ce = intel_context_create(engine); 1037 if (IS_ERR(ce)) { 1038 err = PTR_ERR(ce); 1039 goto out; 1040 } 1041 1042 ce->timeline = intel_timeline_get(tl); 1043 1044 /* Ensure timeline is mapped, done during first pin */ 1045 err = intel_context_pin(ce); 1046 if (err) { 1047 intel_context_put(ce); 1048 goto out; 1049 } 1050 1051 /* 1052 * Start at a new wrap, and set seqno right before another wrap, 1053 * saving 30 minutes of nops 1054 */ 1055 tl->seqno = -12u + 2 * (count & 3); 1056 __intel_timeline_get_seqno(tl, &dummy); 1057 1058 rq = i915_request_create(ce); 1059 if (IS_ERR(rq)) { 1060 err = PTR_ERR(rq); 1061 intel_context_unpin(ce); 1062 intel_context_put(ce); 1063 goto out; 1064 } 1065 1066 err = i915_sw_fence_await_dma_fence(&rq->submit, 1067 &watcher[0].rq->fence, 0, 1068 GFP_KERNEL); 1069 if (err < 0) { 1070 i915_request_add(rq); 1071 intel_context_unpin(ce); 1072 intel_context_put(ce); 1073 goto out; 1074 } 1075 1076 switch_tl_lock(rq, watcher[0].rq); 1077 err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp); 1078 if (err == 0) 1079 err = emit_read_hwsp(watcher[0].rq, /* before */ 1080 rq->fence.seqno, hwsp, 1081 &watcher[0].addr); 1082 switch_tl_lock(watcher[0].rq, rq); 1083 if (err) { 1084 i915_request_add(rq); 1085 intel_context_unpin(ce); 1086 intel_context_put(ce); 1087 goto out; 1088 } 1089 1090 switch_tl_lock(rq, watcher[1].rq); 1091 err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp); 1092 if (err == 0) 1093 err = emit_read_hwsp(watcher[1].rq, /* after */ 1094 rq->fence.seqno, hwsp, 1095 &watcher[1].addr); 1096 switch_tl_lock(watcher[1].rq, rq); 1097 if (err) { 1098 i915_request_add(rq); 1099 intel_context_unpin(ce); 1100 intel_context_put(ce); 1101 goto out; 1102 } 1103 1104 i915_request_get(rq); 1105 i915_request_add(rq); 1106 1107 rq = wrap_timeline(rq); 1108 intel_context_unpin(ce); 1109 intel_context_put(ce); 1110 if (IS_ERR(rq)) { 1111 err = PTR_ERR(rq); 1112 goto out; 1113 } 1114 1115 err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit, 1116 &rq->fence, 0, 1117 GFP_KERNEL); 1118 if (err < 0) { 1119 i915_request_put(rq); 1120 goto out; 1121 } 1122 1123 err = check_watcher(&watcher[0], "before", cmp_lt); 1124 i915_sw_fence_commit(submit); 1125 heap_fence_put(submit); 1126 if (err) { 1127 i915_request_put(rq); 1128 goto out; 1129 } 1130 count++; 1131 1132 /* Flush the timeline before manually wrapping again */ 1133 if (i915_request_wait(rq, 1134 I915_WAIT_INTERRUPTIBLE, 1135 HZ) < 0) { 1136 err = -ETIME; 1137 i915_request_put(rq); 1138 goto out; 1139 } 1140 retire_requests(tl); 1141 i915_request_put(rq); 1142 1143 /* Single requests are limited to half a ring at most */ 1144 if (8 * watcher[1].rq->ring->emit > 1145 3 * watcher[1].rq->ring->size) 1146 break; 1147 1148 } while (!__igt_timeout(end_time, NULL) && 1149 count < (PAGE_SIZE / TIMELINE_SEQNO_BYTES - 1) / 2); 1150 1151 pr_info("%s: simulated %lu wraps\n", engine->name, count); 1152 err = check_watcher(&watcher[1], "after", cmp_gte); 1153 if (err) 1154 goto out; 1155 } 1156 1157 out: 1158 for (i = 0; i < ARRAY_SIZE(watcher); i++) 1159 cleanup_watcher(&watcher[i]); 1160 1161 if (igt_flush_test(gt->i915)) 1162 err = -EIO; 1163 1164 out_free: 1165 intel_timeline_put(tl); 1166 return err; 1167 } 1168 1169 static int live_hwsp_rollover_kernel(void *arg) 1170 { 1171 struct intel_gt *gt = arg; 1172 struct intel_engine_cs *engine; 1173 enum intel_engine_id id; 1174 int err = 0; 1175 1176 /* 1177 * Run the host for long enough, and even the kernel context will 1178 * see a seqno rollover. 1179 */ 1180 1181 for_each_engine(engine, gt, id) { 1182 struct intel_context *ce = engine->kernel_context; 1183 struct intel_timeline *tl = ce->timeline; 1184 struct i915_request *rq[3] = {}; 1185 int i; 1186 1187 st_engine_heartbeat_disable(engine); 1188 if (intel_gt_wait_for_idle(gt, HZ / 2)) { 1189 err = -EIO; 1190 goto out; 1191 } 1192 1193 GEM_BUG_ON(i915_active_fence_isset(&tl->last_request)); 1194 tl->seqno = -2u; 1195 WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno); 1196 1197 for (i = 0; i < ARRAY_SIZE(rq); i++) { 1198 struct i915_request *this; 1199 1200 this = i915_request_create(ce); 1201 if (IS_ERR(this)) { 1202 err = PTR_ERR(this); 1203 goto out; 1204 } 1205 1206 pr_debug("%s: create fence.seqnp:%d\n", 1207 engine->name, 1208 lower_32_bits(this->fence.seqno)); 1209 1210 GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl); 1211 1212 rq[i] = i915_request_get(this); 1213 i915_request_add(this); 1214 } 1215 1216 /* We expected a wrap! */ 1217 GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno); 1218 1219 if (i915_request_wait(rq[2], 0, HZ / 5) < 0) { 1220 pr_err("Wait for timeline wrap timed out!\n"); 1221 err = -EIO; 1222 goto out; 1223 } 1224 1225 for (i = 0; i < ARRAY_SIZE(rq); i++) { 1226 if (!i915_request_completed(rq[i])) { 1227 pr_err("Pre-wrap request not completed!\n"); 1228 err = -EINVAL; 1229 goto out; 1230 } 1231 } 1232 1233 out: 1234 for (i = 0; i < ARRAY_SIZE(rq); i++) 1235 i915_request_put(rq[i]); 1236 st_engine_heartbeat_enable(engine); 1237 if (err) 1238 break; 1239 } 1240 1241 if (igt_flush_test(gt->i915)) 1242 err = -EIO; 1243 1244 return err; 1245 } 1246 1247 static int live_hwsp_rollover_user(void *arg) 1248 { 1249 struct intel_gt *gt = arg; 1250 struct intel_engine_cs *engine; 1251 enum intel_engine_id id; 1252 int err = 0; 1253 1254 /* 1255 * Simulate a long running user context, and force the seqno wrap 1256 * on the user's timeline. 1257 */ 1258 1259 for_each_engine(engine, gt, id) { 1260 struct i915_request *rq[3] = {}; 1261 struct intel_timeline *tl; 1262 struct intel_context *ce; 1263 int i; 1264 1265 ce = intel_context_create(engine); 1266 if (IS_ERR(ce)) 1267 return PTR_ERR(ce); 1268 1269 err = intel_context_alloc_state(ce); 1270 if (err) 1271 goto out; 1272 1273 tl = ce->timeline; 1274 if (!tl->has_initial_breadcrumb) 1275 goto out; 1276 1277 err = intel_context_pin(ce); 1278 if (err) 1279 goto out; 1280 1281 tl->seqno = -4u; 1282 WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno); 1283 1284 for (i = 0; i < ARRAY_SIZE(rq); i++) { 1285 struct i915_request *this; 1286 1287 this = intel_context_create_request(ce); 1288 if (IS_ERR(this)) { 1289 err = PTR_ERR(this); 1290 goto out_unpin; 1291 } 1292 1293 pr_debug("%s: create fence.seqnp:%d\n", 1294 engine->name, 1295 lower_32_bits(this->fence.seqno)); 1296 1297 GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl); 1298 1299 rq[i] = i915_request_get(this); 1300 i915_request_add(this); 1301 } 1302 1303 /* We expected a wrap! */ 1304 GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno); 1305 1306 if (i915_request_wait(rq[2], 0, HZ / 5) < 0) { 1307 pr_err("Wait for timeline wrap timed out!\n"); 1308 err = -EIO; 1309 goto out_unpin; 1310 } 1311 1312 for (i = 0; i < ARRAY_SIZE(rq); i++) { 1313 if (!i915_request_completed(rq[i])) { 1314 pr_err("Pre-wrap request not completed!\n"); 1315 err = -EINVAL; 1316 goto out_unpin; 1317 } 1318 } 1319 out_unpin: 1320 intel_context_unpin(ce); 1321 out: 1322 for (i = 0; i < ARRAY_SIZE(rq); i++) 1323 i915_request_put(rq[i]); 1324 intel_context_put(ce); 1325 if (err) 1326 break; 1327 } 1328 1329 if (igt_flush_test(gt->i915)) 1330 err = -EIO; 1331 1332 return err; 1333 } 1334 1335 static int live_hwsp_recycle(void *arg) 1336 { 1337 struct intel_gt *gt = arg; 1338 struct intel_engine_cs *engine; 1339 enum intel_engine_id id; 1340 unsigned long count; 1341 int err = 0; 1342 1343 /* 1344 * Check seqno writes into one timeline at a time. We expect to 1345 * recycle the breadcrumb slot between iterations and neither 1346 * want to confuse ourselves or the GPU. 1347 */ 1348 1349 count = 0; 1350 for_each_engine(engine, gt, id) { 1351 IGT_TIMEOUT(end_time); 1352 1353 if (!intel_engine_can_store_dword(engine)) 1354 continue; 1355 1356 intel_engine_pm_get(engine); 1357 1358 do { 1359 struct intel_timeline *tl; 1360 struct i915_request *rq; 1361 1362 tl = intel_timeline_create(gt); 1363 if (IS_ERR(tl)) { 1364 err = PTR_ERR(tl); 1365 break; 1366 } 1367 1368 rq = checked_tl_write(tl, engine, count); 1369 if (IS_ERR(rq)) { 1370 intel_timeline_put(tl); 1371 err = PTR_ERR(rq); 1372 break; 1373 } 1374 1375 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1376 pr_err("Wait for timeline writes timed out!\n"); 1377 i915_request_put(rq); 1378 intel_timeline_put(tl); 1379 err = -EIO; 1380 break; 1381 } 1382 1383 if (READ_ONCE(*tl->hwsp_seqno) != count) { 1384 GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n", 1385 count, tl->fence_context, 1386 tl->hwsp_offset, *tl->hwsp_seqno); 1387 GEM_TRACE_DUMP(); 1388 err = -EINVAL; 1389 } 1390 1391 i915_request_put(rq); 1392 intel_timeline_put(tl); 1393 count++; 1394 1395 if (err) 1396 break; 1397 } while (!__igt_timeout(end_time, NULL)); 1398 1399 intel_engine_pm_put(engine); 1400 if (err) 1401 break; 1402 } 1403 1404 return err; 1405 } 1406 1407 int intel_timeline_live_selftests(struct drm_i915_private *i915) 1408 { 1409 static const struct i915_subtest tests[] = { 1410 SUBTEST(live_hwsp_recycle), 1411 SUBTEST(live_hwsp_engine), 1412 SUBTEST(live_hwsp_alternate), 1413 SUBTEST(live_hwsp_wrap), 1414 SUBTEST(live_hwsp_read), 1415 SUBTEST(live_hwsp_rollover_kernel), 1416 SUBTEST(live_hwsp_rollover_user), 1417 }; 1418 1419 if (intel_gt_is_wedged(&i915->gt)) 1420 return 0; 1421 1422 return intel_gt_live_subtests(tests, &i915->gt); 1423 } 1424