1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_internal.h" 30 #include "gem/i915_gem_pm.h" 31 #include "gem/selftests/mock_context.h" 32 33 #include "gt/intel_engine_heartbeat.h" 34 #include "gt/intel_engine_pm.h" 35 #include "gt/intel_engine_user.h" 36 #include "gt/intel_gt.h" 37 #include "gt/intel_gt_clock_utils.h" 38 #include "gt/intel_gt_requests.h" 39 #include "gt/selftest_engine_heartbeat.h" 40 41 #include "i915_random.h" 42 #include "i915_selftest.h" 43 #include "igt_flush_test.h" 44 #include "igt_live_test.h" 45 #include "igt_spinner.h" 46 #include "lib_sw_fence.h" 47 48 #include "mock_drm.h" 49 #include "mock_gem_device.h" 50 51 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 52 { 53 struct intel_engine_cs *engine; 54 unsigned int count; 55 56 count = 0; 57 for_each_uabi_engine(engine, i915) 58 count++; 59 60 return count; 61 } 62 63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 64 { 65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 66 } 67 68 static int igt_add_request(void *arg) 69 { 70 struct drm_i915_private *i915 = arg; 71 struct i915_request *request; 72 73 /* Basic preliminary test to create a request and let it loose! 
*/ 74 75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 76 if (!request) 77 return -ENOMEM; 78 79 i915_request_add(request); 80 81 return 0; 82 } 83 84 static int igt_wait_request(void *arg) 85 { 86 const long T = HZ / 4; 87 struct drm_i915_private *i915 = arg; 88 struct i915_request *request; 89 int err = -EINVAL; 90 91 /* Submit a request, then wait upon it */ 92 93 request = mock_request(rcs0(i915)->kernel_context, T); 94 if (!request) 95 return -ENOMEM; 96 97 i915_request_get(request); 98 99 if (i915_request_wait(request, 0, 0) != -ETIME) { 100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 101 goto out_request; 102 } 103 104 if (i915_request_wait(request, 0, T) != -ETIME) { 105 pr_err("request wait succeeded (expected timeout before submit!)\n"); 106 goto out_request; 107 } 108 109 if (i915_request_completed(request)) { 110 pr_err("request completed before submit!!\n"); 111 goto out_request; 112 } 113 114 i915_request_add(request); 115 116 if (i915_request_wait(request, 0, 0) != -ETIME) { 117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 118 goto out_request; 119 } 120 121 if (i915_request_completed(request)) { 122 pr_err("request completed immediately!\n"); 123 goto out_request; 124 } 125 126 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 127 pr_err("request wait succeeded (expected timeout!)\n"); 128 goto out_request; 129 } 130 131 if (i915_request_wait(request, 0, T) == -ETIME) { 132 pr_err("request wait timed out!\n"); 133 goto out_request; 134 } 135 136 if (!i915_request_completed(request)) { 137 pr_err("request not complete after waiting!\n"); 138 goto out_request; 139 } 140 141 if (i915_request_wait(request, 0, T) == -ETIME) { 142 pr_err("request wait timed out when already complete!\n"); 143 goto out_request; 144 } 145 146 err = 0; 147 out_request: 148 i915_request_put(request); 149 mock_device_flush(i915); 150 return err; 151 } 152 153 static int igt_fence_wait(void *arg) 154 { 155 const long T = HZ / 4; 156 struct drm_i915_private *i915 = arg; 157 struct i915_request *request; 158 int err = -EINVAL; 159 160 /* Submit a request, treat it as a fence and wait upon it */ 161 162 request = mock_request(rcs0(i915)->kernel_context, T); 163 if (!request) 164 return -ENOMEM; 165 166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 167 pr_err("fence wait success before submit (expected timeout)!\n"); 168 goto out; 169 } 170 171 i915_request_add(request); 172 173 if (dma_fence_is_signaled(&request->fence)) { 174 pr_err("fence signaled immediately!\n"); 175 goto out; 176 } 177 178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 179 pr_err("fence wait success after submit (expected timeout)!\n"); 180 goto out; 181 } 182 183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 184 pr_err("fence wait timed out (expected success)!\n"); 185 goto out; 186 } 187 188 if (!dma_fence_is_signaled(&request->fence)) { 189 pr_err("fence unsignaled after waiting!\n"); 190 goto out; 191 } 192 193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 194 pr_err("fence wait timed out when complete (expected success)!\n"); 195 goto out; 196 } 197 198 err = 0; 199 out: 200 mock_device_flush(i915); 201 return err; 202 } 203 204 static int igt_request_rewind(void *arg) 205 { 206 struct drm_i915_private *i915 = arg; 207 struct i915_request *request, *vip; 208 struct i915_gem_context *ctx[2]; 209 struct intel_context *ce; 210 int err = -EINVAL; 211 212 ctx[0] = 
mock_context(i915, "A"); 213 if (!ctx[0]) { 214 err = -ENOMEM; 215 goto err_ctx_0; 216 } 217 218 ce = i915_gem_context_get_engine(ctx[0], RCS0); 219 GEM_BUG_ON(IS_ERR(ce)); 220 request = mock_request(ce, 2 * HZ); 221 intel_context_put(ce); 222 if (!request) { 223 err = -ENOMEM; 224 goto err_context_0; 225 } 226 227 i915_request_get(request); 228 i915_request_add(request); 229 230 ctx[1] = mock_context(i915, "B"); 231 if (!ctx[1]) { 232 err = -ENOMEM; 233 goto err_ctx_1; 234 } 235 236 ce = i915_gem_context_get_engine(ctx[1], RCS0); 237 GEM_BUG_ON(IS_ERR(ce)); 238 vip = mock_request(ce, 0); 239 intel_context_put(ce); 240 if (!vip) { 241 err = -ENOMEM; 242 goto err_context_1; 243 } 244 245 /* Simulate preemption by manual reordering */ 246 if (!mock_cancel_request(request)) { 247 pr_err("failed to cancel request (already executed)!\n"); 248 i915_request_add(vip); 249 goto err_context_1; 250 } 251 i915_request_get(vip); 252 i915_request_add(vip); 253 rcu_read_lock(); 254 request->engine->submit_request(request); 255 rcu_read_unlock(); 256 257 258 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 259 pr_err("timed out waiting for high priority request\n"); 260 goto err; 261 } 262 263 if (i915_request_completed(request)) { 264 pr_err("low priority request already completed\n"); 265 goto err; 266 } 267 268 err = 0; 269 err: 270 i915_request_put(vip); 271 err_context_1: 272 mock_context_close(ctx[1]); 273 err_ctx_1: 274 i915_request_put(request); 275 err_context_0: 276 mock_context_close(ctx[0]); 277 err_ctx_0: 278 mock_device_flush(i915); 279 return err; 280 } 281 282 struct smoketest { 283 struct intel_engine_cs *engine; 284 struct i915_gem_context **contexts; 285 atomic_long_t num_waits, num_fences; 286 int ncontexts, max_batch; 287 struct i915_request *(*request_alloc)(struct intel_context *ce); 288 }; 289 290 static struct i915_request * 291 __mock_request_alloc(struct intel_context *ce) 292 { 293 return mock_request(ce, 0); 294 } 295 296 static struct i915_request * 297 __live_request_alloc(struct intel_context *ce) 298 { 299 return intel_context_create_request(ce); 300 } 301 302 static int __igt_breadcrumbs_smoketest(void *arg) 303 { 304 struct smoketest *t = arg; 305 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 306 const unsigned int total = 4 * t->ncontexts + 1; 307 unsigned int num_waits = 0, num_fences = 0; 308 struct i915_request **requests; 309 I915_RND_STATE(prng); 310 unsigned int *order; 311 int err = 0; 312 313 /* 314 * A very simple test to catch the most egregious of list handling bugs. 315 * 316 * At its heart, we simply create oodles of requests running across 317 * multiple kthreads and enable signaling on them, for the sole purpose 318 * of stressing our breadcrumb handling. The only inspection we do is 319 * that the fences were marked as signaled. 
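	 *
	 * Each batch of requests is gated behind a single "submit"
	 * i915_sw_fence and has its completions collected by a composite
	 * "wait" fence. Once every request in the batch has been constructed,
	 * the submit fence is committed to release them all at once, and the
	 * thread then blocks until the wait fence reports the whole batch as
	 * signaled.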
320 */ 321 322 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 323 if (!requests) 324 return -ENOMEM; 325 326 order = i915_random_order(total, &prng); 327 if (!order) { 328 err = -ENOMEM; 329 goto out_requests; 330 } 331 332 while (!kthread_should_stop()) { 333 struct i915_sw_fence *submit, *wait; 334 unsigned int n, count; 335 336 submit = heap_fence_create(GFP_KERNEL); 337 if (!submit) { 338 err = -ENOMEM; 339 break; 340 } 341 342 wait = heap_fence_create(GFP_KERNEL); 343 if (!wait) { 344 i915_sw_fence_commit(submit); 345 heap_fence_put(submit); 346 err = -ENOMEM; 347 break; 348 } 349 350 i915_random_reorder(order, total, &prng); 351 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 352 353 for (n = 0; n < count; n++) { 354 struct i915_gem_context *ctx = 355 t->contexts[order[n] % t->ncontexts]; 356 struct i915_request *rq; 357 struct intel_context *ce; 358 359 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 360 GEM_BUG_ON(IS_ERR(ce)); 361 rq = t->request_alloc(ce); 362 intel_context_put(ce); 363 if (IS_ERR(rq)) { 364 err = PTR_ERR(rq); 365 count = n; 366 break; 367 } 368 369 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 370 submit, 371 GFP_KERNEL); 372 373 requests[n] = i915_request_get(rq); 374 i915_request_add(rq); 375 376 if (err >= 0) 377 err = i915_sw_fence_await_dma_fence(wait, 378 &rq->fence, 379 0, 380 GFP_KERNEL); 381 382 if (err < 0) { 383 i915_request_put(rq); 384 count = n; 385 break; 386 } 387 } 388 389 i915_sw_fence_commit(submit); 390 i915_sw_fence_commit(wait); 391 392 if (!wait_event_timeout(wait->wait, 393 i915_sw_fence_done(wait), 394 5 * HZ)) { 395 struct i915_request *rq = requests[count - 1]; 396 397 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 398 atomic_read(&wait->pending), count, 399 rq->fence.context, rq->fence.seqno, 400 t->engine->name); 401 GEM_TRACE_DUMP(); 402 403 intel_gt_set_wedged(t->engine->gt); 404 GEM_BUG_ON(!i915_request_completed(rq)); 405 i915_sw_fence_wait(wait); 406 err = -EIO; 407 } 408 409 for (n = 0; n < count; n++) { 410 struct i915_request *rq = requests[n]; 411 412 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 413 &rq->fence.flags)) { 414 pr_err("%llu:%llu was not signaled!\n", 415 rq->fence.context, rq->fence.seqno); 416 err = -EINVAL; 417 } 418 419 i915_request_put(rq); 420 } 421 422 heap_fence_put(wait); 423 heap_fence_put(submit); 424 425 if (err < 0) 426 break; 427 428 num_fences += count; 429 num_waits++; 430 431 cond_resched(); 432 } 433 434 atomic_long_add(num_fences, &t->num_fences); 435 atomic_long_add(num_waits, &t->num_waits); 436 437 kfree(order); 438 out_requests: 439 kfree(requests); 440 return err; 441 } 442 443 static int mock_breadcrumbs_smoketest(void *arg) 444 { 445 struct drm_i915_private *i915 = arg; 446 struct smoketest t = { 447 .engine = rcs0(i915), 448 .ncontexts = 1024, 449 .max_batch = 1024, 450 .request_alloc = __mock_request_alloc 451 }; 452 unsigned int ncpus = num_online_cpus(); 453 struct task_struct **threads; 454 unsigned int n; 455 int ret = 0; 456 457 /* 458 * Smoketest our breadcrumb/signal handling for requests across multiple 459 * threads. A very simple test to only catch the most egregious of bugs. 
460 * See __igt_breadcrumbs_smoketest(); 461 */ 462 463 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 464 if (!threads) 465 return -ENOMEM; 466 467 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 468 if (!t.contexts) { 469 ret = -ENOMEM; 470 goto out_threads; 471 } 472 473 for (n = 0; n < t.ncontexts; n++) { 474 t.contexts[n] = mock_context(t.engine->i915, "mock"); 475 if (!t.contexts[n]) { 476 ret = -ENOMEM; 477 goto out_contexts; 478 } 479 } 480 481 for (n = 0; n < ncpus; n++) { 482 threads[n] = kthread_run(__igt_breadcrumbs_smoketest, 483 &t, "igt/%d", n); 484 if (IS_ERR(threads[n])) { 485 ret = PTR_ERR(threads[n]); 486 ncpus = n; 487 break; 488 } 489 490 get_task_struct(threads[n]); 491 } 492 493 yield(); /* start all threads before we begin */ 494 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 495 496 for (n = 0; n < ncpus; n++) { 497 int err; 498 499 err = kthread_stop(threads[n]); 500 if (err < 0 && !ret) 501 ret = err; 502 503 put_task_struct(threads[n]); 504 } 505 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 506 atomic_long_read(&t.num_waits), 507 atomic_long_read(&t.num_fences), 508 ncpus); 509 510 out_contexts: 511 for (n = 0; n < t.ncontexts; n++) { 512 if (!t.contexts[n]) 513 break; 514 mock_context_close(t.contexts[n]); 515 } 516 kfree(t.contexts); 517 out_threads: 518 kfree(threads); 519 return ret; 520 } 521 522 int i915_request_mock_selftests(void) 523 { 524 static const struct i915_subtest tests[] = { 525 SUBTEST(igt_add_request), 526 SUBTEST(igt_wait_request), 527 SUBTEST(igt_fence_wait), 528 SUBTEST(igt_request_rewind), 529 SUBTEST(mock_breadcrumbs_smoketest), 530 }; 531 struct drm_i915_private *i915; 532 intel_wakeref_t wakeref; 533 int err = 0; 534 535 i915 = mock_gem_device(); 536 if (!i915) 537 return -ENOMEM; 538 539 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 540 err = i915_subtests(tests, i915); 541 542 mock_destroy_device(i915); 543 544 return err; 545 } 546 547 static int live_nop_request(void *arg) 548 { 549 struct drm_i915_private *i915 = arg; 550 struct intel_engine_cs *engine; 551 struct igt_live_test t; 552 int err = -ENODEV; 553 554 /* 555 * Submit various sized batches of empty requests, to each engine 556 * (individually), and wait for the batch to complete. We can check 557 * the overhead of submitting requests to the hardware. 558 */ 559 560 for_each_uabi_engine(engine, i915) { 561 unsigned long n, prime; 562 IGT_TIMEOUT(end_time); 563 ktime_t times[2] = {}; 564 565 err = igt_live_test_begin(&t, i915, __func__, engine->name); 566 if (err) 567 return err; 568 569 intel_engine_pm_get(engine); 570 for_each_prime_number_from(prime, 1, 8192) { 571 struct i915_request *request = NULL; 572 573 times[1] = ktime_get_raw(); 574 575 for (n = 0; n < prime; n++) { 576 i915_request_put(request); 577 request = i915_request_create(engine->kernel_context); 578 if (IS_ERR(request)) 579 return PTR_ERR(request); 580 581 /* 582 * This space is left intentionally blank. 583 * 584 * We do not actually want to perform any 585 * action with this request, we just want 586 * to measure the latency in allocation 587 * and submission of our breadcrumbs - 588 * ensuring that the bare request is sufficient 589 * for the system to work (i.e. proper HEAD 590 * tracking of the rings, interrupt handling, 591 * etc). It also gives us the lowest bounds 592 * for latency. 
593 */ 594 595 i915_request_get(request); 596 i915_request_add(request); 597 } 598 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 599 i915_request_put(request); 600 601 times[1] = ktime_sub(ktime_get_raw(), times[1]); 602 if (prime == 1) 603 times[0] = times[1]; 604 605 if (__igt_timeout(end_time, NULL)) 606 break; 607 } 608 intel_engine_pm_put(engine); 609 610 err = igt_live_test_end(&t); 611 if (err) 612 return err; 613 614 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 615 engine->name, 616 ktime_to_ns(times[0]), 617 prime, div64_u64(ktime_to_ns(times[1]), prime)); 618 } 619 620 return err; 621 } 622 623 static int __cancel_inactive(struct intel_engine_cs *engine) 624 { 625 struct intel_context *ce; 626 struct igt_spinner spin; 627 struct i915_request *rq; 628 int err = 0; 629 630 if (igt_spinner_init(&spin, engine->gt)) 631 return -ENOMEM; 632 633 ce = intel_context_create(engine); 634 if (IS_ERR(ce)) { 635 err = PTR_ERR(ce); 636 goto out_spin; 637 } 638 639 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 640 if (IS_ERR(rq)) { 641 err = PTR_ERR(rq); 642 goto out_ce; 643 } 644 645 pr_debug("%s: Cancelling inactive request\n", engine->name); 646 i915_request_cancel(rq, -EINTR); 647 i915_request_get(rq); 648 i915_request_add(rq); 649 650 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 651 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 652 653 pr_err("%s: Failed to cancel inactive request\n", engine->name); 654 intel_engine_dump(engine, &p, "%s\n", engine->name); 655 err = -ETIME; 656 goto out_rq; 657 } 658 659 if (rq->fence.error != -EINTR) { 660 pr_err("%s: fence not cancelled (%u)\n", 661 engine->name, rq->fence.error); 662 err = -EINVAL; 663 } 664 665 out_rq: 666 i915_request_put(rq); 667 out_ce: 668 intel_context_put(ce); 669 out_spin: 670 igt_spinner_fini(&spin); 671 if (err) 672 pr_err("%s: %s error %d\n", __func__, engine->name, err); 673 return err; 674 } 675 676 static int __cancel_active(struct intel_engine_cs *engine) 677 { 678 struct intel_context *ce; 679 struct igt_spinner spin; 680 struct i915_request *rq; 681 int err = 0; 682 683 if (igt_spinner_init(&spin, engine->gt)) 684 return -ENOMEM; 685 686 ce = intel_context_create(engine); 687 if (IS_ERR(ce)) { 688 err = PTR_ERR(ce); 689 goto out_spin; 690 } 691 692 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 693 if (IS_ERR(rq)) { 694 err = PTR_ERR(rq); 695 goto out_ce; 696 } 697 698 pr_debug("%s: Cancelling active request\n", engine->name); 699 i915_request_get(rq); 700 i915_request_add(rq); 701 if (!igt_wait_for_spinner(&spin, rq)) { 702 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 703 704 pr_err("Failed to start spinner on %s\n", engine->name); 705 intel_engine_dump(engine, &p, "%s\n", engine->name); 706 err = -ETIME; 707 goto out_rq; 708 } 709 i915_request_cancel(rq, -EINTR); 710 711 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 712 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 713 714 pr_err("%s: Failed to cancel active request\n", engine->name); 715 intel_engine_dump(engine, &p, "%s\n", engine->name); 716 err = -ETIME; 717 goto out_rq; 718 } 719 720 if (rq->fence.error != -EINTR) { 721 pr_err("%s: fence not cancelled (%u)\n", 722 engine->name, rq->fence.error); 723 err = -EINVAL; 724 } 725 726 out_rq: 727 i915_request_put(rq); 728 out_ce: 729 intel_context_put(ce); 730 out_spin: 731 igt_spinner_fini(&spin); 732 if (err) 733 pr_err("%s: %s error %d\n", __func__, engine->name, err); 734 return err; 735 } 736 737 static int 
__cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	igt_spinner_end(&spin);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	if (rq->fence.error) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * The testing methodology is to create a non-preemptible request and submit
 * it, wait for the spinner to start, create a NOP request and submit it,
 * cancel the spinner, wait for the spinner to complete and verify it failed
 * with an error, and finally wait for the NOP request to complete and verify
 * it succeeded without an error. The preemption timeout is also reduced /
 * restored so that the test runs in a timely manner.
 */
static int __cancel_reset(struct drm_i915_private *i915,
			  struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq, *nop;
	unsigned long preempt_timeout_ms;
	int err = 0;

	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
	    !intel_has_reset_engine(engine->gt))
		return 0;

	preempt_timeout_ms = engine->props.preempt_timeout_ms;
	engine->props.preempt_timeout_ms = 100;

	if (igt_spinner_init(&spin, engine->gt))
		goto out_restore;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	pr_debug("%s: Cancelling active non-preemptable request\n",
		 engine->name);
	i915_request_get(rq);
	i915_request_add(rq);
	if (!igt_wait_for_spinner(&spin, rq)) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("Failed to start spinner on %s\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	nop = intel_context_create_request(ce);
	if (IS_ERR(nop)) {
		err = PTR_ERR(nop);
		goto out_rq;
	}
	i915_request_get(nop);
	i915_request_add(nop);

	i915_request_cancel(rq, -EINTR);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel hung request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
		goto out_nop;
	}

	if
(i915_request_wait(nop, 0, HZ) < 0) { 866 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 867 868 pr_err("%s: Failed to complete nop request\n", engine->name); 869 intel_engine_dump(engine, &p, "%s\n", engine->name); 870 err = -ETIME; 871 goto out_nop; 872 } 873 874 if (nop->fence.error != 0) { 875 pr_err("%s: Nop request errored (%u)\n", 876 engine->name, nop->fence.error); 877 err = -EINVAL; 878 } 879 880 out_nop: 881 i915_request_put(nop); 882 out_rq: 883 i915_request_put(rq); 884 out_ce: 885 intel_context_put(ce); 886 out_spin: 887 igt_spinner_fini(&spin); 888 out_restore: 889 engine->props.preempt_timeout_ms = preempt_timeout_ms; 890 if (err) 891 pr_err("%s: %s error %d\n", __func__, engine->name, err); 892 return err; 893 } 894 895 static int live_cancel_request(void *arg) 896 { 897 struct drm_i915_private *i915 = arg; 898 struct intel_engine_cs *engine; 899 900 /* 901 * Check cancellation of requests. We expect to be able to immediately 902 * cancel active requests, even if they are currently on the GPU. 903 */ 904 905 for_each_uabi_engine(engine, i915) { 906 struct igt_live_test t; 907 int err, err2; 908 909 if (!intel_engine_has_preemption(engine)) 910 continue; 911 912 err = igt_live_test_begin(&t, i915, __func__, engine->name); 913 if (err) 914 return err; 915 916 err = __cancel_inactive(engine); 917 if (err == 0) 918 err = __cancel_active(engine); 919 if (err == 0) 920 err = __cancel_completed(engine); 921 922 err2 = igt_live_test_end(&t); 923 if (err) 924 return err; 925 if (err2) 926 return err2; 927 928 /* Expects reset so call outside of igt_live_test_* */ 929 err = __cancel_reset(i915, engine); 930 if (err) 931 return err; 932 933 if (igt_flush_test(i915)) 934 return -EIO; 935 } 936 937 return 0; 938 } 939 940 static struct i915_vma *empty_batch(struct drm_i915_private *i915) 941 { 942 struct drm_i915_gem_object *obj; 943 struct i915_vma *vma; 944 u32 *cmd; 945 int err; 946 947 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 948 if (IS_ERR(obj)) 949 return ERR_CAST(obj); 950 951 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB); 952 if (IS_ERR(cmd)) { 953 err = PTR_ERR(cmd); 954 goto err; 955 } 956 957 *cmd = MI_BATCH_BUFFER_END; 958 959 __i915_gem_object_flush_map(obj, 0, 64); 960 i915_gem_object_unpin_map(obj); 961 962 intel_gt_chipset_flush(to_gt(i915)); 963 964 vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL); 965 if (IS_ERR(vma)) { 966 err = PTR_ERR(vma); 967 goto err; 968 } 969 970 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); 971 if (err) 972 goto err; 973 974 /* Force the wait now to avoid including it in the benchmark */ 975 err = i915_vma_sync(vma); 976 if (err) 977 goto err_pin; 978 979 return vma; 980 981 err_pin: 982 i915_vma_unpin(vma); 983 err: 984 i915_gem_object_put(obj); 985 return ERR_PTR(err); 986 } 987 988 static struct i915_request * 989 empty_request(struct intel_engine_cs *engine, 990 struct i915_vma *batch) 991 { 992 struct i915_request *request; 993 int err; 994 995 request = i915_request_create(engine->kernel_context); 996 if (IS_ERR(request)) 997 return request; 998 999 err = engine->emit_bb_start(request, 1000 batch->node.start, 1001 batch->node.size, 1002 I915_DISPATCH_SECURE); 1003 if (err) 1004 goto out_request; 1005 1006 i915_request_get(request); 1007 out_request: 1008 i915_request_add(request); 1009 return err ? 
ERR_PTR(err) : request; 1010 } 1011 1012 static int live_empty_request(void *arg) 1013 { 1014 struct drm_i915_private *i915 = arg; 1015 struct intel_engine_cs *engine; 1016 struct igt_live_test t; 1017 struct i915_vma *batch; 1018 int err = 0; 1019 1020 /* 1021 * Submit various sized batches of empty requests, to each engine 1022 * (individually), and wait for the batch to complete. We can check 1023 * the overhead of submitting requests to the hardware. 1024 */ 1025 1026 batch = empty_batch(i915); 1027 if (IS_ERR(batch)) 1028 return PTR_ERR(batch); 1029 1030 for_each_uabi_engine(engine, i915) { 1031 IGT_TIMEOUT(end_time); 1032 struct i915_request *request; 1033 unsigned long n, prime; 1034 ktime_t times[2] = {}; 1035 1036 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1037 if (err) 1038 goto out_batch; 1039 1040 intel_engine_pm_get(engine); 1041 1042 /* Warmup / preload */ 1043 request = empty_request(engine, batch); 1044 if (IS_ERR(request)) { 1045 err = PTR_ERR(request); 1046 intel_engine_pm_put(engine); 1047 goto out_batch; 1048 } 1049 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1050 1051 for_each_prime_number_from(prime, 1, 8192) { 1052 times[1] = ktime_get_raw(); 1053 1054 for (n = 0; n < prime; n++) { 1055 i915_request_put(request); 1056 request = empty_request(engine, batch); 1057 if (IS_ERR(request)) { 1058 err = PTR_ERR(request); 1059 intel_engine_pm_put(engine); 1060 goto out_batch; 1061 } 1062 } 1063 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1064 1065 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1066 if (prime == 1) 1067 times[0] = times[1]; 1068 1069 if (__igt_timeout(end_time, NULL)) 1070 break; 1071 } 1072 i915_request_put(request); 1073 intel_engine_pm_put(engine); 1074 1075 err = igt_live_test_end(&t); 1076 if (err) 1077 goto out_batch; 1078 1079 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1080 engine->name, 1081 ktime_to_ns(times[0]), 1082 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1083 } 1084 1085 out_batch: 1086 i915_vma_unpin(batch); 1087 i915_vma_put(batch); 1088 return err; 1089 } 1090 1091 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 1092 { 1093 struct drm_i915_gem_object *obj; 1094 const int ver = GRAPHICS_VER(i915); 1095 struct i915_vma *vma; 1096 u32 *cmd; 1097 int err; 1098 1099 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 1100 if (IS_ERR(obj)) 1101 return ERR_CAST(obj); 1102 1103 vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL); 1104 if (IS_ERR(vma)) { 1105 err = PTR_ERR(vma); 1106 goto err; 1107 } 1108 1109 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1110 if (err) 1111 goto err; 1112 1113 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1114 if (IS_ERR(cmd)) { 1115 err = PTR_ERR(cmd); 1116 goto err; 1117 } 1118 1119 if (ver >= 8) { 1120 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1121 *cmd++ = lower_32_bits(vma->node.start); 1122 *cmd++ = upper_32_bits(vma->node.start); 1123 } else if (ver >= 6) { 1124 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1125 *cmd++ = lower_32_bits(vma->node.start); 1126 } else { 1127 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1128 *cmd++ = lower_32_bits(vma->node.start); 1129 } 1130 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1131 1132 __i915_gem_object_flush_map(obj, 0, 64); 1133 i915_gem_object_unpin_map(obj); 1134 1135 intel_gt_chipset_flush(to_gt(i915)); 1136 1137 return vma; 1138 1139 err: 1140 i915_gem_object_put(obj); 1141 return ERR_PTR(err); 1142 } 1143 1144 static int 
recursive_batch_resolve(struct i915_vma *batch) 1145 { 1146 u32 *cmd; 1147 1148 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1149 if (IS_ERR(cmd)) 1150 return PTR_ERR(cmd); 1151 1152 *cmd = MI_BATCH_BUFFER_END; 1153 1154 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1155 i915_gem_object_unpin_map(batch->obj); 1156 1157 intel_gt_chipset_flush(batch->vm->gt); 1158 1159 return 0; 1160 } 1161 1162 static int live_all_engines(void *arg) 1163 { 1164 struct drm_i915_private *i915 = arg; 1165 const unsigned int nengines = num_uabi_engines(i915); 1166 struct intel_engine_cs *engine; 1167 struct i915_request **request; 1168 struct igt_live_test t; 1169 struct i915_vma *batch; 1170 unsigned int idx; 1171 int err; 1172 1173 /* 1174 * Check we can submit requests to all engines simultaneously. We 1175 * send a recursive batch to each engine - checking that we don't 1176 * block doing so, and that they don't complete too soon. 1177 */ 1178 1179 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1180 if (!request) 1181 return -ENOMEM; 1182 1183 err = igt_live_test_begin(&t, i915, __func__, ""); 1184 if (err) 1185 goto out_free; 1186 1187 batch = recursive_batch(i915); 1188 if (IS_ERR(batch)) { 1189 err = PTR_ERR(batch); 1190 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 1191 goto out_free; 1192 } 1193 1194 i915_vma_lock(batch); 1195 1196 idx = 0; 1197 for_each_uabi_engine(engine, i915) { 1198 request[idx] = intel_engine_create_kernel_request(engine); 1199 if (IS_ERR(request[idx])) { 1200 err = PTR_ERR(request[idx]); 1201 pr_err("%s: Request allocation failed with err=%d\n", 1202 __func__, err); 1203 goto out_request; 1204 } 1205 1206 err = i915_request_await_object(request[idx], batch->obj, 0); 1207 if (err == 0) 1208 err = i915_vma_move_to_active(batch, request[idx], 0); 1209 GEM_BUG_ON(err); 1210 1211 err = engine->emit_bb_start(request[idx], 1212 batch->node.start, 1213 batch->node.size, 1214 0); 1215 GEM_BUG_ON(err); 1216 request[idx]->batch = batch; 1217 1218 i915_request_get(request[idx]); 1219 i915_request_add(request[idx]); 1220 idx++; 1221 } 1222 1223 i915_vma_unlock(batch); 1224 1225 idx = 0; 1226 for_each_uabi_engine(engine, i915) { 1227 if (i915_request_completed(request[idx])) { 1228 pr_err("%s(%s): request completed too early!\n", 1229 __func__, engine->name); 1230 err = -EINVAL; 1231 goto out_request; 1232 } 1233 idx++; 1234 } 1235 1236 err = recursive_batch_resolve(batch); 1237 if (err) { 1238 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 1239 goto out_request; 1240 } 1241 1242 idx = 0; 1243 for_each_uabi_engine(engine, i915) { 1244 long timeout; 1245 1246 timeout = i915_request_wait(request[idx], 0, 1247 MAX_SCHEDULE_TIMEOUT); 1248 if (timeout < 0) { 1249 err = timeout; 1250 pr_err("%s: error waiting for request on %s, err=%d\n", 1251 __func__, engine->name, err); 1252 goto out_request; 1253 } 1254 1255 GEM_BUG_ON(!i915_request_completed(request[idx])); 1256 i915_request_put(request[idx]); 1257 request[idx] = NULL; 1258 idx++; 1259 } 1260 1261 err = igt_live_test_end(&t); 1262 1263 out_request: 1264 idx = 0; 1265 for_each_uabi_engine(engine, i915) { 1266 if (request[idx]) 1267 i915_request_put(request[idx]); 1268 idx++; 1269 } 1270 i915_vma_unpin(batch); 1271 i915_vma_put(batch); 1272 out_free: 1273 kfree(request); 1274 return err; 1275 } 1276 1277 static int live_sequential_engines(void *arg) 1278 { 1279 struct drm_i915_private *i915 = arg; 1280 const unsigned int nengines = num_uabi_engines(i915); 1281 struct 
i915_request **request; 1282 struct i915_request *prev = NULL; 1283 struct intel_engine_cs *engine; 1284 struct igt_live_test t; 1285 unsigned int idx; 1286 int err; 1287 1288 /* 1289 * Check we can submit requests to all engines sequentially, such 1290 * that each successive request waits for the earlier ones. This 1291 * tests that we don't execute requests out of order, even though 1292 * they are running on independent engines. 1293 */ 1294 1295 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1296 if (!request) 1297 return -ENOMEM; 1298 1299 err = igt_live_test_begin(&t, i915, __func__, ""); 1300 if (err) 1301 goto out_free; 1302 1303 idx = 0; 1304 for_each_uabi_engine(engine, i915) { 1305 struct i915_vma *batch; 1306 1307 batch = recursive_batch(i915); 1308 if (IS_ERR(batch)) { 1309 err = PTR_ERR(batch); 1310 pr_err("%s: Unable to create batch for %s, err=%d\n", 1311 __func__, engine->name, err); 1312 goto out_free; 1313 } 1314 1315 i915_vma_lock(batch); 1316 request[idx] = intel_engine_create_kernel_request(engine); 1317 if (IS_ERR(request[idx])) { 1318 err = PTR_ERR(request[idx]); 1319 pr_err("%s: Request allocation failed for %s with err=%d\n", 1320 __func__, engine->name, err); 1321 goto out_unlock; 1322 } 1323 1324 if (prev) { 1325 err = i915_request_await_dma_fence(request[idx], 1326 &prev->fence); 1327 if (err) { 1328 i915_request_add(request[idx]); 1329 pr_err("%s: Request await failed for %s with err=%d\n", 1330 __func__, engine->name, err); 1331 goto out_unlock; 1332 } 1333 } 1334 1335 err = i915_request_await_object(request[idx], 1336 batch->obj, false); 1337 if (err == 0) 1338 err = i915_vma_move_to_active(batch, request[idx], 0); 1339 GEM_BUG_ON(err); 1340 1341 err = engine->emit_bb_start(request[idx], 1342 batch->node.start, 1343 batch->node.size, 1344 0); 1345 GEM_BUG_ON(err); 1346 request[idx]->batch = batch; 1347 1348 i915_request_get(request[idx]); 1349 i915_request_add(request[idx]); 1350 1351 prev = request[idx]; 1352 idx++; 1353 1354 out_unlock: 1355 i915_vma_unlock(batch); 1356 if (err) 1357 goto out_request; 1358 } 1359 1360 idx = 0; 1361 for_each_uabi_engine(engine, i915) { 1362 long timeout; 1363 1364 if (i915_request_completed(request[idx])) { 1365 pr_err("%s(%s): request completed too early!\n", 1366 __func__, engine->name); 1367 err = -EINVAL; 1368 goto out_request; 1369 } 1370 1371 err = recursive_batch_resolve(request[idx]->batch); 1372 if (err) { 1373 pr_err("%s: failed to resolve batch, err=%d\n", 1374 __func__, err); 1375 goto out_request; 1376 } 1377 1378 timeout = i915_request_wait(request[idx], 0, 1379 MAX_SCHEDULE_TIMEOUT); 1380 if (timeout < 0) { 1381 err = timeout; 1382 pr_err("%s: error waiting for request on %s, err=%d\n", 1383 __func__, engine->name, err); 1384 goto out_request; 1385 } 1386 1387 GEM_BUG_ON(!i915_request_completed(request[idx])); 1388 idx++; 1389 } 1390 1391 err = igt_live_test_end(&t); 1392 1393 out_request: 1394 idx = 0; 1395 for_each_uabi_engine(engine, i915) { 1396 u32 *cmd; 1397 1398 if (!request[idx]) 1399 break; 1400 1401 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1402 I915_MAP_WC); 1403 if (!IS_ERR(cmd)) { 1404 *cmd = MI_BATCH_BUFFER_END; 1405 1406 __i915_gem_object_flush_map(request[idx]->batch->obj, 1407 0, sizeof(*cmd)); 1408 i915_gem_object_unpin_map(request[idx]->batch->obj); 1409 1410 intel_gt_chipset_flush(engine->gt); 1411 } 1412 1413 i915_vma_put(request[idx]->batch); 1414 i915_request_put(request[idx]); 1415 idx++; 1416 } 1417 out_free: 1418 kfree(request); 1419 return err; 
1420 } 1421 1422 static int __live_parallel_engine1(void *arg) 1423 { 1424 struct intel_engine_cs *engine = arg; 1425 IGT_TIMEOUT(end_time); 1426 unsigned long count; 1427 int err = 0; 1428 1429 count = 0; 1430 intel_engine_pm_get(engine); 1431 do { 1432 struct i915_request *rq; 1433 1434 rq = i915_request_create(engine->kernel_context); 1435 if (IS_ERR(rq)) { 1436 err = PTR_ERR(rq); 1437 break; 1438 } 1439 1440 i915_request_get(rq); 1441 i915_request_add(rq); 1442 1443 err = 0; 1444 if (i915_request_wait(rq, 0, HZ) < 0) 1445 err = -ETIME; 1446 i915_request_put(rq); 1447 if (err) 1448 break; 1449 1450 count++; 1451 } while (!__igt_timeout(end_time, NULL)); 1452 intel_engine_pm_put(engine); 1453 1454 pr_info("%s: %lu request + sync\n", engine->name, count); 1455 return err; 1456 } 1457 1458 static int __live_parallel_engineN(void *arg) 1459 { 1460 struct intel_engine_cs *engine = arg; 1461 IGT_TIMEOUT(end_time); 1462 unsigned long count; 1463 int err = 0; 1464 1465 count = 0; 1466 intel_engine_pm_get(engine); 1467 do { 1468 struct i915_request *rq; 1469 1470 rq = i915_request_create(engine->kernel_context); 1471 if (IS_ERR(rq)) { 1472 err = PTR_ERR(rq); 1473 break; 1474 } 1475 1476 i915_request_add(rq); 1477 count++; 1478 } while (!__igt_timeout(end_time, NULL)); 1479 intel_engine_pm_put(engine); 1480 1481 pr_info("%s: %lu requests\n", engine->name, count); 1482 return err; 1483 } 1484 1485 static bool wake_all(struct drm_i915_private *i915) 1486 { 1487 if (atomic_dec_and_test(&i915->selftest.counter)) { 1488 wake_up_var(&i915->selftest.counter); 1489 return true; 1490 } 1491 1492 return false; 1493 } 1494 1495 static int wait_for_all(struct drm_i915_private *i915) 1496 { 1497 if (wake_all(i915)) 1498 return 0; 1499 1500 if (wait_var_event_timeout(&i915->selftest.counter, 1501 !atomic_read(&i915->selftest.counter), 1502 i915_selftest.timeout_jiffies)) 1503 return 0; 1504 1505 return -ETIME; 1506 } 1507 1508 static int __live_parallel_spin(void *arg) 1509 { 1510 struct intel_engine_cs *engine = arg; 1511 struct igt_spinner spin; 1512 struct i915_request *rq; 1513 int err = 0; 1514 1515 /* 1516 * Create a spinner running for eternity on each engine. If a second 1517 * spinner is incorrectly placed on the same engine, it will not be 1518 * able to start in time. 
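	 *
	 * i915->selftest.counter is primed with the engine count by the
	 * caller; each thread decrements it (via wait_for_all/wake_all) once
	 * its spinner is running or it has given up, and the last thread to
	 * arrive wakes all the others. The test therefore only completes
	 * once every engine is simultaneously occupied by its own spinner.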
1519 */ 1520 1521 if (igt_spinner_init(&spin, engine->gt)) { 1522 wake_all(engine->i915); 1523 return -ENOMEM; 1524 } 1525 1526 intel_engine_pm_get(engine); 1527 rq = igt_spinner_create_request(&spin, 1528 engine->kernel_context, 1529 MI_NOOP); /* no preemption */ 1530 intel_engine_pm_put(engine); 1531 if (IS_ERR(rq)) { 1532 err = PTR_ERR(rq); 1533 if (err == -ENODEV) 1534 err = 0; 1535 wake_all(engine->i915); 1536 goto out_spin; 1537 } 1538 1539 i915_request_get(rq); 1540 i915_request_add(rq); 1541 if (igt_wait_for_spinner(&spin, rq)) { 1542 /* Occupy this engine for the whole test */ 1543 err = wait_for_all(engine->i915); 1544 } else { 1545 pr_err("Failed to start spinner on %s\n", engine->name); 1546 err = -EINVAL; 1547 } 1548 igt_spinner_end(&spin); 1549 1550 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1551 err = -EIO; 1552 i915_request_put(rq); 1553 1554 out_spin: 1555 igt_spinner_fini(&spin); 1556 return err; 1557 } 1558 1559 static int live_parallel_engines(void *arg) 1560 { 1561 struct drm_i915_private *i915 = arg; 1562 static int (* const func[])(void *arg) = { 1563 __live_parallel_engine1, 1564 __live_parallel_engineN, 1565 __live_parallel_spin, 1566 NULL, 1567 }; 1568 const unsigned int nengines = num_uabi_engines(i915); 1569 struct intel_engine_cs *engine; 1570 int (* const *fn)(void *arg); 1571 struct task_struct **tsk; 1572 int err = 0; 1573 1574 /* 1575 * Check we can submit requests to all engines concurrently. This 1576 * tests that we load up the system maximally. 1577 */ 1578 1579 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL); 1580 if (!tsk) 1581 return -ENOMEM; 1582 1583 for (fn = func; !err && *fn; fn++) { 1584 char name[KSYM_NAME_LEN]; 1585 struct igt_live_test t; 1586 unsigned int idx; 1587 1588 snprintf(name, sizeof(name), "%ps", *fn); 1589 err = igt_live_test_begin(&t, i915, __func__, name); 1590 if (err) 1591 break; 1592 1593 atomic_set(&i915->selftest.counter, nengines); 1594 1595 idx = 0; 1596 for_each_uabi_engine(engine, i915) { 1597 tsk[idx] = kthread_run(*fn, engine, 1598 "igt/parallel:%s", 1599 engine->name); 1600 if (IS_ERR(tsk[idx])) { 1601 err = PTR_ERR(tsk[idx]); 1602 break; 1603 } 1604 get_task_struct(tsk[idx++]); 1605 } 1606 1607 yield(); /* start all threads before we kthread_stop() */ 1608 1609 idx = 0; 1610 for_each_uabi_engine(engine, i915) { 1611 int status; 1612 1613 if (IS_ERR(tsk[idx])) 1614 break; 1615 1616 status = kthread_stop(tsk[idx]); 1617 if (status && !err) 1618 err = status; 1619 1620 put_task_struct(tsk[idx++]); 1621 } 1622 1623 if (igt_live_test_end(&t)) 1624 err = -EIO; 1625 } 1626 1627 kfree(tsk); 1628 return err; 1629 } 1630 1631 static int 1632 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1633 { 1634 struct i915_request *rq; 1635 int ret; 1636 1637 /* 1638 * Before execlists, all contexts share the same ringbuffer. With 1639 * execlists, each context/engine has a separate ringbuffer and 1640 * for the purposes of this test, inexhaustible. 1641 * 1642 * For the global ringbuffer though, we have to be very careful 1643 * that we do not wrap while preventing the execution of requests 1644 * with a unsignaled fence. 
1645 */ 1646 if (HAS_EXECLISTS(ctx->i915)) 1647 return INT_MAX; 1648 1649 rq = igt_request_alloc(ctx, engine); 1650 if (IS_ERR(rq)) { 1651 ret = PTR_ERR(rq); 1652 } else { 1653 int sz; 1654 1655 ret = rq->ring->size - rq->reserved_space; 1656 i915_request_add(rq); 1657 1658 sz = rq->ring->emit - rq->head; 1659 if (sz < 0) 1660 sz += rq->ring->size; 1661 ret /= sz; 1662 ret /= 2; /* leave half spare, in case of emergency! */ 1663 } 1664 1665 return ret; 1666 } 1667 1668 static int live_breadcrumbs_smoketest(void *arg) 1669 { 1670 struct drm_i915_private *i915 = arg; 1671 const unsigned int nengines = num_uabi_engines(i915); 1672 const unsigned int ncpus = num_online_cpus(); 1673 unsigned long num_waits, num_fences; 1674 struct intel_engine_cs *engine; 1675 struct task_struct **threads; 1676 struct igt_live_test live; 1677 intel_wakeref_t wakeref; 1678 struct smoketest *smoke; 1679 unsigned int n, idx; 1680 struct file *file; 1681 int ret = 0; 1682 1683 /* 1684 * Smoketest our breadcrumb/signal handling for requests across multiple 1685 * threads. A very simple test to only catch the most egregious of bugs. 1686 * See __igt_breadcrumbs_smoketest(); 1687 * 1688 * On real hardware this time. 1689 */ 1690 1691 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1692 1693 file = mock_file(i915); 1694 if (IS_ERR(file)) { 1695 ret = PTR_ERR(file); 1696 goto out_rpm; 1697 } 1698 1699 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1700 if (!smoke) { 1701 ret = -ENOMEM; 1702 goto out_file; 1703 } 1704 1705 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1706 if (!threads) { 1707 ret = -ENOMEM; 1708 goto out_smoke; 1709 } 1710 1711 smoke[0].request_alloc = __live_request_alloc; 1712 smoke[0].ncontexts = 64; 1713 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1714 sizeof(*smoke[0].contexts), 1715 GFP_KERNEL); 1716 if (!smoke[0].contexts) { 1717 ret = -ENOMEM; 1718 goto out_threads; 1719 } 1720 1721 for (n = 0; n < smoke[0].ncontexts; n++) { 1722 smoke[0].contexts[n] = live_context(i915, file); 1723 if (IS_ERR(smoke[0].contexts[n])) { 1724 ret = PTR_ERR(smoke[0].contexts[n]); 1725 goto out_contexts; 1726 } 1727 } 1728 1729 ret = igt_live_test_begin(&live, i915, __func__, ""); 1730 if (ret) 1731 goto out_contexts; 1732 1733 idx = 0; 1734 for_each_uabi_engine(engine, i915) { 1735 smoke[idx] = smoke[0]; 1736 smoke[idx].engine = engine; 1737 smoke[idx].max_batch = 1738 max_batches(smoke[0].contexts[0], engine); 1739 if (smoke[idx].max_batch < 0) { 1740 ret = smoke[idx].max_batch; 1741 goto out_flush; 1742 } 1743 /* One ring interleaved between requests from all cpus */ 1744 smoke[idx].max_batch /= num_online_cpus() + 1; 1745 pr_debug("Limiting batches to %d requests on %s\n", 1746 smoke[idx].max_batch, engine->name); 1747 1748 for (n = 0; n < ncpus; n++) { 1749 struct task_struct *tsk; 1750 1751 tsk = kthread_run(__igt_breadcrumbs_smoketest, 1752 &smoke[idx], "igt/%d.%d", idx, n); 1753 if (IS_ERR(tsk)) { 1754 ret = PTR_ERR(tsk); 1755 goto out_flush; 1756 } 1757 1758 get_task_struct(tsk); 1759 threads[idx * ncpus + n] = tsk; 1760 } 1761 1762 idx++; 1763 } 1764 1765 yield(); /* start all threads before we begin */ 1766 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1767 1768 out_flush: 1769 idx = 0; 1770 num_waits = 0; 1771 num_fences = 0; 1772 for_each_uabi_engine(engine, i915) { 1773 for (n = 0; n < ncpus; n++) { 1774 struct task_struct *tsk = threads[idx * ncpus + n]; 1775 int err; 1776 1777 if (!tsk) 1778 continue; 1779 1780 err = kthread_stop(tsk); 1781 if (err < 0 && 
!ret) 1782 ret = err; 1783 1784 put_task_struct(tsk); 1785 } 1786 1787 num_waits += atomic_long_read(&smoke[idx].num_waits); 1788 num_fences += atomic_long_read(&smoke[idx].num_fences); 1789 idx++; 1790 } 1791 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1792 num_waits, num_fences, idx, ncpus); 1793 1794 ret = igt_live_test_end(&live) ?: ret; 1795 out_contexts: 1796 kfree(smoke[0].contexts); 1797 out_threads: 1798 kfree(threads); 1799 out_smoke: 1800 kfree(smoke); 1801 out_file: 1802 fput(file); 1803 out_rpm: 1804 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1805 1806 return ret; 1807 } 1808 1809 int i915_request_live_selftests(struct drm_i915_private *i915) 1810 { 1811 static const struct i915_subtest tests[] = { 1812 SUBTEST(live_nop_request), 1813 SUBTEST(live_all_engines), 1814 SUBTEST(live_sequential_engines), 1815 SUBTEST(live_parallel_engines), 1816 SUBTEST(live_empty_request), 1817 SUBTEST(live_cancel_request), 1818 SUBTEST(live_breadcrumbs_smoketest), 1819 }; 1820 1821 if (intel_gt_is_wedged(to_gt(i915))) 1822 return 0; 1823 1824 return i915_live_subtests(tests, i915); 1825 } 1826 1827 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1828 { 1829 struct i915_request *rq; 1830 struct dma_fence *fence; 1831 1832 rq = intel_engine_create_kernel_request(ce->engine); 1833 if (IS_ERR(rq)) 1834 return PTR_ERR(rq); 1835 1836 fence = i915_active_fence_get(&ce->timeline->last_request); 1837 if (fence) { 1838 i915_request_await_dma_fence(rq, fence); 1839 dma_fence_put(fence); 1840 } 1841 1842 rq = i915_request_get(rq); 1843 i915_request_add(rq); 1844 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1845 err = -ETIME; 1846 i915_request_put(rq); 1847 1848 while (!err && !intel_engine_is_idle(ce->engine)) 1849 intel_engine_flush_submission(ce->engine); 1850 1851 return err; 1852 } 1853 1854 struct perf_stats { 1855 struct intel_engine_cs *engine; 1856 unsigned long count; 1857 ktime_t time; 1858 ktime_t busy; 1859 u64 runtime; 1860 }; 1861 1862 struct perf_series { 1863 struct drm_i915_private *i915; 1864 unsigned int nengines; 1865 struct intel_context *ce[]; 1866 }; 1867 1868 static int cmp_u32(const void *A, const void *B) 1869 { 1870 const u32 *a = A, *b = B; 1871 1872 return *a - *b; 1873 } 1874 1875 static u32 trifilter(u32 *a) 1876 { 1877 u64 sum; 1878 1879 #define TF_COUNT 5 1880 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1881 1882 sum = mul_u32_u32(a[2], 2); 1883 sum += a[1]; 1884 sum += a[3]; 1885 1886 GEM_BUG_ON(sum > U32_MAX); 1887 return sum; 1888 #define TF_BIAS 2 1889 } 1890 1891 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1892 { 1893 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1894 1895 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1896 } 1897 1898 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1899 { 1900 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1901 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1902 *cs++ = offset; 1903 *cs++ = 0; 1904 1905 return cs; 1906 } 1907 1908 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1909 { 1910 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1911 *cs++ = offset; 1912 *cs++ = 0; 1913 *cs++ = value; 1914 1915 return cs; 1916 } 1917 1918 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1919 { 1920 *cs++ = MI_SEMAPHORE_WAIT | 1921 MI_SEMAPHORE_GLOBAL_GTT | 1922 MI_SEMAPHORE_POLL | 1923 mode; 1924 *cs++ = value; 1925 *cs++ = offset; 1926 *cs++ = 0; 1927 
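	/*
	 * The four dwords emitted above form a single MI_SEMAPHORE_WAIT in
	 * polling mode: the command header (including the compare operation
	 * from the mode argument), the semaphore data to compare against,
	 * and the low/high halves of the GGTT address to poll.
	 */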
1928 return cs; 1929 } 1930 1931 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1932 { 1933 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1934 } 1935 1936 static void semaphore_set(u32 *sema, u32 value) 1937 { 1938 WRITE_ONCE(*sema, value); 1939 wmb(); /* flush the update to the cache, and beyond */ 1940 } 1941 1942 static u32 *hwsp_scratch(const struct intel_context *ce) 1943 { 1944 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 1945 } 1946 1947 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 1948 { 1949 return (i915_ggtt_offset(ce->engine->status_page.vma) + 1950 offset_in_page(dw)); 1951 } 1952 1953 static int measure_semaphore_response(struct intel_context *ce) 1954 { 1955 u32 *sema = hwsp_scratch(ce); 1956 const u32 offset = hwsp_offset(ce, sema); 1957 u32 elapsed[TF_COUNT], cycles; 1958 struct i915_request *rq; 1959 u32 *cs; 1960 int err; 1961 int i; 1962 1963 /* 1964 * Measure how many cycles it takes for the HW to detect the change 1965 * in a semaphore value. 1966 * 1967 * A: read CS_TIMESTAMP from CPU 1968 * poke semaphore 1969 * B: read CS_TIMESTAMP on GPU 1970 * 1971 * Semaphore latency: B - A 1972 */ 1973 1974 semaphore_set(sema, -1); 1975 1976 rq = i915_request_create(ce); 1977 if (IS_ERR(rq)) 1978 return PTR_ERR(rq); 1979 1980 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 1981 if (IS_ERR(cs)) { 1982 i915_request_add(rq); 1983 err = PTR_ERR(cs); 1984 goto err; 1985 } 1986 1987 cs = emit_store_dw(cs, offset, 0); 1988 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1989 cs = emit_semaphore_poll_until(cs, offset, i); 1990 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1991 cs = emit_store_dw(cs, offset, 0); 1992 } 1993 1994 intel_ring_advance(rq, cs); 1995 i915_request_add(rq); 1996 1997 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1998 err = -EIO; 1999 goto err; 2000 } 2001 2002 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2003 preempt_disable(); 2004 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2005 semaphore_set(sema, i); 2006 preempt_enable(); 2007 2008 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2009 err = -EIO; 2010 goto err; 2011 } 2012 2013 elapsed[i - 1] = sema[i] - cycles; 2014 } 2015 2016 cycles = trifilter(elapsed); 2017 pr_info("%s: semaphore response %d cycles, %lluns\n", 2018 ce->engine->name, cycles >> TF_BIAS, 2019 cycles_to_ns(ce->engine, cycles)); 2020 2021 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2022 2023 err: 2024 intel_gt_set_wedged(ce->engine->gt); 2025 return err; 2026 } 2027 2028 static int measure_idle_dispatch(struct intel_context *ce) 2029 { 2030 u32 *sema = hwsp_scratch(ce); 2031 const u32 offset = hwsp_offset(ce, sema); 2032 u32 elapsed[TF_COUNT], cycles; 2033 u32 *cs; 2034 int err; 2035 int i; 2036 2037 /* 2038 * Measure how long it takes for us to submit a request while the 2039 * engine is idle, but is resting in our context. 
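	 *
	 * The CPU-side timestamp read and i915_request_add() below are
	 * bracketed by preempt_disable()/local_bh_disable() so that the
	 * measurement is not inflated by the submitting thread being
	 * interrupted between sampling RING_TIMESTAMP and kicking the
	 * hardware.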
2040 * 2041 * A: read CS_TIMESTAMP from CPU 2042 * submit request 2043 * B: read CS_TIMESTAMP on GPU 2044 * 2045 * Submission latency: B - A 2046 */ 2047 2048 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2049 struct i915_request *rq; 2050 2051 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2052 if (err) 2053 return err; 2054 2055 rq = i915_request_create(ce); 2056 if (IS_ERR(rq)) { 2057 err = PTR_ERR(rq); 2058 goto err; 2059 } 2060 2061 cs = intel_ring_begin(rq, 4); 2062 if (IS_ERR(cs)) { 2063 i915_request_add(rq); 2064 err = PTR_ERR(cs); 2065 goto err; 2066 } 2067 2068 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2069 2070 intel_ring_advance(rq, cs); 2071 2072 preempt_disable(); 2073 local_bh_disable(); 2074 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2075 i915_request_add(rq); 2076 local_bh_enable(); 2077 preempt_enable(); 2078 } 2079 2080 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2081 if (err) 2082 goto err; 2083 2084 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2085 elapsed[i] = sema[i] - elapsed[i]; 2086 2087 cycles = trifilter(elapsed); 2088 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2089 ce->engine->name, cycles >> TF_BIAS, 2090 cycles_to_ns(ce->engine, cycles)); 2091 2092 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2093 2094 err: 2095 intel_gt_set_wedged(ce->engine->gt); 2096 return err; 2097 } 2098 2099 static int measure_busy_dispatch(struct intel_context *ce) 2100 { 2101 u32 *sema = hwsp_scratch(ce); 2102 const u32 offset = hwsp_offset(ce, sema); 2103 u32 elapsed[TF_COUNT + 1], cycles; 2104 u32 *cs; 2105 int err; 2106 int i; 2107 2108 /* 2109 * Measure how long it takes for us to submit a request while the 2110 * engine is busy, polling on a semaphore in our context. With 2111 * direct submission, this will include the cost of a lite restore. 
2112 * 2113 * A: read CS_TIMESTAMP from CPU 2114 * submit request 2115 * B: read CS_TIMESTAMP on GPU 2116 * 2117 * Submission latency: B - A 2118 */ 2119 2120 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2121 struct i915_request *rq; 2122 2123 rq = i915_request_create(ce); 2124 if (IS_ERR(rq)) { 2125 err = PTR_ERR(rq); 2126 goto err; 2127 } 2128 2129 cs = intel_ring_begin(rq, 12); 2130 if (IS_ERR(cs)) { 2131 i915_request_add(rq); 2132 err = PTR_ERR(cs); 2133 goto err; 2134 } 2135 2136 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2137 cs = emit_semaphore_poll_until(cs, offset, i); 2138 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2139 2140 intel_ring_advance(rq, cs); 2141 2142 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2143 err = -EIO; 2144 goto err; 2145 } 2146 2147 preempt_disable(); 2148 local_bh_disable(); 2149 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2150 i915_request_add(rq); 2151 local_bh_enable(); 2152 semaphore_set(sema, i - 1); 2153 preempt_enable(); 2154 } 2155 2156 wait_for(READ_ONCE(sema[i - 1]), 500); 2157 semaphore_set(sema, i - 1); 2158 2159 for (i = 1; i <= TF_COUNT; i++) { 2160 GEM_BUG_ON(sema[i] == -1); 2161 elapsed[i - 1] = sema[i] - elapsed[i]; 2162 } 2163 2164 cycles = trifilter(elapsed); 2165 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2166 ce->engine->name, cycles >> TF_BIAS, 2167 cycles_to_ns(ce->engine, cycles)); 2168 2169 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2170 2171 err: 2172 intel_gt_set_wedged(ce->engine->gt); 2173 return err; 2174 } 2175 2176 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2177 { 2178 const u32 offset = 2179 i915_ggtt_offset(engine->status_page.vma) + 2180 offset_in_page(sema); 2181 struct i915_request *rq; 2182 u32 *cs; 2183 2184 rq = i915_request_create(engine->kernel_context); 2185 if (IS_ERR(rq)) 2186 return PTR_ERR(rq); 2187 2188 cs = intel_ring_begin(rq, 4); 2189 if (IS_ERR(cs)) { 2190 i915_request_add(rq); 2191 return PTR_ERR(cs); 2192 } 2193 2194 cs = emit_semaphore_poll(cs, mode, value, offset); 2195 2196 intel_ring_advance(rq, cs); 2197 i915_request_add(rq); 2198 2199 return 0; 2200 } 2201 2202 static int measure_inter_request(struct intel_context *ce) 2203 { 2204 u32 *sema = hwsp_scratch(ce); 2205 const u32 offset = hwsp_offset(ce, sema); 2206 u32 elapsed[TF_COUNT + 1], cycles; 2207 struct i915_sw_fence *submit; 2208 int i, err; 2209 2210 /* 2211 * Measure how long it takes to advance from one request into the 2212 * next. Between each request we flush the GPU caches to memory, 2213 * update the breadcrumbs, and then invalidate those caches. 2214 * We queue up all the requests to be submitted in one batch so 2215 * it should be one set of contiguous measurements. 
2216 * 2217 * A: read CS_TIMESTAMP on GPU 2218 * advance request 2219 * B: read CS_TIMESTAMP on GPU 2220 * 2221 * Request latency: B - A 2222 */ 2223 2224 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2225 if (err) 2226 return err; 2227 2228 submit = heap_fence_create(GFP_KERNEL); 2229 if (!submit) { 2230 semaphore_set(sema, 1); 2231 return -ENOMEM; 2232 } 2233 2234 intel_engine_flush_submission(ce->engine); 2235 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2236 struct i915_request *rq; 2237 u32 *cs; 2238 2239 rq = i915_request_create(ce); 2240 if (IS_ERR(rq)) { 2241 err = PTR_ERR(rq); 2242 goto err_submit; 2243 } 2244 2245 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 2246 submit, 2247 GFP_KERNEL); 2248 if (err < 0) { 2249 i915_request_add(rq); 2250 goto err_submit; 2251 } 2252 2253 cs = intel_ring_begin(rq, 4); 2254 if (IS_ERR(cs)) { 2255 i915_request_add(rq); 2256 err = PTR_ERR(cs); 2257 goto err_submit; 2258 } 2259 2260 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2261 2262 intel_ring_advance(rq, cs); 2263 i915_request_add(rq); 2264 } 2265 i915_sw_fence_commit(submit); 2266 intel_engine_flush_submission(ce->engine); 2267 heap_fence_put(submit); 2268 2269 semaphore_set(sema, 1); 2270 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2271 if (err) 2272 goto err; 2273 2274 for (i = 1; i <= TF_COUNT; i++) 2275 elapsed[i - 1] = sema[i + 1] - sema[i]; 2276 2277 cycles = trifilter(elapsed); 2278 pr_info("%s: inter-request latency %d cycles, %lluns\n", 2279 ce->engine->name, cycles >> TF_BIAS, 2280 cycles_to_ns(ce->engine, cycles)); 2281 2282 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2283 2284 err_submit: 2285 i915_sw_fence_commit(submit); 2286 heap_fence_put(submit); 2287 semaphore_set(sema, 1); 2288 err: 2289 intel_gt_set_wedged(ce->engine->gt); 2290 return err; 2291 } 2292 2293 static int measure_context_switch(struct intel_context *ce) 2294 { 2295 u32 *sema = hwsp_scratch(ce); 2296 const u32 offset = hwsp_offset(ce, sema); 2297 struct i915_request *fence = NULL; 2298 u32 elapsed[TF_COUNT + 1], cycles; 2299 int i, j, err; 2300 u32 *cs; 2301 2302 /* 2303 * Measure how long it takes to advance from one request in one 2304 * context to a request in another context. This allows us to 2305 * measure how long the context save/restore take, along with all 2306 * the inter-context setup we require. 
static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    switch context
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    submit preemption
	 * B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    context switch
	 * C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

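/*
 * Fence callback used by measure_completion() to note, from the CPU, the
 * moment the request's completion signal has been processed.
 */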
struct signal_cb {
	struct dma_fence_cb base;
	bool seen;
};

static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct signal_cb *s = container_of(cb, typeof(*s), base);

	smp_store_mb(s->seen, true); /* be safe, be strong */
}

static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) sent from
	 * the GPU to be processed by the CPU.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    signal
	 * B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

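/*
 * Run each of the latency probes above on every uabi engine, with the
 * engine heartbeat disabled, CPU C-states disabled and the GPU frequency
 * pinned to maximum to reduce measurement noise.
 */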
static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

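/*
 * Submission strategies exercised by perf_series_engines(): s_sync0 waits
 * for each request to complete before moving to the next engine, s_sync1
 * pipelines by waiting on the previous request instead, and s_many submits
 * as fast as possible without waiting at all.
 */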
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

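/*
 * Run each submission strategy across all engines in series from a single
 * thread, then report per-engine busyness, runtime and walltime.
 */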
static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

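/*
 * Per-engine submission workers used by perf_parallel_engines(), run on one
 * kthread per engine: p_sync0 waits for every request, p_sync1 keeps one
 * request in flight, and p_many submits without waiting.
 */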
static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

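/*
 * Run each submission strategy on all engines simultaneously, one kthread
 * per engine, and report per-engine throughput and busyness.
 */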
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_subtests(tests, i915);
}