1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_internal.h" 30 #include "gem/i915_gem_pm.h" 31 #include "gem/selftests/mock_context.h" 32 33 #include "gt/intel_engine_heartbeat.h" 34 #include "gt/intel_engine_pm.h" 35 #include "gt/intel_engine_user.h" 36 #include "gt/intel_gt.h" 37 #include "gt/intel_gt_clock_utils.h" 38 #include "gt/intel_gt_requests.h" 39 #include "gt/selftest_engine_heartbeat.h" 40 41 #include "i915_random.h" 42 #include "i915_selftest.h" 43 #include "igt_flush_test.h" 44 #include "igt_live_test.h" 45 #include "igt_spinner.h" 46 #include "lib_sw_fence.h" 47 48 #include "mock_drm.h" 49 #include "mock_gem_device.h" 50 51 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 52 { 53 struct intel_engine_cs *engine; 54 unsigned int count; 55 56 count = 0; 57 for_each_uabi_engine(engine, i915) 58 count++; 59 60 return count; 61 } 62 63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 64 { 65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 66 } 67 68 static int igt_add_request(void *arg) 69 { 70 struct drm_i915_private *i915 = arg; 71 struct i915_request *request; 72 73 /* Basic preliminary test to create a request and let it loose! 
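	 * We allocate a mock request on rcs0's kernel context and simply add
	 * it without ever waiting on it; the mock backend is expected to
	 * complete it by itself after the requested delay, so success here
	 * only means that allocation and i915_request_add() did not explode.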
*/ 74 75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 76 if (!request) 77 return -ENOMEM; 78 79 i915_request_add(request); 80 81 return 0; 82 } 83 84 static int igt_wait_request(void *arg) 85 { 86 const long T = HZ / 4; 87 struct drm_i915_private *i915 = arg; 88 struct i915_request *request; 89 int err = -EINVAL; 90 91 /* Submit a request, then wait upon it */ 92 93 request = mock_request(rcs0(i915)->kernel_context, T); 94 if (!request) 95 return -ENOMEM; 96 97 i915_request_get(request); 98 99 if (i915_request_wait(request, 0, 0) != -ETIME) { 100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 101 goto out_request; 102 } 103 104 if (i915_request_wait(request, 0, T) != -ETIME) { 105 pr_err("request wait succeeded (expected timeout before submit!)\n"); 106 goto out_request; 107 } 108 109 if (i915_request_completed(request)) { 110 pr_err("request completed before submit!!\n"); 111 goto out_request; 112 } 113 114 i915_request_add(request); 115 116 if (i915_request_wait(request, 0, 0) != -ETIME) { 117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 118 goto out_request; 119 } 120 121 if (i915_request_completed(request)) { 122 pr_err("request completed immediately!\n"); 123 goto out_request; 124 } 125 126 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 127 pr_err("request wait succeeded (expected timeout!)\n"); 128 goto out_request; 129 } 130 131 if (i915_request_wait(request, 0, T) == -ETIME) { 132 pr_err("request wait timed out!\n"); 133 goto out_request; 134 } 135 136 if (!i915_request_completed(request)) { 137 pr_err("request not complete after waiting!\n"); 138 goto out_request; 139 } 140 141 if (i915_request_wait(request, 0, T) == -ETIME) { 142 pr_err("request wait timed out when already complete!\n"); 143 goto out_request; 144 } 145 146 err = 0; 147 out_request: 148 i915_request_put(request); 149 mock_device_flush(i915); 150 return err; 151 } 152 153 static int igt_fence_wait(void *arg) 154 { 155 const long T = HZ / 4; 156 struct drm_i915_private *i915 = arg; 157 struct i915_request *request; 158 int err = -EINVAL; 159 160 /* Submit a request, treat it as a fence and wait upon it */ 161 162 request = mock_request(rcs0(i915)->kernel_context, T); 163 if (!request) 164 return -ENOMEM; 165 166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 167 pr_err("fence wait success before submit (expected timeout)!\n"); 168 goto out; 169 } 170 171 i915_request_add(request); 172 173 if (dma_fence_is_signaled(&request->fence)) { 174 pr_err("fence signaled immediately!\n"); 175 goto out; 176 } 177 178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 179 pr_err("fence wait success after submit (expected timeout)!\n"); 180 goto out; 181 } 182 183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 184 pr_err("fence wait timed out (expected success)!\n"); 185 goto out; 186 } 187 188 if (!dma_fence_is_signaled(&request->fence)) { 189 pr_err("fence unsignaled after waiting!\n"); 190 goto out; 191 } 192 193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 194 pr_err("fence wait timed out when complete (expected success)!\n"); 195 goto out; 196 } 197 198 err = 0; 199 out: 200 mock_device_flush(i915); 201 return err; 202 } 203 204 static int igt_request_rewind(void *arg) 205 { 206 struct drm_i915_private *i915 = arg; 207 struct i915_request *request, *vip; 208 struct i915_gem_context *ctx[2]; 209 struct intel_context *ce; 210 int err = -EINVAL; 211 212 ctx[0] = 
mock_context(i915, "A"); 213 if (!ctx[0]) { 214 err = -ENOMEM; 215 goto err_ctx_0; 216 } 217 218 ce = i915_gem_context_get_engine(ctx[0], RCS0); 219 GEM_BUG_ON(IS_ERR(ce)); 220 request = mock_request(ce, 2 * HZ); 221 intel_context_put(ce); 222 if (!request) { 223 err = -ENOMEM; 224 goto err_context_0; 225 } 226 227 i915_request_get(request); 228 i915_request_add(request); 229 230 ctx[1] = mock_context(i915, "B"); 231 if (!ctx[1]) { 232 err = -ENOMEM; 233 goto err_ctx_1; 234 } 235 236 ce = i915_gem_context_get_engine(ctx[1], RCS0); 237 GEM_BUG_ON(IS_ERR(ce)); 238 vip = mock_request(ce, 0); 239 intel_context_put(ce); 240 if (!vip) { 241 err = -ENOMEM; 242 goto err_context_1; 243 } 244 245 /* Simulate preemption by manual reordering */ 246 if (!mock_cancel_request(request)) { 247 pr_err("failed to cancel request (already executed)!\n"); 248 i915_request_add(vip); 249 goto err_context_1; 250 } 251 i915_request_get(vip); 252 i915_request_add(vip); 253 rcu_read_lock(); 254 request->engine->submit_request(request); 255 rcu_read_unlock(); 256 257 258 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 259 pr_err("timed out waiting for high priority request\n"); 260 goto err; 261 } 262 263 if (i915_request_completed(request)) { 264 pr_err("low priority request already completed\n"); 265 goto err; 266 } 267 268 err = 0; 269 err: 270 i915_request_put(vip); 271 err_context_1: 272 mock_context_close(ctx[1]); 273 err_ctx_1: 274 i915_request_put(request); 275 err_context_0: 276 mock_context_close(ctx[0]); 277 err_ctx_0: 278 mock_device_flush(i915); 279 return err; 280 } 281 282 struct smoketest { 283 struct intel_engine_cs *engine; 284 struct i915_gem_context **contexts; 285 atomic_long_t num_waits, num_fences; 286 int ncontexts, max_batch; 287 struct i915_request *(*request_alloc)(struct intel_context *ce); 288 }; 289 290 static struct i915_request * 291 __mock_request_alloc(struct intel_context *ce) 292 { 293 return mock_request(ce, 0); 294 } 295 296 static struct i915_request * 297 __live_request_alloc(struct intel_context *ce) 298 { 299 return intel_context_create_request(ce); 300 } 301 302 struct smoke_thread { 303 struct kthread_worker *worker; 304 struct kthread_work work; 305 struct smoketest *t; 306 bool stop; 307 int result; 308 }; 309 310 static void __igt_breadcrumbs_smoketest(struct kthread_work *work) 311 { 312 struct smoke_thread *thread = container_of(work, typeof(*thread), work); 313 struct smoketest *t = thread->t; 314 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 315 const unsigned int total = 4 * t->ncontexts + 1; 316 unsigned int num_waits = 0, num_fences = 0; 317 struct i915_request **requests; 318 I915_RND_STATE(prng); 319 unsigned int *order; 320 int err = 0; 321 322 /* 323 * A very simple test to catch the most egregious of list handling bugs. 324 * 325 * At its heart, we simply create oodles of requests running across 326 * multiple kthreads and enable signaling on them, for the sole purpose 327 * of stressing our breadcrumb handling. The only inspection we do is 328 * that the fences were marked as signaled. 
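	 * Each iteration picks a random batch of up to max_batch requests
	 * spread over the context array, gates their submission on a single
	 * 'submit' fence and chains all of their completions into a single
	 * 'wait' fence. Once both fences are released, we merely check that
	 * every request ended up with DMA_FENCE_FLAG_SIGNALED_BIT set before
	 * dropping our references.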
329 */ 330 331 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 332 if (!requests) { 333 thread->result = -ENOMEM; 334 return; 335 } 336 337 order = i915_random_order(total, &prng); 338 if (!order) { 339 err = -ENOMEM; 340 goto out_requests; 341 } 342 343 while (!READ_ONCE(thread->stop)) { 344 struct i915_sw_fence *submit, *wait; 345 unsigned int n, count; 346 347 submit = heap_fence_create(GFP_KERNEL); 348 if (!submit) { 349 err = -ENOMEM; 350 break; 351 } 352 353 wait = heap_fence_create(GFP_KERNEL); 354 if (!wait) { 355 i915_sw_fence_commit(submit); 356 heap_fence_put(submit); 357 err = -ENOMEM; 358 break; 359 } 360 361 i915_random_reorder(order, total, &prng); 362 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 363 364 for (n = 0; n < count; n++) { 365 struct i915_gem_context *ctx = 366 t->contexts[order[n] % t->ncontexts]; 367 struct i915_request *rq; 368 struct intel_context *ce; 369 370 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 371 GEM_BUG_ON(IS_ERR(ce)); 372 rq = t->request_alloc(ce); 373 intel_context_put(ce); 374 if (IS_ERR(rq)) { 375 err = PTR_ERR(rq); 376 count = n; 377 break; 378 } 379 380 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 381 submit, 382 GFP_KERNEL); 383 384 requests[n] = i915_request_get(rq); 385 i915_request_add(rq); 386 387 if (err >= 0) 388 err = i915_sw_fence_await_dma_fence(wait, 389 &rq->fence, 390 0, 391 GFP_KERNEL); 392 393 if (err < 0) { 394 i915_request_put(rq); 395 count = n; 396 break; 397 } 398 } 399 400 i915_sw_fence_commit(submit); 401 i915_sw_fence_commit(wait); 402 403 if (!wait_event_timeout(wait->wait, 404 i915_sw_fence_done(wait), 405 5 * HZ)) { 406 struct i915_request *rq = requests[count - 1]; 407 408 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 409 atomic_read(&wait->pending), count, 410 rq->fence.context, rq->fence.seqno, 411 t->engine->name); 412 GEM_TRACE_DUMP(); 413 414 intel_gt_set_wedged(t->engine->gt); 415 GEM_BUG_ON(!i915_request_completed(rq)); 416 i915_sw_fence_wait(wait); 417 err = -EIO; 418 } 419 420 for (n = 0; n < count; n++) { 421 struct i915_request *rq = requests[n]; 422 423 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 424 &rq->fence.flags)) { 425 pr_err("%llu:%llu was not signaled!\n", 426 rq->fence.context, rq->fence.seqno); 427 err = -EINVAL; 428 } 429 430 i915_request_put(rq); 431 } 432 433 heap_fence_put(wait); 434 heap_fence_put(submit); 435 436 if (err < 0) 437 break; 438 439 num_fences += count; 440 num_waits++; 441 442 cond_resched(); 443 } 444 445 atomic_long_add(num_fences, &t->num_fences); 446 atomic_long_add(num_waits, &t->num_waits); 447 448 kfree(order); 449 out_requests: 450 kfree(requests); 451 thread->result = err; 452 } 453 454 static int mock_breadcrumbs_smoketest(void *arg) 455 { 456 struct drm_i915_private *i915 = arg; 457 struct smoketest t = { 458 .engine = rcs0(i915), 459 .ncontexts = 1024, 460 .max_batch = 1024, 461 .request_alloc = __mock_request_alloc 462 }; 463 unsigned int ncpus = num_online_cpus(); 464 struct smoke_thread *threads; 465 unsigned int n; 466 int ret = 0; 467 468 /* 469 * Smoketest our breadcrumb/signal handling for requests across multiple 470 * threads. A very simple test to only catch the most egregious of bugs. 
471 * See __igt_breadcrumbs_smoketest(); 472 */ 473 474 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 475 if (!threads) 476 return -ENOMEM; 477 478 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 479 if (!t.contexts) { 480 ret = -ENOMEM; 481 goto out_threads; 482 } 483 484 for (n = 0; n < t.ncontexts; n++) { 485 t.contexts[n] = mock_context(t.engine->i915, "mock"); 486 if (!t.contexts[n]) { 487 ret = -ENOMEM; 488 goto out_contexts; 489 } 490 } 491 492 for (n = 0; n < ncpus; n++) { 493 struct kthread_worker *worker; 494 495 worker = kthread_create_worker(0, "igt/%d", n); 496 if (IS_ERR(worker)) { 497 ret = PTR_ERR(worker); 498 ncpus = n; 499 break; 500 } 501 502 threads[n].worker = worker; 503 threads[n].t = &t; 504 threads[n].stop = false; 505 threads[n].result = 0; 506 507 kthread_init_work(&threads[n].work, 508 __igt_breadcrumbs_smoketest); 509 kthread_queue_work(worker, &threads[n].work); 510 } 511 512 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 513 514 for (n = 0; n < ncpus; n++) { 515 int err; 516 517 WRITE_ONCE(threads[n].stop, true); 518 kthread_flush_work(&threads[n].work); 519 err = READ_ONCE(threads[n].result); 520 if (err < 0 && !ret) 521 ret = err; 522 523 kthread_destroy_worker(threads[n].worker); 524 } 525 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 526 atomic_long_read(&t.num_waits), 527 atomic_long_read(&t.num_fences), 528 ncpus); 529 530 out_contexts: 531 for (n = 0; n < t.ncontexts; n++) { 532 if (!t.contexts[n]) 533 break; 534 mock_context_close(t.contexts[n]); 535 } 536 kfree(t.contexts); 537 out_threads: 538 kfree(threads); 539 return ret; 540 } 541 542 int i915_request_mock_selftests(void) 543 { 544 static const struct i915_subtest tests[] = { 545 SUBTEST(igt_add_request), 546 SUBTEST(igt_wait_request), 547 SUBTEST(igt_fence_wait), 548 SUBTEST(igt_request_rewind), 549 SUBTEST(mock_breadcrumbs_smoketest), 550 }; 551 struct drm_i915_private *i915; 552 intel_wakeref_t wakeref; 553 int err = 0; 554 555 i915 = mock_gem_device(); 556 if (!i915) 557 return -ENOMEM; 558 559 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 560 err = i915_subtests(tests, i915); 561 562 mock_destroy_device(i915); 563 564 return err; 565 } 566 567 static int live_nop_request(void *arg) 568 { 569 struct drm_i915_private *i915 = arg; 570 struct intel_engine_cs *engine; 571 struct igt_live_test t; 572 int err = -ENODEV; 573 574 /* 575 * Submit various sized batches of empty requests, to each engine 576 * (individually), and wait for the batch to complete. We can check 577 * the overhead of submitting requests to the hardware. 578 */ 579 580 for_each_uabi_engine(engine, i915) { 581 unsigned long n, prime; 582 IGT_TIMEOUT(end_time); 583 ktime_t times[2] = {}; 584 585 err = igt_live_test_begin(&t, i915, __func__, engine->name); 586 if (err) 587 return err; 588 589 intel_engine_pm_get(engine); 590 for_each_prime_number_from(prime, 1, 8192) { 591 struct i915_request *request = NULL; 592 593 times[1] = ktime_get_raw(); 594 595 for (n = 0; n < prime; n++) { 596 i915_request_put(request); 597 request = i915_request_create(engine->kernel_context); 598 if (IS_ERR(request)) 599 return PTR_ERR(request); 600 601 /* 602 * This space is left intentionally blank. 603 * 604 * We do not actually want to perform any 605 * action with this request, we just want 606 * to measure the latency in allocation 607 * and submission of our breadcrumbs - 608 * ensuring that the bare request is sufficient 609 * for the system to work (i.e. 
proper HEAD 610 * tracking of the rings, interrupt handling, 611 * etc). It also gives us the lowest bounds 612 * for latency. 613 */ 614 615 i915_request_get(request); 616 i915_request_add(request); 617 } 618 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 619 i915_request_put(request); 620 621 times[1] = ktime_sub(ktime_get_raw(), times[1]); 622 if (prime == 1) 623 times[0] = times[1]; 624 625 if (__igt_timeout(end_time, NULL)) 626 break; 627 } 628 intel_engine_pm_put(engine); 629 630 err = igt_live_test_end(&t); 631 if (err) 632 return err; 633 634 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 635 engine->name, 636 ktime_to_ns(times[0]), 637 prime, div64_u64(ktime_to_ns(times[1]), prime)); 638 } 639 640 return err; 641 } 642 643 static int __cancel_inactive(struct intel_engine_cs *engine) 644 { 645 struct intel_context *ce; 646 struct igt_spinner spin; 647 struct i915_request *rq; 648 int err = 0; 649 650 if (igt_spinner_init(&spin, engine->gt)) 651 return -ENOMEM; 652 653 ce = intel_context_create(engine); 654 if (IS_ERR(ce)) { 655 err = PTR_ERR(ce); 656 goto out_spin; 657 } 658 659 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 660 if (IS_ERR(rq)) { 661 err = PTR_ERR(rq); 662 goto out_ce; 663 } 664 665 pr_debug("%s: Cancelling inactive request\n", engine->name); 666 i915_request_cancel(rq, -EINTR); 667 i915_request_get(rq); 668 i915_request_add(rq); 669 670 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 671 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 672 673 pr_err("%s: Failed to cancel inactive request\n", engine->name); 674 intel_engine_dump(engine, &p, "%s\n", engine->name); 675 err = -ETIME; 676 goto out_rq; 677 } 678 679 if (rq->fence.error != -EINTR) { 680 pr_err("%s: fence not cancelled (%u)\n", 681 engine->name, rq->fence.error); 682 err = -EINVAL; 683 } 684 685 out_rq: 686 i915_request_put(rq); 687 out_ce: 688 intel_context_put(ce); 689 out_spin: 690 igt_spinner_fini(&spin); 691 if (err) 692 pr_err("%s: %s error %d\n", __func__, engine->name, err); 693 return err; 694 } 695 696 static int __cancel_active(struct intel_engine_cs *engine) 697 { 698 struct intel_context *ce; 699 struct igt_spinner spin; 700 struct i915_request *rq; 701 int err = 0; 702 703 if (igt_spinner_init(&spin, engine->gt)) 704 return -ENOMEM; 705 706 ce = intel_context_create(engine); 707 if (IS_ERR(ce)) { 708 err = PTR_ERR(ce); 709 goto out_spin; 710 } 711 712 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 713 if (IS_ERR(rq)) { 714 err = PTR_ERR(rq); 715 goto out_ce; 716 } 717 718 pr_debug("%s: Cancelling active request\n", engine->name); 719 i915_request_get(rq); 720 i915_request_add(rq); 721 if (!igt_wait_for_spinner(&spin, rq)) { 722 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 723 724 pr_err("Failed to start spinner on %s\n", engine->name); 725 intel_engine_dump(engine, &p, "%s\n", engine->name); 726 err = -ETIME; 727 goto out_rq; 728 } 729 i915_request_cancel(rq, -EINTR); 730 731 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 732 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 733 734 pr_err("%s: Failed to cancel active request\n", engine->name); 735 intel_engine_dump(engine, &p, "%s\n", engine->name); 736 err = -ETIME; 737 goto out_rq; 738 } 739 740 if (rq->fence.error != -EINTR) { 741 pr_err("%s: fence not cancelled (%u)\n", 742 engine->name, rq->fence.error); 743 err = -EINVAL; 744 } 745 746 out_rq: 747 i915_request_put(rq); 748 out_ce: 749 intel_context_put(ce); 750 out_spin: 751 igt_spinner_fini(&spin); 
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int __cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	igt_spinner_end(&spin);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	if (rq->fence.error) {
		pr_err("%s: fence error set on completed request (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for the spinner to start, create a NOP request and submit it, cancel
 * the spinner, wait for the spinner to complete and verify it failed with an
 * error, and finally wait for the NOP request to complete and verify it
 * succeeded without an error. The preemption timeout is also reduced and
 * restored so that the test runs in a timely manner.
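 * The timeout is dropped to 100ms around the test so that the cancelled,
 * non-preemptible spinner is removed by an engine reset promptly; the
 * original value is restored on every exit path.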
816 */ 817 static int __cancel_reset(struct drm_i915_private *i915, 818 struct intel_engine_cs *engine) 819 { 820 struct intel_context *ce; 821 struct igt_spinner spin; 822 struct i915_request *rq, *nop; 823 unsigned long preempt_timeout_ms; 824 int err = 0; 825 826 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT || 827 !intel_has_reset_engine(engine->gt)) 828 return 0; 829 830 preempt_timeout_ms = engine->props.preempt_timeout_ms; 831 engine->props.preempt_timeout_ms = 100; 832 833 if (igt_spinner_init(&spin, engine->gt)) 834 goto out_restore; 835 836 ce = intel_context_create(engine); 837 if (IS_ERR(ce)) { 838 err = PTR_ERR(ce); 839 goto out_spin; 840 } 841 842 rq = igt_spinner_create_request(&spin, ce, MI_NOOP); 843 if (IS_ERR(rq)) { 844 err = PTR_ERR(rq); 845 goto out_ce; 846 } 847 848 pr_debug("%s: Cancelling active non-preemptable request\n", 849 engine->name); 850 i915_request_get(rq); 851 i915_request_add(rq); 852 if (!igt_wait_for_spinner(&spin, rq)) { 853 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 854 855 pr_err("Failed to start spinner on %s\n", engine->name); 856 intel_engine_dump(engine, &p, "%s\n", engine->name); 857 err = -ETIME; 858 goto out_rq; 859 } 860 861 nop = intel_context_create_request(ce); 862 if (IS_ERR(nop)) 863 goto out_rq; 864 i915_request_get(nop); 865 i915_request_add(nop); 866 867 i915_request_cancel(rq, -EINTR); 868 869 if (i915_request_wait(rq, 0, HZ) < 0) { 870 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 871 872 pr_err("%s: Failed to cancel hung request\n", engine->name); 873 intel_engine_dump(engine, &p, "%s\n", engine->name); 874 err = -ETIME; 875 goto out_nop; 876 } 877 878 if (rq->fence.error != -EINTR) { 879 pr_err("%s: fence not cancelled (%u)\n", 880 engine->name, rq->fence.error); 881 err = -EINVAL; 882 goto out_nop; 883 } 884 885 if (i915_request_wait(nop, 0, HZ) < 0) { 886 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 887 888 pr_err("%s: Failed to complete nop request\n", engine->name); 889 intel_engine_dump(engine, &p, "%s\n", engine->name); 890 err = -ETIME; 891 goto out_nop; 892 } 893 894 if (nop->fence.error != 0) { 895 pr_err("%s: Nop request errored (%u)\n", 896 engine->name, nop->fence.error); 897 err = -EINVAL; 898 } 899 900 out_nop: 901 i915_request_put(nop); 902 out_rq: 903 i915_request_put(rq); 904 out_ce: 905 intel_context_put(ce); 906 out_spin: 907 igt_spinner_fini(&spin); 908 out_restore: 909 engine->props.preempt_timeout_ms = preempt_timeout_ms; 910 if (err) 911 pr_err("%s: %s error %d\n", __func__, engine->name, err); 912 return err; 913 } 914 915 static int live_cancel_request(void *arg) 916 { 917 struct drm_i915_private *i915 = arg; 918 struct intel_engine_cs *engine; 919 920 /* 921 * Check cancellation of requests. We expect to be able to immediately 922 * cancel active requests, even if they are currently on the GPU. 
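	 * Each engine is exercised in four flavours: cancelling a request
	 * before it is submitted (__cancel_inactive), while it is actively
	 * spinning on the GPU (__cancel_active), after it has already
	 * completed (__cancel_completed), and cancelling a non-preemptible
	 * spinner that can only be removed by an engine reset
	 * (__cancel_reset, run outside the live-test bracket precisely
	 * because it expects a reset).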
923 */ 924 925 for_each_uabi_engine(engine, i915) { 926 struct igt_live_test t; 927 int err, err2; 928 929 if (!intel_engine_has_preemption(engine)) 930 continue; 931 932 err = igt_live_test_begin(&t, i915, __func__, engine->name); 933 if (err) 934 return err; 935 936 err = __cancel_inactive(engine); 937 if (err == 0) 938 err = __cancel_active(engine); 939 if (err == 0) 940 err = __cancel_completed(engine); 941 942 err2 = igt_live_test_end(&t); 943 if (err) 944 return err; 945 if (err2) 946 return err2; 947 948 /* Expects reset so call outside of igt_live_test_* */ 949 err = __cancel_reset(i915, engine); 950 if (err) 951 return err; 952 953 if (igt_flush_test(i915)) 954 return -EIO; 955 } 956 957 return 0; 958 } 959 960 static struct i915_vma *empty_batch(struct drm_i915_private *i915) 961 { 962 struct drm_i915_gem_object *obj; 963 struct i915_vma *vma; 964 u32 *cmd; 965 int err; 966 967 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 968 if (IS_ERR(obj)) 969 return ERR_CAST(obj); 970 971 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB); 972 if (IS_ERR(cmd)) { 973 err = PTR_ERR(cmd); 974 goto err; 975 } 976 977 *cmd = MI_BATCH_BUFFER_END; 978 979 __i915_gem_object_flush_map(obj, 0, 64); 980 i915_gem_object_unpin_map(obj); 981 982 intel_gt_chipset_flush(to_gt(i915)); 983 984 vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL); 985 if (IS_ERR(vma)) { 986 err = PTR_ERR(vma); 987 goto err; 988 } 989 990 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); 991 if (err) 992 goto err; 993 994 /* Force the wait now to avoid including it in the benchmark */ 995 err = i915_vma_sync(vma); 996 if (err) 997 goto err_pin; 998 999 return vma; 1000 1001 err_pin: 1002 i915_vma_unpin(vma); 1003 err: 1004 i915_gem_object_put(obj); 1005 return ERR_PTR(err); 1006 } 1007 1008 static struct i915_request * 1009 empty_request(struct intel_engine_cs *engine, 1010 struct i915_vma *batch) 1011 { 1012 struct i915_request *request; 1013 int err; 1014 1015 request = i915_request_create(engine->kernel_context); 1016 if (IS_ERR(request)) 1017 return request; 1018 1019 err = engine->emit_bb_start(request, 1020 i915_vma_offset(batch), 1021 i915_vma_size(batch), 1022 I915_DISPATCH_SECURE); 1023 if (err) 1024 goto out_request; 1025 1026 i915_request_get(request); 1027 out_request: 1028 i915_request_add(request); 1029 return err ? ERR_PTR(err) : request; 1030 } 1031 1032 static int live_empty_request(void *arg) 1033 { 1034 struct drm_i915_private *i915 = arg; 1035 struct intel_engine_cs *engine; 1036 struct igt_live_test t; 1037 struct i915_vma *batch; 1038 int err = 0; 1039 1040 /* 1041 * Submit various sized batches of empty requests, to each engine 1042 * (individually), and wait for the batch to complete. We can check 1043 * the overhead of submitting requests to the hardware. 
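	 * Unlike live_nop_request(), every request here also dispatches a
	 * tiny batch containing just MI_BATCH_BUFFER_END, so the measured
	 * latency includes emit_bb_start() and the dispatch itself. A warmup
	 * request is submitted first so that the engine is already awake when
	 * the timing loop begins.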
1044 */ 1045 1046 batch = empty_batch(i915); 1047 if (IS_ERR(batch)) 1048 return PTR_ERR(batch); 1049 1050 for_each_uabi_engine(engine, i915) { 1051 IGT_TIMEOUT(end_time); 1052 struct i915_request *request; 1053 unsigned long n, prime; 1054 ktime_t times[2] = {}; 1055 1056 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1057 if (err) 1058 goto out_batch; 1059 1060 intel_engine_pm_get(engine); 1061 1062 /* Warmup / preload */ 1063 request = empty_request(engine, batch); 1064 if (IS_ERR(request)) { 1065 err = PTR_ERR(request); 1066 intel_engine_pm_put(engine); 1067 goto out_batch; 1068 } 1069 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1070 1071 for_each_prime_number_from(prime, 1, 8192) { 1072 times[1] = ktime_get_raw(); 1073 1074 for (n = 0; n < prime; n++) { 1075 i915_request_put(request); 1076 request = empty_request(engine, batch); 1077 if (IS_ERR(request)) { 1078 err = PTR_ERR(request); 1079 intel_engine_pm_put(engine); 1080 goto out_batch; 1081 } 1082 } 1083 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1084 1085 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1086 if (prime == 1) 1087 times[0] = times[1]; 1088 1089 if (__igt_timeout(end_time, NULL)) 1090 break; 1091 } 1092 i915_request_put(request); 1093 intel_engine_pm_put(engine); 1094 1095 err = igt_live_test_end(&t); 1096 if (err) 1097 goto out_batch; 1098 1099 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1100 engine->name, 1101 ktime_to_ns(times[0]), 1102 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1103 } 1104 1105 out_batch: 1106 i915_vma_unpin(batch); 1107 i915_vma_put(batch); 1108 return err; 1109 } 1110 1111 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 1112 { 1113 struct drm_i915_gem_object *obj; 1114 const int ver = GRAPHICS_VER(i915); 1115 struct i915_vma *vma; 1116 u32 *cmd; 1117 int err; 1118 1119 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 1120 if (IS_ERR(obj)) 1121 return ERR_CAST(obj); 1122 1123 vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL); 1124 if (IS_ERR(vma)) { 1125 err = PTR_ERR(vma); 1126 goto err; 1127 } 1128 1129 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1130 if (err) 1131 goto err; 1132 1133 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1134 if (IS_ERR(cmd)) { 1135 err = PTR_ERR(cmd); 1136 goto err; 1137 } 1138 1139 if (ver >= 8) { 1140 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1141 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1142 *cmd++ = upper_32_bits(i915_vma_offset(vma)); 1143 } else if (ver >= 6) { 1144 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1145 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1146 } else { 1147 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1148 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1149 } 1150 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1151 1152 __i915_gem_object_flush_map(obj, 0, 64); 1153 i915_gem_object_unpin_map(obj); 1154 1155 intel_gt_chipset_flush(to_gt(i915)); 1156 1157 return vma; 1158 1159 err: 1160 i915_gem_object_put(obj); 1161 return ERR_PTR(err); 1162 } 1163 1164 static int recursive_batch_resolve(struct i915_vma *batch) 1165 { 1166 u32 *cmd; 1167 1168 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1169 if (IS_ERR(cmd)) 1170 return PTR_ERR(cmd); 1171 1172 *cmd = MI_BATCH_BUFFER_END; 1173 1174 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1175 i915_gem_object_unpin_map(batch->obj); 1176 1177 intel_gt_chipset_flush(batch->vm->gt); 1178 1179 return 0; 1180 } 1181 1182 static int live_all_engines(void 
*arg) 1183 { 1184 struct drm_i915_private *i915 = arg; 1185 const unsigned int nengines = num_uabi_engines(i915); 1186 struct intel_engine_cs *engine; 1187 struct i915_request **request; 1188 struct igt_live_test t; 1189 struct i915_vma *batch; 1190 unsigned int idx; 1191 int err; 1192 1193 /* 1194 * Check we can submit requests to all engines simultaneously. We 1195 * send a recursive batch to each engine - checking that we don't 1196 * block doing so, and that they don't complete too soon. 1197 */ 1198 1199 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1200 if (!request) 1201 return -ENOMEM; 1202 1203 err = igt_live_test_begin(&t, i915, __func__, ""); 1204 if (err) 1205 goto out_free; 1206 1207 batch = recursive_batch(i915); 1208 if (IS_ERR(batch)) { 1209 err = PTR_ERR(batch); 1210 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 1211 goto out_free; 1212 } 1213 1214 i915_vma_lock(batch); 1215 1216 idx = 0; 1217 for_each_uabi_engine(engine, i915) { 1218 request[idx] = intel_engine_create_kernel_request(engine); 1219 if (IS_ERR(request[idx])) { 1220 err = PTR_ERR(request[idx]); 1221 pr_err("%s: Request allocation failed with err=%d\n", 1222 __func__, err); 1223 goto out_request; 1224 } 1225 1226 err = i915_vma_move_to_active(batch, request[idx], 0); 1227 GEM_BUG_ON(err); 1228 1229 err = engine->emit_bb_start(request[idx], 1230 i915_vma_offset(batch), 1231 i915_vma_size(batch), 1232 0); 1233 GEM_BUG_ON(err); 1234 request[idx]->batch = batch; 1235 1236 i915_request_get(request[idx]); 1237 i915_request_add(request[idx]); 1238 idx++; 1239 } 1240 1241 i915_vma_unlock(batch); 1242 1243 idx = 0; 1244 for_each_uabi_engine(engine, i915) { 1245 if (i915_request_completed(request[idx])) { 1246 pr_err("%s(%s): request completed too early!\n", 1247 __func__, engine->name); 1248 err = -EINVAL; 1249 goto out_request; 1250 } 1251 idx++; 1252 } 1253 1254 err = recursive_batch_resolve(batch); 1255 if (err) { 1256 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 1257 goto out_request; 1258 } 1259 1260 idx = 0; 1261 for_each_uabi_engine(engine, i915) { 1262 long timeout; 1263 1264 timeout = i915_request_wait(request[idx], 0, 1265 MAX_SCHEDULE_TIMEOUT); 1266 if (timeout < 0) { 1267 err = timeout; 1268 pr_err("%s: error waiting for request on %s, err=%d\n", 1269 __func__, engine->name, err); 1270 goto out_request; 1271 } 1272 1273 GEM_BUG_ON(!i915_request_completed(request[idx])); 1274 i915_request_put(request[idx]); 1275 request[idx] = NULL; 1276 idx++; 1277 } 1278 1279 err = igt_live_test_end(&t); 1280 1281 out_request: 1282 idx = 0; 1283 for_each_uabi_engine(engine, i915) { 1284 if (request[idx]) 1285 i915_request_put(request[idx]); 1286 idx++; 1287 } 1288 i915_vma_unpin(batch); 1289 i915_vma_put(batch); 1290 out_free: 1291 kfree(request); 1292 return err; 1293 } 1294 1295 static int live_sequential_engines(void *arg) 1296 { 1297 struct drm_i915_private *i915 = arg; 1298 const unsigned int nengines = num_uabi_engines(i915); 1299 struct i915_request **request; 1300 struct i915_request *prev = NULL; 1301 struct intel_engine_cs *engine; 1302 struct igt_live_test t; 1303 unsigned int idx; 1304 int err; 1305 1306 /* 1307 * Check we can submit requests to all engines sequentially, such 1308 * that each successive request waits for the earlier ones. This 1309 * tests that we don't execute requests out of order, even though 1310 * they are running on independent engines. 
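	 * Each engine gets its own self-referencing batch, and request N+1
	 * awaits the fence of request N. Nothing can therefore complete until
	 * we rewrite each batch to a plain MI_BATCH_BUFFER_END in submission
	 * order, at which point the whole chain should unwind in sequence.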
1311 */ 1312 1313 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1314 if (!request) 1315 return -ENOMEM; 1316 1317 err = igt_live_test_begin(&t, i915, __func__, ""); 1318 if (err) 1319 goto out_free; 1320 1321 idx = 0; 1322 for_each_uabi_engine(engine, i915) { 1323 struct i915_vma *batch; 1324 1325 batch = recursive_batch(i915); 1326 if (IS_ERR(batch)) { 1327 err = PTR_ERR(batch); 1328 pr_err("%s: Unable to create batch for %s, err=%d\n", 1329 __func__, engine->name, err); 1330 goto out_free; 1331 } 1332 1333 i915_vma_lock(batch); 1334 request[idx] = intel_engine_create_kernel_request(engine); 1335 if (IS_ERR(request[idx])) { 1336 err = PTR_ERR(request[idx]); 1337 pr_err("%s: Request allocation failed for %s with err=%d\n", 1338 __func__, engine->name, err); 1339 goto out_unlock; 1340 } 1341 1342 if (prev) { 1343 err = i915_request_await_dma_fence(request[idx], 1344 &prev->fence); 1345 if (err) { 1346 i915_request_add(request[idx]); 1347 pr_err("%s: Request await failed for %s with err=%d\n", 1348 __func__, engine->name, err); 1349 goto out_unlock; 1350 } 1351 } 1352 1353 err = i915_vma_move_to_active(batch, request[idx], 0); 1354 GEM_BUG_ON(err); 1355 1356 err = engine->emit_bb_start(request[idx], 1357 i915_vma_offset(batch), 1358 i915_vma_size(batch), 1359 0); 1360 GEM_BUG_ON(err); 1361 request[idx]->batch = batch; 1362 1363 i915_request_get(request[idx]); 1364 i915_request_add(request[idx]); 1365 1366 prev = request[idx]; 1367 idx++; 1368 1369 out_unlock: 1370 i915_vma_unlock(batch); 1371 if (err) 1372 goto out_request; 1373 } 1374 1375 idx = 0; 1376 for_each_uabi_engine(engine, i915) { 1377 long timeout; 1378 1379 if (i915_request_completed(request[idx])) { 1380 pr_err("%s(%s): request completed too early!\n", 1381 __func__, engine->name); 1382 err = -EINVAL; 1383 goto out_request; 1384 } 1385 1386 err = recursive_batch_resolve(request[idx]->batch); 1387 if (err) { 1388 pr_err("%s: failed to resolve batch, err=%d\n", 1389 __func__, err); 1390 goto out_request; 1391 } 1392 1393 timeout = i915_request_wait(request[idx], 0, 1394 MAX_SCHEDULE_TIMEOUT); 1395 if (timeout < 0) { 1396 err = timeout; 1397 pr_err("%s: error waiting for request on %s, err=%d\n", 1398 __func__, engine->name, err); 1399 goto out_request; 1400 } 1401 1402 GEM_BUG_ON(!i915_request_completed(request[idx])); 1403 idx++; 1404 } 1405 1406 err = igt_live_test_end(&t); 1407 1408 out_request: 1409 idx = 0; 1410 for_each_uabi_engine(engine, i915) { 1411 u32 *cmd; 1412 1413 if (!request[idx]) 1414 break; 1415 1416 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1417 I915_MAP_WC); 1418 if (!IS_ERR(cmd)) { 1419 *cmd = MI_BATCH_BUFFER_END; 1420 1421 __i915_gem_object_flush_map(request[idx]->batch->obj, 1422 0, sizeof(*cmd)); 1423 i915_gem_object_unpin_map(request[idx]->batch->obj); 1424 1425 intel_gt_chipset_flush(engine->gt); 1426 } 1427 1428 i915_vma_put(request[idx]->batch); 1429 i915_request_put(request[idx]); 1430 idx++; 1431 } 1432 out_free: 1433 kfree(request); 1434 return err; 1435 } 1436 1437 struct parallel_thread { 1438 struct kthread_worker *worker; 1439 struct kthread_work work; 1440 struct intel_engine_cs *engine; 1441 int result; 1442 }; 1443 1444 static void __live_parallel_engine1(struct kthread_work *work) 1445 { 1446 struct parallel_thread *thread = 1447 container_of(work, typeof(*thread), work); 1448 struct intel_engine_cs *engine = thread->engine; 1449 IGT_TIMEOUT(end_time); 1450 unsigned long count; 1451 int err = 0; 1452 1453 count = 0; 1454 intel_engine_pm_get(engine); 1455 do 
{ 1456 struct i915_request *rq; 1457 1458 rq = i915_request_create(engine->kernel_context); 1459 if (IS_ERR(rq)) { 1460 err = PTR_ERR(rq); 1461 break; 1462 } 1463 1464 i915_request_get(rq); 1465 i915_request_add(rq); 1466 1467 err = 0; 1468 if (i915_request_wait(rq, 0, HZ) < 0) 1469 err = -ETIME; 1470 i915_request_put(rq); 1471 if (err) 1472 break; 1473 1474 count++; 1475 } while (!__igt_timeout(end_time, NULL)); 1476 intel_engine_pm_put(engine); 1477 1478 pr_info("%s: %lu request + sync\n", engine->name, count); 1479 thread->result = err; 1480 } 1481 1482 static void __live_parallel_engineN(struct kthread_work *work) 1483 { 1484 struct parallel_thread *thread = 1485 container_of(work, typeof(*thread), work); 1486 struct intel_engine_cs *engine = thread->engine; 1487 IGT_TIMEOUT(end_time); 1488 unsigned long count; 1489 int err = 0; 1490 1491 count = 0; 1492 intel_engine_pm_get(engine); 1493 do { 1494 struct i915_request *rq; 1495 1496 rq = i915_request_create(engine->kernel_context); 1497 if (IS_ERR(rq)) { 1498 err = PTR_ERR(rq); 1499 break; 1500 } 1501 1502 i915_request_add(rq); 1503 count++; 1504 } while (!__igt_timeout(end_time, NULL)); 1505 intel_engine_pm_put(engine); 1506 1507 pr_info("%s: %lu requests\n", engine->name, count); 1508 thread->result = err; 1509 } 1510 1511 static bool wake_all(struct drm_i915_private *i915) 1512 { 1513 if (atomic_dec_and_test(&i915->selftest.counter)) { 1514 wake_up_var(&i915->selftest.counter); 1515 return true; 1516 } 1517 1518 return false; 1519 } 1520 1521 static int wait_for_all(struct drm_i915_private *i915) 1522 { 1523 if (wake_all(i915)) 1524 return 0; 1525 1526 if (wait_var_event_timeout(&i915->selftest.counter, 1527 !atomic_read(&i915->selftest.counter), 1528 i915_selftest.timeout_jiffies)) 1529 return 0; 1530 1531 return -ETIME; 1532 } 1533 1534 static void __live_parallel_spin(struct kthread_work *work) 1535 { 1536 struct parallel_thread *thread = 1537 container_of(work, typeof(*thread), work); 1538 struct intel_engine_cs *engine = thread->engine; 1539 struct igt_spinner spin; 1540 struct i915_request *rq; 1541 int err = 0; 1542 1543 /* 1544 * Create a spinner running for eternity on each engine. If a second 1545 * spinner is incorrectly placed on the same engine, it will not be 1546 * able to start in time. 
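	 * Each worker keeps its spinner running until every engine has
	 * checked in via wait_for_all() on i915->selftest.counter; if the
	 * scheduler wrongly stacked two spinners onto one engine, the second
	 * could never start, the rendezvous would time out and the test
	 * fails.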
1547 */ 1548 1549 if (igt_spinner_init(&spin, engine->gt)) { 1550 wake_all(engine->i915); 1551 thread->result = -ENOMEM; 1552 return; 1553 } 1554 1555 intel_engine_pm_get(engine); 1556 rq = igt_spinner_create_request(&spin, 1557 engine->kernel_context, 1558 MI_NOOP); /* no preemption */ 1559 intel_engine_pm_put(engine); 1560 if (IS_ERR(rq)) { 1561 err = PTR_ERR(rq); 1562 if (err == -ENODEV) 1563 err = 0; 1564 wake_all(engine->i915); 1565 goto out_spin; 1566 } 1567 1568 i915_request_get(rq); 1569 i915_request_add(rq); 1570 if (igt_wait_for_spinner(&spin, rq)) { 1571 /* Occupy this engine for the whole test */ 1572 err = wait_for_all(engine->i915); 1573 } else { 1574 pr_err("Failed to start spinner on %s\n", engine->name); 1575 err = -EINVAL; 1576 } 1577 igt_spinner_end(&spin); 1578 1579 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1580 err = -EIO; 1581 i915_request_put(rq); 1582 1583 out_spin: 1584 igt_spinner_fini(&spin); 1585 thread->result = err; 1586 } 1587 1588 static int live_parallel_engines(void *arg) 1589 { 1590 struct drm_i915_private *i915 = arg; 1591 static void (* const func[])(struct kthread_work *) = { 1592 __live_parallel_engine1, 1593 __live_parallel_engineN, 1594 __live_parallel_spin, 1595 NULL, 1596 }; 1597 const unsigned int nengines = num_uabi_engines(i915); 1598 struct parallel_thread *threads; 1599 struct intel_engine_cs *engine; 1600 void (* const *fn)(struct kthread_work *); 1601 int err = 0; 1602 1603 /* 1604 * Check we can submit requests to all engines concurrently. This 1605 * tests that we load up the system maximally. 1606 */ 1607 1608 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL); 1609 if (!threads) 1610 return -ENOMEM; 1611 1612 for (fn = func; !err && *fn; fn++) { 1613 char name[KSYM_NAME_LEN]; 1614 struct igt_live_test t; 1615 unsigned int idx; 1616 1617 snprintf(name, sizeof(name), "%ps", *fn); 1618 err = igt_live_test_begin(&t, i915, __func__, name); 1619 if (err) 1620 break; 1621 1622 atomic_set(&i915->selftest.counter, nengines); 1623 1624 idx = 0; 1625 for_each_uabi_engine(engine, i915) { 1626 struct kthread_worker *worker; 1627 1628 worker = kthread_create_worker(0, "igt/parallel:%s", 1629 engine->name); 1630 if (IS_ERR(worker)) { 1631 err = PTR_ERR(worker); 1632 break; 1633 } 1634 1635 threads[idx].worker = worker; 1636 threads[idx].result = 0; 1637 threads[idx].engine = engine; 1638 1639 kthread_init_work(&threads[idx].work, *fn); 1640 kthread_queue_work(worker, &threads[idx].work); 1641 idx++; 1642 } 1643 1644 idx = 0; 1645 for_each_uabi_engine(engine, i915) { 1646 int status; 1647 1648 if (!threads[idx].worker) 1649 break; 1650 1651 kthread_flush_work(&threads[idx].work); 1652 status = READ_ONCE(threads[idx].result); 1653 if (status && !err) 1654 err = status; 1655 1656 kthread_destroy_worker(threads[idx++].worker); 1657 } 1658 1659 if (igt_live_test_end(&t)) 1660 err = -EIO; 1661 } 1662 1663 kfree(threads); 1664 return err; 1665 } 1666 1667 static int 1668 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1669 { 1670 struct i915_request *rq; 1671 int ret; 1672 1673 /* 1674 * Before execlists, all contexts share the same ringbuffer. With 1675 * execlists, each context/engine has a separate ringbuffer and 1676 * for the purposes of this test, inexhaustible. 1677 * 1678 * For the global ringbuffer though, we have to be very careful 1679 * that we do not wrap while preventing the execution of requests 1680 * with a unsignaled fence. 
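	 * The estimate below is simply the free ring space divided by the
	 * size of one freshly emitted request, then halved to leave some
	 * emergency headroom.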
1681 */ 1682 if (HAS_EXECLISTS(ctx->i915)) 1683 return INT_MAX; 1684 1685 rq = igt_request_alloc(ctx, engine); 1686 if (IS_ERR(rq)) { 1687 ret = PTR_ERR(rq); 1688 } else { 1689 int sz; 1690 1691 ret = rq->ring->size - rq->reserved_space; 1692 i915_request_add(rq); 1693 1694 sz = rq->ring->emit - rq->head; 1695 if (sz < 0) 1696 sz += rq->ring->size; 1697 ret /= sz; 1698 ret /= 2; /* leave half spare, in case of emergency! */ 1699 } 1700 1701 return ret; 1702 } 1703 1704 static int live_breadcrumbs_smoketest(void *arg) 1705 { 1706 struct drm_i915_private *i915 = arg; 1707 const unsigned int nengines = num_uabi_engines(i915); 1708 const unsigned int ncpus = /* saturate with nengines * ncpus */ 1709 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines)); 1710 unsigned long num_waits, num_fences; 1711 struct intel_engine_cs *engine; 1712 struct smoke_thread *threads; 1713 struct igt_live_test live; 1714 intel_wakeref_t wakeref; 1715 struct smoketest *smoke; 1716 unsigned int n, idx; 1717 struct file *file; 1718 int ret = 0; 1719 1720 /* 1721 * Smoketest our breadcrumb/signal handling for requests across multiple 1722 * threads. A very simple test to only catch the most egregious of bugs. 1723 * See __igt_breadcrumbs_smoketest(); 1724 * 1725 * On real hardware this time. 1726 */ 1727 1728 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1729 1730 file = mock_file(i915); 1731 if (IS_ERR(file)) { 1732 ret = PTR_ERR(file); 1733 goto out_rpm; 1734 } 1735 1736 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1737 if (!smoke) { 1738 ret = -ENOMEM; 1739 goto out_file; 1740 } 1741 1742 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1743 if (!threads) { 1744 ret = -ENOMEM; 1745 goto out_smoke; 1746 } 1747 1748 smoke[0].request_alloc = __live_request_alloc; 1749 smoke[0].ncontexts = 64; 1750 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1751 sizeof(*smoke[0].contexts), 1752 GFP_KERNEL); 1753 if (!smoke[0].contexts) { 1754 ret = -ENOMEM; 1755 goto out_threads; 1756 } 1757 1758 for (n = 0; n < smoke[0].ncontexts; n++) { 1759 smoke[0].contexts[n] = live_context(i915, file); 1760 if (IS_ERR(smoke[0].contexts[n])) { 1761 ret = PTR_ERR(smoke[0].contexts[n]); 1762 goto out_contexts; 1763 } 1764 } 1765 1766 ret = igt_live_test_begin(&live, i915, __func__, ""); 1767 if (ret) 1768 goto out_contexts; 1769 1770 idx = 0; 1771 for_each_uabi_engine(engine, i915) { 1772 smoke[idx] = smoke[0]; 1773 smoke[idx].engine = engine; 1774 smoke[idx].max_batch = 1775 max_batches(smoke[0].contexts[0], engine); 1776 if (smoke[idx].max_batch < 0) { 1777 ret = smoke[idx].max_batch; 1778 goto out_flush; 1779 } 1780 /* One ring interleaved between requests from all cpus */ 1781 smoke[idx].max_batch /= ncpus + 1; 1782 pr_debug("Limiting batches to %d requests on %s\n", 1783 smoke[idx].max_batch, engine->name); 1784 1785 for (n = 0; n < ncpus; n++) { 1786 unsigned int i = idx * ncpus + n; 1787 struct kthread_worker *worker; 1788 1789 worker = kthread_create_worker(0, "igt/%d.%d", idx, n); 1790 if (IS_ERR(worker)) { 1791 ret = PTR_ERR(worker); 1792 goto out_flush; 1793 } 1794 1795 threads[i].worker = worker; 1796 threads[i].t = &smoke[idx]; 1797 1798 kthread_init_work(&threads[i].work, 1799 __igt_breadcrumbs_smoketest); 1800 kthread_queue_work(worker, &threads[i].work); 1801 } 1802 1803 idx++; 1804 } 1805 1806 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1807 1808 out_flush: 1809 idx = 0; 1810 num_waits = 0; 1811 num_fences = 0; 1812 for_each_uabi_engine(engine, i915) { 1813 for (n = 0; n < 
ncpus; n++) { 1814 unsigned int i = idx * ncpus + n; 1815 int err; 1816 1817 if (!threads[i].worker) 1818 continue; 1819 1820 WRITE_ONCE(threads[i].stop, true); 1821 kthread_flush_work(&threads[i].work); 1822 err = READ_ONCE(threads[i].result); 1823 if (err < 0 && !ret) 1824 ret = err; 1825 1826 kthread_destroy_worker(threads[i].worker); 1827 } 1828 1829 num_waits += atomic_long_read(&smoke[idx].num_waits); 1830 num_fences += atomic_long_read(&smoke[idx].num_fences); 1831 idx++; 1832 } 1833 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1834 num_waits, num_fences, idx, ncpus); 1835 1836 ret = igt_live_test_end(&live) ?: ret; 1837 out_contexts: 1838 kfree(smoke[0].contexts); 1839 out_threads: 1840 kfree(threads); 1841 out_smoke: 1842 kfree(smoke); 1843 out_file: 1844 fput(file); 1845 out_rpm: 1846 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1847 1848 return ret; 1849 } 1850 1851 int i915_request_live_selftests(struct drm_i915_private *i915) 1852 { 1853 static const struct i915_subtest tests[] = { 1854 SUBTEST(live_nop_request), 1855 SUBTEST(live_all_engines), 1856 SUBTEST(live_sequential_engines), 1857 SUBTEST(live_parallel_engines), 1858 SUBTEST(live_empty_request), 1859 SUBTEST(live_cancel_request), 1860 SUBTEST(live_breadcrumbs_smoketest), 1861 }; 1862 1863 if (intel_gt_is_wedged(to_gt(i915))) 1864 return 0; 1865 1866 return i915_live_subtests(tests, i915); 1867 } 1868 1869 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1870 { 1871 struct i915_request *rq; 1872 struct dma_fence *fence; 1873 1874 rq = intel_engine_create_kernel_request(ce->engine); 1875 if (IS_ERR(rq)) 1876 return PTR_ERR(rq); 1877 1878 fence = i915_active_fence_get(&ce->timeline->last_request); 1879 if (fence) { 1880 i915_request_await_dma_fence(rq, fence); 1881 dma_fence_put(fence); 1882 } 1883 1884 rq = i915_request_get(rq); 1885 i915_request_add(rq); 1886 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1887 err = -ETIME; 1888 i915_request_put(rq); 1889 1890 while (!err && !intel_engine_is_idle(ce->engine)) 1891 intel_engine_flush_submission(ce->engine); 1892 1893 return err; 1894 } 1895 1896 struct perf_stats { 1897 struct intel_engine_cs *engine; 1898 unsigned long count; 1899 ktime_t time; 1900 ktime_t busy; 1901 u64 runtime; 1902 }; 1903 1904 struct perf_series { 1905 struct drm_i915_private *i915; 1906 unsigned int nengines; 1907 struct intel_context *ce[]; 1908 }; 1909 1910 static int cmp_u32(const void *A, const void *B) 1911 { 1912 const u32 *a = A, *b = B; 1913 1914 return *a - *b; 1915 } 1916 1917 static u32 trifilter(u32 *a) 1918 { 1919 u64 sum; 1920 1921 #define TF_COUNT 5 1922 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1923 1924 sum = mul_u32_u32(a[2], 2); 1925 sum += a[1]; 1926 sum += a[3]; 1927 1928 GEM_BUG_ON(sum > U32_MAX); 1929 return sum; 1930 #define TF_BIAS 2 1931 } 1932 1933 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1934 { 1935 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1936 1937 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1938 } 1939 1940 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1941 { 1942 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1943 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1944 *cs++ = offset; 1945 *cs++ = 0; 1946 1947 return cs; 1948 } 1949 1950 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1951 { 1952 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1953 *cs++ = offset; 1954 *cs++ = 0; 1955 *cs++ = 
value; 1956 1957 return cs; 1958 } 1959 1960 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1961 { 1962 *cs++ = MI_SEMAPHORE_WAIT | 1963 MI_SEMAPHORE_GLOBAL_GTT | 1964 MI_SEMAPHORE_POLL | 1965 mode; 1966 *cs++ = value; 1967 *cs++ = offset; 1968 *cs++ = 0; 1969 1970 return cs; 1971 } 1972 1973 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1974 { 1975 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1976 } 1977 1978 static void semaphore_set(u32 *sema, u32 value) 1979 { 1980 WRITE_ONCE(*sema, value); 1981 wmb(); /* flush the update to the cache, and beyond */ 1982 } 1983 1984 static u32 *hwsp_scratch(const struct intel_context *ce) 1985 { 1986 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 1987 } 1988 1989 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 1990 { 1991 return (i915_ggtt_offset(ce->engine->status_page.vma) + 1992 offset_in_page(dw)); 1993 } 1994 1995 static int measure_semaphore_response(struct intel_context *ce) 1996 { 1997 u32 *sema = hwsp_scratch(ce); 1998 const u32 offset = hwsp_offset(ce, sema); 1999 u32 elapsed[TF_COUNT], cycles; 2000 struct i915_request *rq; 2001 u32 *cs; 2002 int err; 2003 int i; 2004 2005 /* 2006 * Measure how many cycles it takes for the HW to detect the change 2007 * in a semaphore value. 2008 * 2009 * A: read CS_TIMESTAMP from CPU 2010 * poke semaphore 2011 * B: read CS_TIMESTAMP on GPU 2012 * 2013 * Semaphore latency: B - A 2014 */ 2015 2016 semaphore_set(sema, -1); 2017 2018 rq = i915_request_create(ce); 2019 if (IS_ERR(rq)) 2020 return PTR_ERR(rq); 2021 2022 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 2023 if (IS_ERR(cs)) { 2024 i915_request_add(rq); 2025 err = PTR_ERR(cs); 2026 goto err; 2027 } 2028 2029 cs = emit_store_dw(cs, offset, 0); 2030 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2031 cs = emit_semaphore_poll_until(cs, offset, i); 2032 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2033 cs = emit_store_dw(cs, offset, 0); 2034 } 2035 2036 intel_ring_advance(rq, cs); 2037 i915_request_add(rq); 2038 2039 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2040 err = -EIO; 2041 goto err; 2042 } 2043 2044 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2045 preempt_disable(); 2046 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2047 semaphore_set(sema, i); 2048 preempt_enable(); 2049 2050 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2051 err = -EIO; 2052 goto err; 2053 } 2054 2055 elapsed[i - 1] = sema[i] - cycles; 2056 } 2057 2058 cycles = trifilter(elapsed); 2059 pr_info("%s: semaphore response %d cycles, %lluns\n", 2060 ce->engine->name, cycles >> TF_BIAS, 2061 cycles_to_ns(ce->engine, cycles)); 2062 2063 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2064 2065 err: 2066 intel_gt_set_wedged(ce->engine->gt); 2067 return err; 2068 } 2069 2070 static int measure_idle_dispatch(struct intel_context *ce) 2071 { 2072 u32 *sema = hwsp_scratch(ce); 2073 const u32 offset = hwsp_offset(ce, sema); 2074 u32 elapsed[TF_COUNT], cycles; 2075 u32 *cs; 2076 int err; 2077 int i; 2078 2079 /* 2080 * Measure how long it takes for us to submit a request while the 2081 * engine is idle, but is resting in our context. 
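	 * The engine is explicitly idled with intel_gt_wait_for_idle() before
	 * each sample, and the CPU timestamp is taken with preemption and
	 * softirqs disabled right around i915_request_add(), keeping point A
	 * as close to the actual submission as we can manage.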
2082 * 2083 * A: read CS_TIMESTAMP from CPU 2084 * submit request 2085 * B: read CS_TIMESTAMP on GPU 2086 * 2087 * Submission latency: B - A 2088 */ 2089 2090 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2091 struct i915_request *rq; 2092 2093 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2094 if (err) 2095 return err; 2096 2097 rq = i915_request_create(ce); 2098 if (IS_ERR(rq)) { 2099 err = PTR_ERR(rq); 2100 goto err; 2101 } 2102 2103 cs = intel_ring_begin(rq, 4); 2104 if (IS_ERR(cs)) { 2105 i915_request_add(rq); 2106 err = PTR_ERR(cs); 2107 goto err; 2108 } 2109 2110 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2111 2112 intel_ring_advance(rq, cs); 2113 2114 preempt_disable(); 2115 local_bh_disable(); 2116 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2117 i915_request_add(rq); 2118 local_bh_enable(); 2119 preempt_enable(); 2120 } 2121 2122 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2123 if (err) 2124 goto err; 2125 2126 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2127 elapsed[i] = sema[i] - elapsed[i]; 2128 2129 cycles = trifilter(elapsed); 2130 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2131 ce->engine->name, cycles >> TF_BIAS, 2132 cycles_to_ns(ce->engine, cycles)); 2133 2134 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2135 2136 err: 2137 intel_gt_set_wedged(ce->engine->gt); 2138 return err; 2139 } 2140 2141 static int measure_busy_dispatch(struct intel_context *ce) 2142 { 2143 u32 *sema = hwsp_scratch(ce); 2144 const u32 offset = hwsp_offset(ce, sema); 2145 u32 elapsed[TF_COUNT + 1], cycles; 2146 u32 *cs; 2147 int err; 2148 int i; 2149 2150 /* 2151 * Measure how long it takes for us to submit a request while the 2152 * engine is busy, polling on a semaphore in our context. With 2153 * direct submission, this will include the cost of a lite restore. 
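	 * Each request first marks its result slot with -1 and then polls the
	 * shared semaphore; the CPU only releases the previous poller after
	 * it has timestamped and submitted the next request, so the engine
	 * should still be busy at the moment of submission.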
2154 * 2155 * A: read CS_TIMESTAMP from CPU 2156 * submit request 2157 * B: read CS_TIMESTAMP on GPU 2158 * 2159 * Submission latency: B - A 2160 */ 2161 2162 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2163 struct i915_request *rq; 2164 2165 rq = i915_request_create(ce); 2166 if (IS_ERR(rq)) { 2167 err = PTR_ERR(rq); 2168 goto err; 2169 } 2170 2171 cs = intel_ring_begin(rq, 12); 2172 if (IS_ERR(cs)) { 2173 i915_request_add(rq); 2174 err = PTR_ERR(cs); 2175 goto err; 2176 } 2177 2178 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2179 cs = emit_semaphore_poll_until(cs, offset, i); 2180 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2181 2182 intel_ring_advance(rq, cs); 2183 2184 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2185 err = -EIO; 2186 goto err; 2187 } 2188 2189 preempt_disable(); 2190 local_bh_disable(); 2191 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2192 i915_request_add(rq); 2193 local_bh_enable(); 2194 semaphore_set(sema, i - 1); 2195 preempt_enable(); 2196 } 2197 2198 wait_for(READ_ONCE(sema[i - 1]), 500); 2199 semaphore_set(sema, i - 1); 2200 2201 for (i = 1; i <= TF_COUNT; i++) { 2202 GEM_BUG_ON(sema[i] == -1); 2203 elapsed[i - 1] = sema[i] - elapsed[i]; 2204 } 2205 2206 cycles = trifilter(elapsed); 2207 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2208 ce->engine->name, cycles >> TF_BIAS, 2209 cycles_to_ns(ce->engine, cycles)); 2210 2211 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2212 2213 err: 2214 intel_gt_set_wedged(ce->engine->gt); 2215 return err; 2216 } 2217 2218 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2219 { 2220 const u32 offset = 2221 i915_ggtt_offset(engine->status_page.vma) + 2222 offset_in_page(sema); 2223 struct i915_request *rq; 2224 u32 *cs; 2225 2226 rq = i915_request_create(engine->kernel_context); 2227 if (IS_ERR(rq)) 2228 return PTR_ERR(rq); 2229 2230 cs = intel_ring_begin(rq, 4); 2231 if (IS_ERR(cs)) { 2232 i915_request_add(rq); 2233 return PTR_ERR(cs); 2234 } 2235 2236 cs = emit_semaphore_poll(cs, mode, value, offset); 2237 2238 intel_ring_advance(rq, cs); 2239 i915_request_add(rq); 2240 2241 return 0; 2242 } 2243 2244 static int measure_inter_request(struct intel_context *ce) 2245 { 2246 u32 *sema = hwsp_scratch(ce); 2247 const u32 offset = hwsp_offset(ce, sema); 2248 u32 elapsed[TF_COUNT + 1], cycles; 2249 struct i915_sw_fence *submit; 2250 int i, err; 2251 2252 /* 2253 * Measure how long it takes to advance from one request into the 2254 * next. Between each request we flush the GPU caches to memory, 2255 * update the breadcrumbs, and then invalidate those caches. 2256 * We queue up all the requests to be submitted in one batch so 2257 * it should be one set of contiguous measurements. 
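	 * Submission is held back by plugging the kernel context on a
	 * semaphore and gating every request on one submit fence; once both
	 * are released the requests should run back to back, each merely
	 * storing RING_TIMESTAMP, so the delta between consecutive stores is
	 * the per-request overhead.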
	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	submit = heap_fence_create(GFP_KERNEL);
	if (!submit) {
		semaphore_set(sema, 1);
		return -ENOMEM;
	}

	intel_engine_flush_submission(ce->engine);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;
		u32 *cs;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_submit;
		}

		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
						       submit,
						       GFP_KERNEL);
		if (err < 0) {
			i915_request_add(rq);
			goto err_submit;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err_submit;
		}

		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);
	}
	i915_sw_fence_commit(submit);
	intel_engine_flush_submission(ce->engine);
	heap_fence_put(submit);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[i + 1] - sema[i];

	cycles = trifilter(elapsed);
	pr_info("%s: inter-request latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_submit:
	i915_sw_fence_commit(submit);
	heap_fence_put(submit);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    switch context
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

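	/*
	 * Plug the engine so the whole chain of requests is queued before
	 * any execute. The requests alternate between ce and the kernel
	 * context, each writing its timestamp to the next dword, so the
	 * difference between consecutive timestamps always spans a switch
	 * between the two contexts.
	 */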
	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting
	 * request. The second latency is how long it takes for us to return
	 * from the preemption back to the original context.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    submit preemption
	 * B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    context switch
	 * C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

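	/*
	 * Each pass first queues a request on ce that spins on the
	 * semaphore, then injects a kernel-context request at
	 * I915_PRIORITY_BARRIER to preempt it. The preempting request
	 * timestamps on the GPU (B) and releases the spinner, so the
	 * original context resumes and writes its own timestamp (C);
	 * A is sampled on the CPU just before submitting the preemption.
	 */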
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

struct signal_cb {
	struct dma_fence_cb base;
	bool seen;
};

static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct signal_cb *s = container_of(cb, typeof(*s), base);

	smp_store_mb(s->seen, true); /* be safe, be strong */
}

static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) to be
	 * sent from the GPU and then processed by the CPU.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    signal
	 * B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

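	/*
	 * Each request spins on the semaphore and then writes its GPU
	 * timestamp (A) just before completing. On the CPU we install a
	 * dma-fence callback, release the spinner, busy-wait for the
	 * callback to fire and then sample the CPU timestamp (B), so that
	 * B - A covers the interrupt delivery and fence signaling path.
	 */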
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

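/*
 * The throughput tests below exercise three submission strategies,
 * round-robin across every uabi engine:
 *
 *   s_sync0 - submit one request at a time and wait for it to complete
 *             before moving on to the next engine.
 *   s_sync1 - submit a request on each engine, but only wait upon the
 *             previous request, keeping one request in flight.
 *   s_many  - submit requests as fast as we can without ever waiting,
 *             to measure raw submission throughput.
 */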
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

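		/*
		 * Snapshot the per-engine stats before running the
		 * workload: the busy time is biased by +1 so that a
		 * non-zero value doubles as a "stats supported" flag, and
		 * the context runtime is negated so that adding the final
		 * total afterwards yields the delta for this run.
		 */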
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

struct p_thread {
	struct perf_stats p;
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	int result;
};

static void p_sync0(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

static void p_sync1(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

static void p_many(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

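/*
 * Run each of the submission strategies above on all engines at the same
 * time, one kthread worker per engine, and report the per-engine request
 * count, busyness and runtime.
 */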
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static void (* const func[])(struct kthread_work *) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	void (* const *fn)(struct kthread_work *);
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	struct p_thread *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct kthread_worker *worker;

			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));

			worker = kthread_create_worker(0, "igt:%s",
						       engine->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				intel_engine_pm_put(engine);
				break;
			}
			engines[idx].worker = worker;
			engines[idx].result = 0;
			engines[idx].p.engine = engine;
			engines[idx].engine = engine;

			kthread_init_work(&engines[idx].work, *fn);
			kthread_queue_work(worker, &engines[idx].work);
			idx++;
		}

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (!engines[idx].worker)
				break;

			kthread_flush_work(&engines[idx].work);
			status = READ_ONCE(engines[idx].result);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);

			kthread_destroy_worker(engines[idx].worker);
			idx++;
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_subtests(tests, i915);
}