1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_internal.h" 30 #include "gem/i915_gem_pm.h" 31 #include "gem/selftests/mock_context.h" 32 33 #include "gt/intel_engine_heartbeat.h" 34 #include "gt/intel_engine_pm.h" 35 #include "gt/intel_engine_user.h" 36 #include "gt/intel_gt.h" 37 #include "gt/intel_gt_clock_utils.h" 38 #include "gt/intel_gt_requests.h" 39 #include "gt/selftest_engine_heartbeat.h" 40 41 #include "i915_random.h" 42 #include "i915_selftest.h" 43 #include "igt_flush_test.h" 44 #include "igt_live_test.h" 45 #include "igt_spinner.h" 46 #include "lib_sw_fence.h" 47 48 #include "mock_drm.h" 49 #include "mock_gem_device.h" 50 51 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 52 { 53 struct intel_engine_cs *engine; 54 unsigned int count; 55 56 count = 0; 57 for_each_uabi_engine(engine, i915) 58 count++; 59 60 return count; 61 } 62 63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 64 { 65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 66 } 67 68 static int igt_add_request(void *arg) 69 { 70 struct drm_i915_private *i915 = arg; 71 struct i915_request *request; 72 73 /* Basic preliminary test to create a request and let it loose! */ 74 75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 76 if (!request) 77 return -ENOMEM; 78 79 i915_request_add(request); 80 81 return 0; 82 } 83 84 static int igt_wait_request(void *arg) 85 { 86 const long T = HZ / 4; 87 struct drm_i915_private *i915 = arg; 88 struct i915_request *request; 89 int err = -EINVAL; 90 91 /* Submit a request, then wait upon it */ 92 93 request = mock_request(rcs0(i915)->kernel_context, T); 94 if (!request) 95 return -ENOMEM; 96 97 i915_request_get(request); 98 99 if (i915_request_wait(request, 0, 0) != -ETIME) { 100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 101 goto out_request; 102 } 103 104 if (i915_request_wait(request, 0, T) != -ETIME) { 105 pr_err("request wait succeeded (expected timeout before submit!)\n"); 106 goto out_request; 107 } 108 109 if (i915_request_completed(request)) { 110 pr_err("request completed before submit!!\n"); 111 goto out_request; 112 } 113 114 i915_request_add(request); 115 116 if (i915_request_wait(request, 0, 0) != -ETIME) { 117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 118 goto out_request; 119 } 120 121 if (i915_request_completed(request)) { 122 pr_err("request completed immediately!\n"); 123 goto out_request; 124 } 125 126 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 127 pr_err("request wait succeeded (expected timeout!)\n"); 128 goto out_request; 129 } 130 131 if (i915_request_wait(request, 0, T) == -ETIME) { 132 pr_err("request wait timed out!\n"); 133 goto out_request; 134 } 135 136 if (!i915_request_completed(request)) { 137 pr_err("request not complete after waiting!\n"); 138 goto out_request; 139 } 140 141 if (i915_request_wait(request, 0, T) == -ETIME) { 142 pr_err("request wait timed out when already complete!\n"); 143 goto out_request; 144 } 145 146 err = 0; 147 out_request: 148 i915_request_put(request); 149 mock_device_flush(i915); 150 return err; 151 } 152 153 static int igt_fence_wait(void *arg) 154 { 155 const long T = HZ / 4; 156 struct drm_i915_private *i915 = arg; 157 struct i915_request *request; 158 int err = -EINVAL; 159 160 /* Submit a request, treat it as a fence and wait upon it */ 161 162 request = mock_request(rcs0(i915)->kernel_context, T); 163 if (!request) 164 return -ENOMEM; 165 166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 167 pr_err("fence wait success before submit (expected timeout)!\n"); 168 goto out; 169 } 170 171 i915_request_add(request); 172 173 if (dma_fence_is_signaled(&request->fence)) { 174 pr_err("fence signaled immediately!\n"); 175 goto out; 176 } 177 178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 179 pr_err("fence wait success after submit (expected timeout)!\n"); 180 goto out; 181 } 182 183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 184 pr_err("fence wait timed out (expected success)!\n"); 185 goto out; 186 } 187 188 if (!dma_fence_is_signaled(&request->fence)) { 189 pr_err("fence unsignaled after waiting!\n"); 190 goto out; 191 } 192 193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 194 pr_err("fence wait timed out when complete (expected success)!\n"); 195 goto out; 196 } 197 198 err = 0; 199 out: 200 mock_device_flush(i915); 201 return err; 202 } 203 204 static int igt_request_rewind(void *arg) 205 { 206 struct drm_i915_private *i915 = arg; 207 struct i915_request *request, *vip; 208 struct i915_gem_context *ctx[2]; 209 struct intel_context *ce; 210 int err = -EINVAL; 211 212 ctx[0] = mock_context(i915, "A"); 213 if (!ctx[0]) { 214 err = -ENOMEM; 215 goto err_ctx_0; 216 } 217 218 ce = i915_gem_context_get_engine(ctx[0], RCS0); 219 GEM_BUG_ON(IS_ERR(ce)); 220 request = mock_request(ce, 2 * HZ); 221 intel_context_put(ce); 222 if (!request) { 223 err = -ENOMEM; 224 goto err_context_0; 225 } 226 227 i915_request_get(request); 228 i915_request_add(request); 229 230 ctx[1] = mock_context(i915, "B"); 231 if (!ctx[1]) { 232 err = -ENOMEM; 233 goto err_ctx_1; 234 } 235 236 ce = i915_gem_context_get_engine(ctx[1], RCS0); 237 GEM_BUG_ON(IS_ERR(ce)); 238 vip = mock_request(ce, 0); 239 intel_context_put(ce); 240 if (!vip) { 241 err = -ENOMEM; 242 goto err_context_1; 243 } 244 245 /* Simulate preemption by manual reordering */ 246 if (!mock_cancel_request(request)) { 247 pr_err("failed to cancel request (already executed)!\n"); 248 i915_request_add(vip); 249 goto err_context_1; 250 } 251 i915_request_get(vip); 252 i915_request_add(vip); 253 rcu_read_lock(); 254 request->engine->submit_request(request); 255 rcu_read_unlock(); 256 257 258 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 259 pr_err("timed out waiting for high priority request\n"); 260 goto err; 261 } 262 263 if (i915_request_completed(request)) { 264 pr_err("low priority request already completed\n"); 265 goto err; 266 } 267 268 err = 0; 269 err: 270 i915_request_put(vip); 271 err_context_1: 272 mock_context_close(ctx[1]); 273 err_ctx_1: 274 i915_request_put(request); 275 err_context_0: 276 mock_context_close(ctx[0]); 277 err_ctx_0: 278 mock_device_flush(i915); 279 return err; 280 } 281 282 struct smoketest { 283 struct intel_engine_cs *engine; 284 struct i915_gem_context **contexts; 285 atomic_long_t num_waits, num_fences; 286 int ncontexts, max_batch; 287 struct i915_request *(*request_alloc)(struct intel_context *ce); 288 }; 289 290 static struct i915_request * 291 __mock_request_alloc(struct intel_context *ce) 292 { 293 return mock_request(ce, 0); 294 } 295 296 static struct i915_request * 297 __live_request_alloc(struct intel_context *ce) 298 { 299 return intel_context_create_request(ce); 300 } 301 302 struct smoke_thread { 303 struct kthread_worker *worker; 304 struct kthread_work work; 305 struct smoketest *t; 306 bool stop; 307 int result; 308 }; 309 310 static void __igt_breadcrumbs_smoketest(struct kthread_work *work) 311 { 312 struct smoke_thread *thread = container_of(work, typeof(*thread), work); 313 struct smoketest *t = thread->t; 314 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 315 const unsigned int total = 4 * t->ncontexts + 1; 316 unsigned int num_waits = 0, num_fences = 0; 317 struct i915_request **requests; 318 I915_RND_STATE(prng); 319 unsigned int *order; 320 int err = 0; 321 322 /* 323 * A very simple test to catch the most egregious of list handling bugs. 324 * 325 * At its heart, we simply create oodles of requests running across 326 * multiple kthreads and enable signaling on them, for the sole purpose 327 * of stressing our breadcrumb handling. The only inspection we do is 328 * that the fences were marked as signaled. 329 */ 330 331 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 332 if (!requests) { 333 thread->result = -ENOMEM; 334 return; 335 } 336 337 order = i915_random_order(total, &prng); 338 if (!order) { 339 err = -ENOMEM; 340 goto out_requests; 341 } 342 343 while (!READ_ONCE(thread->stop)) { 344 struct i915_sw_fence *submit, *wait; 345 unsigned int n, count; 346 347 submit = heap_fence_create(GFP_KERNEL); 348 if (!submit) { 349 err = -ENOMEM; 350 break; 351 } 352 353 wait = heap_fence_create(GFP_KERNEL); 354 if (!wait) { 355 i915_sw_fence_commit(submit); 356 heap_fence_put(submit); 357 err = -ENOMEM; 358 break; 359 } 360 361 i915_random_reorder(order, total, &prng); 362 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 363 364 for (n = 0; n < count; n++) { 365 struct i915_gem_context *ctx = 366 t->contexts[order[n] % t->ncontexts]; 367 struct i915_request *rq; 368 struct intel_context *ce; 369 370 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 371 GEM_BUG_ON(IS_ERR(ce)); 372 rq = t->request_alloc(ce); 373 intel_context_put(ce); 374 if (IS_ERR(rq)) { 375 err = PTR_ERR(rq); 376 count = n; 377 break; 378 } 379 380 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 381 submit, 382 GFP_KERNEL); 383 384 requests[n] = i915_request_get(rq); 385 i915_request_add(rq); 386 387 if (err >= 0) 388 err = i915_sw_fence_await_dma_fence(wait, 389 &rq->fence, 390 0, 391 GFP_KERNEL); 392 393 if (err < 0) { 394 i915_request_put(rq); 395 count = n; 396 break; 397 } 398 } 399 400 i915_sw_fence_commit(submit); 401 i915_sw_fence_commit(wait); 402 403 if (!wait_event_timeout(wait->wait, 404 i915_sw_fence_done(wait), 405 5 * HZ)) { 406 struct i915_request *rq = requests[count - 1]; 407 408 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 409 atomic_read(&wait->pending), count, 410 rq->fence.context, rq->fence.seqno, 411 t->engine->name); 412 GEM_TRACE_DUMP(); 413 414 intel_gt_set_wedged(t->engine->gt); 415 GEM_BUG_ON(!i915_request_completed(rq)); 416 i915_sw_fence_wait(wait); 417 err = -EIO; 418 } 419 420 for (n = 0; n < count; n++) { 421 struct i915_request *rq = requests[n]; 422 423 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 424 &rq->fence.flags)) { 425 pr_err("%llu:%llu was not signaled!\n", 426 rq->fence.context, rq->fence.seqno); 427 err = -EINVAL; 428 } 429 430 i915_request_put(rq); 431 } 432 433 heap_fence_put(wait); 434 heap_fence_put(submit); 435 436 if (err < 0) 437 break; 438 439 num_fences += count; 440 num_waits++; 441 442 cond_resched(); 443 } 444 445 atomic_long_add(num_fences, &t->num_fences); 446 atomic_long_add(num_waits, &t->num_waits); 447 448 kfree(order); 449 out_requests: 450 kfree(requests); 451 thread->result = err; 452 } 453 454 static int mock_breadcrumbs_smoketest(void *arg) 455 { 456 struct drm_i915_private *i915 = arg; 457 struct smoketest t = { 458 .engine = rcs0(i915), 459 .ncontexts = 1024, 460 .max_batch = 1024, 461 .request_alloc = __mock_request_alloc 462 }; 463 unsigned int ncpus = num_online_cpus(); 464 struct smoke_thread *threads; 465 unsigned int n; 466 int ret = 0; 467 468 /* 469 * Smoketest our breadcrumb/signal handling for requests across multiple 470 * threads. A very simple test to only catch the most egregious of bugs. 471 * See __igt_breadcrumbs_smoketest(); 472 */ 473 474 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 475 if (!threads) 476 return -ENOMEM; 477 478 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 479 if (!t.contexts) { 480 ret = -ENOMEM; 481 goto out_threads; 482 } 483 484 for (n = 0; n < t.ncontexts; n++) { 485 t.contexts[n] = mock_context(t.engine->i915, "mock"); 486 if (!t.contexts[n]) { 487 ret = -ENOMEM; 488 goto out_contexts; 489 } 490 } 491 492 for (n = 0; n < ncpus; n++) { 493 struct kthread_worker *worker; 494 495 worker = kthread_create_worker(0, "igt/%d", n); 496 if (IS_ERR(worker)) { 497 ret = PTR_ERR(worker); 498 ncpus = n; 499 break; 500 } 501 502 threads[n].worker = worker; 503 threads[n].t = &t; 504 threads[n].stop = false; 505 threads[n].result = 0; 506 507 kthread_init_work(&threads[n].work, 508 __igt_breadcrumbs_smoketest); 509 kthread_queue_work(worker, &threads[n].work); 510 } 511 512 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 513 514 for (n = 0; n < ncpus; n++) { 515 int err; 516 517 WRITE_ONCE(threads[n].stop, true); 518 kthread_flush_work(&threads[n].work); 519 err = READ_ONCE(threads[n].result); 520 if (err < 0 && !ret) 521 ret = err; 522 523 kthread_destroy_worker(threads[n].worker); 524 } 525 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 526 atomic_long_read(&t.num_waits), 527 atomic_long_read(&t.num_fences), 528 ncpus); 529 530 out_contexts: 531 for (n = 0; n < t.ncontexts; n++) { 532 if (!t.contexts[n]) 533 break; 534 mock_context_close(t.contexts[n]); 535 } 536 kfree(t.contexts); 537 out_threads: 538 kfree(threads); 539 return ret; 540 } 541 542 int i915_request_mock_selftests(void) 543 { 544 static const struct i915_subtest tests[] = { 545 SUBTEST(igt_add_request), 546 SUBTEST(igt_wait_request), 547 SUBTEST(igt_fence_wait), 548 SUBTEST(igt_request_rewind), 549 SUBTEST(mock_breadcrumbs_smoketest), 550 }; 551 struct drm_i915_private *i915; 552 intel_wakeref_t wakeref; 553 int err = 0; 554 555 i915 = mock_gem_device(); 556 if (!i915) 557 return -ENOMEM; 558 559 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 560 err = i915_subtests(tests, i915); 561 562 mock_destroy_device(i915); 563 564 return err; 565 } 566 567 static int live_nop_request(void *arg) 568 { 569 struct drm_i915_private *i915 = arg; 570 struct intel_engine_cs *engine; 571 struct igt_live_test t; 572 int err = -ENODEV; 573 574 /* 575 * Submit various sized batches of empty requests, to each engine 576 * (individually), and wait for the batch to complete. We can check 577 * the overhead of submitting requests to the hardware. 578 */ 579 580 for_each_uabi_engine(engine, i915) { 581 unsigned long n, prime; 582 IGT_TIMEOUT(end_time); 583 ktime_t times[2] = {}; 584 585 err = igt_live_test_begin(&t, i915, __func__, engine->name); 586 if (err) 587 return err; 588 589 intel_engine_pm_get(engine); 590 for_each_prime_number_from(prime, 1, 8192) { 591 struct i915_request *request = NULL; 592 593 times[1] = ktime_get_raw(); 594 595 for (n = 0; n < prime; n++) { 596 i915_request_put(request); 597 request = i915_request_create(engine->kernel_context); 598 if (IS_ERR(request)) 599 return PTR_ERR(request); 600 601 /* 602 * This space is left intentionally blank. 603 * 604 * We do not actually want to perform any 605 * action with this request, we just want 606 * to measure the latency in allocation 607 * and submission of our breadcrumbs - 608 * ensuring that the bare request is sufficient 609 * for the system to work (i.e. proper HEAD 610 * tracking of the rings, interrupt handling, 611 * etc). It also gives us the lowest bounds 612 * for latency. 613 */ 614 615 i915_request_get(request); 616 i915_request_add(request); 617 } 618 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 619 i915_request_put(request); 620 621 times[1] = ktime_sub(ktime_get_raw(), times[1]); 622 if (prime == 1) 623 times[0] = times[1]; 624 625 if (__igt_timeout(end_time, NULL)) 626 break; 627 } 628 intel_engine_pm_put(engine); 629 630 err = igt_live_test_end(&t); 631 if (err) 632 return err; 633 634 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 635 engine->name, 636 ktime_to_ns(times[0]), 637 prime, div64_u64(ktime_to_ns(times[1]), prime)); 638 } 639 640 return err; 641 } 642 643 static int __cancel_inactive(struct intel_engine_cs *engine) 644 { 645 struct intel_context *ce; 646 struct igt_spinner spin; 647 struct i915_request *rq; 648 int err = 0; 649 650 if (igt_spinner_init(&spin, engine->gt)) 651 return -ENOMEM; 652 653 ce = intel_context_create(engine); 654 if (IS_ERR(ce)) { 655 err = PTR_ERR(ce); 656 goto out_spin; 657 } 658 659 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 660 if (IS_ERR(rq)) { 661 err = PTR_ERR(rq); 662 goto out_ce; 663 } 664 665 pr_debug("%s: Cancelling inactive request\n", engine->name); 666 i915_request_cancel(rq, -EINTR); 667 i915_request_get(rq); 668 i915_request_add(rq); 669 670 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 671 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 672 673 pr_err("%s: Failed to cancel inactive request\n", engine->name); 674 intel_engine_dump(engine, &p, "%s\n", engine->name); 675 err = -ETIME; 676 goto out_rq; 677 } 678 679 if (rq->fence.error != -EINTR) { 680 pr_err("%s: fence not cancelled (%u)\n", 681 engine->name, rq->fence.error); 682 err = -EINVAL; 683 } 684 685 out_rq: 686 i915_request_put(rq); 687 out_ce: 688 intel_context_put(ce); 689 out_spin: 690 igt_spinner_fini(&spin); 691 if (err) 692 pr_err("%s: %s error %d\n", __func__, engine->name, err); 693 return err; 694 } 695 696 static int __cancel_active(struct intel_engine_cs *engine) 697 { 698 struct intel_context *ce; 699 struct igt_spinner spin; 700 struct i915_request *rq; 701 int err = 0; 702 703 if (igt_spinner_init(&spin, engine->gt)) 704 return -ENOMEM; 705 706 ce = intel_context_create(engine); 707 if (IS_ERR(ce)) { 708 err = PTR_ERR(ce); 709 goto out_spin; 710 } 711 712 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 713 if (IS_ERR(rq)) { 714 err = PTR_ERR(rq); 715 goto out_ce; 716 } 717 718 pr_debug("%s: Cancelling active request\n", engine->name); 719 i915_request_get(rq); 720 i915_request_add(rq); 721 if (!igt_wait_for_spinner(&spin, rq)) { 722 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 723 724 pr_err("Failed to start spinner on %s\n", engine->name); 725 intel_engine_dump(engine, &p, "%s\n", engine->name); 726 err = -ETIME; 727 goto out_rq; 728 } 729 i915_request_cancel(rq, -EINTR); 730 731 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 732 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 733 734 pr_err("%s: Failed to cancel active request\n", engine->name); 735 intel_engine_dump(engine, &p, "%s\n", engine->name); 736 err = -ETIME; 737 goto out_rq; 738 } 739 740 if (rq->fence.error != -EINTR) { 741 pr_err("%s: fence not cancelled (%u)\n", 742 engine->name, rq->fence.error); 743 err = -EINVAL; 744 } 745 746 out_rq: 747 i915_request_put(rq); 748 out_ce: 749 intel_context_put(ce); 750 out_spin: 751 igt_spinner_fini(&spin); 752 if (err) 753 pr_err("%s: %s error %d\n", __func__, engine->name, err); 754 return err; 755 } 756 757 static int __cancel_completed(struct intel_engine_cs *engine) 758 { 759 struct intel_context *ce; 760 struct igt_spinner spin; 761 struct i915_request *rq; 762 int err = 0; 763 764 if (igt_spinner_init(&spin, engine->gt)) 765 return -ENOMEM; 766 767 ce = intel_context_create(engine); 768 if (IS_ERR(ce)) { 769 err = PTR_ERR(ce); 770 goto out_spin; 771 } 772 773 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 774 if (IS_ERR(rq)) { 775 err = PTR_ERR(rq); 776 goto out_ce; 777 } 778 igt_spinner_end(&spin); 779 i915_request_get(rq); 780 i915_request_add(rq); 781 782 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 783 err = -ETIME; 784 goto out_rq; 785 } 786 787 pr_debug("%s: Cancelling completed request\n", engine->name); 788 i915_request_cancel(rq, -EINTR); 789 if (rq->fence.error) { 790 pr_err("%s: fence not cancelled (%u)\n", 791 engine->name, rq->fence.error); 792 err = -EINVAL; 793 } 794 795 out_rq: 796 i915_request_put(rq); 797 out_ce: 798 intel_context_put(ce); 799 out_spin: 800 igt_spinner_fini(&spin); 801 if (err) 802 pr_err("%s: %s error %d\n", __func__, engine->name, err); 803 return err; 804 } 805 806 /* 807 * Test to prove a non-preemptable request can be cancelled and a subsequent 808 * request on the same context can successfully complete after cancellation. 809 * 810 * Testing methodology is to create a non-preemptible request and submit it, 811 * wait for spinner to start, create a NOP request and submit it, cancel the 812 * spinner, wait for spinner to complete and verify it failed with an error, 813 * finally wait for NOP request to complete verify it succeeded without an 814 * error. Preemption timeout also reduced / restored so test runs in a timely 815 * maner. 816 */ 817 static int __cancel_reset(struct drm_i915_private *i915, 818 struct intel_engine_cs *engine) 819 { 820 struct intel_context *ce; 821 struct igt_spinner spin; 822 struct i915_request *rq, *nop; 823 unsigned long preempt_timeout_ms; 824 int err = 0; 825 826 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT || 827 !intel_has_reset_engine(engine->gt)) 828 return 0; 829 830 preempt_timeout_ms = engine->props.preempt_timeout_ms; 831 engine->props.preempt_timeout_ms = 100; 832 833 if (igt_spinner_init(&spin, engine->gt)) 834 goto out_restore; 835 836 ce = intel_context_create(engine); 837 if (IS_ERR(ce)) { 838 err = PTR_ERR(ce); 839 goto out_spin; 840 } 841 842 rq = igt_spinner_create_request(&spin, ce, MI_NOOP); 843 if (IS_ERR(rq)) { 844 err = PTR_ERR(rq); 845 goto out_ce; 846 } 847 848 pr_debug("%s: Cancelling active non-preemptable request\n", 849 engine->name); 850 i915_request_get(rq); 851 i915_request_add(rq); 852 if (!igt_wait_for_spinner(&spin, rq)) { 853 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 854 855 pr_err("Failed to start spinner on %s\n", engine->name); 856 intel_engine_dump(engine, &p, "%s\n", engine->name); 857 err = -ETIME; 858 goto out_rq; 859 } 860 861 nop = intel_context_create_request(ce); 862 if (IS_ERR(nop)) 863 goto out_rq; 864 i915_request_get(nop); 865 i915_request_add(nop); 866 867 i915_request_cancel(rq, -EINTR); 868 869 if (i915_request_wait(rq, 0, HZ) < 0) { 870 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 871 872 pr_err("%s: Failed to cancel hung request\n", engine->name); 873 intel_engine_dump(engine, &p, "%s\n", engine->name); 874 err = -ETIME; 875 goto out_nop; 876 } 877 878 if (rq->fence.error != -EINTR) { 879 pr_err("%s: fence not cancelled (%u)\n", 880 engine->name, rq->fence.error); 881 err = -EINVAL; 882 goto out_nop; 883 } 884 885 if (i915_request_wait(nop, 0, HZ) < 0) { 886 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 887 888 pr_err("%s: Failed to complete nop request\n", engine->name); 889 intel_engine_dump(engine, &p, "%s\n", engine->name); 890 err = -ETIME; 891 goto out_nop; 892 } 893 894 if (nop->fence.error != 0) { 895 pr_err("%s: Nop request errored (%u)\n", 896 engine->name, nop->fence.error); 897 err = -EINVAL; 898 } 899 900 out_nop: 901 i915_request_put(nop); 902 out_rq: 903 i915_request_put(rq); 904 out_ce: 905 intel_context_put(ce); 906 out_spin: 907 igt_spinner_fini(&spin); 908 out_restore: 909 engine->props.preempt_timeout_ms = preempt_timeout_ms; 910 if (err) 911 pr_err("%s: %s error %d\n", __func__, engine->name, err); 912 return err; 913 } 914 915 static int live_cancel_request(void *arg) 916 { 917 struct drm_i915_private *i915 = arg; 918 struct intel_engine_cs *engine; 919 920 /* 921 * Check cancellation of requests. We expect to be able to immediately 922 * cancel active requests, even if they are currently on the GPU. 923 */ 924 925 for_each_uabi_engine(engine, i915) { 926 struct igt_live_test t; 927 int err, err2; 928 929 if (!intel_engine_has_preemption(engine)) 930 continue; 931 932 err = igt_live_test_begin(&t, i915, __func__, engine->name); 933 if (err) 934 return err; 935 936 err = __cancel_inactive(engine); 937 if (err == 0) 938 err = __cancel_active(engine); 939 if (err == 0) 940 err = __cancel_completed(engine); 941 942 err2 = igt_live_test_end(&t); 943 if (err) 944 return err; 945 if (err2) 946 return err2; 947 948 /* Expects reset so call outside of igt_live_test_* */ 949 err = __cancel_reset(i915, engine); 950 if (err) 951 return err; 952 953 if (igt_flush_test(i915)) 954 return -EIO; 955 } 956 957 return 0; 958 } 959 960 static struct i915_vma *empty_batch(struct drm_i915_private *i915) 961 { 962 struct drm_i915_gem_object *obj; 963 struct i915_vma *vma; 964 u32 *cmd; 965 int err; 966 967 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 968 if (IS_ERR(obj)) 969 return ERR_CAST(obj); 970 971 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB); 972 if (IS_ERR(cmd)) { 973 err = PTR_ERR(cmd); 974 goto err; 975 } 976 977 *cmd = MI_BATCH_BUFFER_END; 978 979 __i915_gem_object_flush_map(obj, 0, 64); 980 i915_gem_object_unpin_map(obj); 981 982 intel_gt_chipset_flush(to_gt(i915)); 983 984 vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL); 985 if (IS_ERR(vma)) { 986 err = PTR_ERR(vma); 987 goto err; 988 } 989 990 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); 991 if (err) 992 goto err; 993 994 /* Force the wait now to avoid including it in the benchmark */ 995 err = i915_vma_sync(vma); 996 if (err) 997 goto err_pin; 998 999 return vma; 1000 1001 err_pin: 1002 i915_vma_unpin(vma); 1003 err: 1004 i915_gem_object_put(obj); 1005 return ERR_PTR(err); 1006 } 1007 1008 static struct i915_request * 1009 empty_request(struct intel_engine_cs *engine, 1010 struct i915_vma *batch) 1011 { 1012 struct i915_request *request; 1013 int err; 1014 1015 request = i915_request_create(engine->kernel_context); 1016 if (IS_ERR(request)) 1017 return request; 1018 1019 err = engine->emit_bb_start(request, 1020 batch->node.start, 1021 batch->node.size, 1022 I915_DISPATCH_SECURE); 1023 if (err) 1024 goto out_request; 1025 1026 i915_request_get(request); 1027 out_request: 1028 i915_request_add(request); 1029 return err ? ERR_PTR(err) : request; 1030 } 1031 1032 static int live_empty_request(void *arg) 1033 { 1034 struct drm_i915_private *i915 = arg; 1035 struct intel_engine_cs *engine; 1036 struct igt_live_test t; 1037 struct i915_vma *batch; 1038 int err = 0; 1039 1040 /* 1041 * Submit various sized batches of empty requests, to each engine 1042 * (individually), and wait for the batch to complete. We can check 1043 * the overhead of submitting requests to the hardware. 1044 */ 1045 1046 batch = empty_batch(i915); 1047 if (IS_ERR(batch)) 1048 return PTR_ERR(batch); 1049 1050 for_each_uabi_engine(engine, i915) { 1051 IGT_TIMEOUT(end_time); 1052 struct i915_request *request; 1053 unsigned long n, prime; 1054 ktime_t times[2] = {}; 1055 1056 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1057 if (err) 1058 goto out_batch; 1059 1060 intel_engine_pm_get(engine); 1061 1062 /* Warmup / preload */ 1063 request = empty_request(engine, batch); 1064 if (IS_ERR(request)) { 1065 err = PTR_ERR(request); 1066 intel_engine_pm_put(engine); 1067 goto out_batch; 1068 } 1069 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1070 1071 for_each_prime_number_from(prime, 1, 8192) { 1072 times[1] = ktime_get_raw(); 1073 1074 for (n = 0; n < prime; n++) { 1075 i915_request_put(request); 1076 request = empty_request(engine, batch); 1077 if (IS_ERR(request)) { 1078 err = PTR_ERR(request); 1079 intel_engine_pm_put(engine); 1080 goto out_batch; 1081 } 1082 } 1083 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1084 1085 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1086 if (prime == 1) 1087 times[0] = times[1]; 1088 1089 if (__igt_timeout(end_time, NULL)) 1090 break; 1091 } 1092 i915_request_put(request); 1093 intel_engine_pm_put(engine); 1094 1095 err = igt_live_test_end(&t); 1096 if (err) 1097 goto out_batch; 1098 1099 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1100 engine->name, 1101 ktime_to_ns(times[0]), 1102 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1103 } 1104 1105 out_batch: 1106 i915_vma_unpin(batch); 1107 i915_vma_put(batch); 1108 return err; 1109 } 1110 1111 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 1112 { 1113 struct drm_i915_gem_object *obj; 1114 const int ver = GRAPHICS_VER(i915); 1115 struct i915_vma *vma; 1116 u32 *cmd; 1117 int err; 1118 1119 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 1120 if (IS_ERR(obj)) 1121 return ERR_CAST(obj); 1122 1123 vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL); 1124 if (IS_ERR(vma)) { 1125 err = PTR_ERR(vma); 1126 goto err; 1127 } 1128 1129 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1130 if (err) 1131 goto err; 1132 1133 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1134 if (IS_ERR(cmd)) { 1135 err = PTR_ERR(cmd); 1136 goto err; 1137 } 1138 1139 if (ver >= 8) { 1140 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1141 *cmd++ = lower_32_bits(vma->node.start); 1142 *cmd++ = upper_32_bits(vma->node.start); 1143 } else if (ver >= 6) { 1144 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1145 *cmd++ = lower_32_bits(vma->node.start); 1146 } else { 1147 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1148 *cmd++ = lower_32_bits(vma->node.start); 1149 } 1150 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1151 1152 __i915_gem_object_flush_map(obj, 0, 64); 1153 i915_gem_object_unpin_map(obj); 1154 1155 intel_gt_chipset_flush(to_gt(i915)); 1156 1157 return vma; 1158 1159 err: 1160 i915_gem_object_put(obj); 1161 return ERR_PTR(err); 1162 } 1163 1164 static int recursive_batch_resolve(struct i915_vma *batch) 1165 { 1166 u32 *cmd; 1167 1168 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1169 if (IS_ERR(cmd)) 1170 return PTR_ERR(cmd); 1171 1172 *cmd = MI_BATCH_BUFFER_END; 1173 1174 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1175 i915_gem_object_unpin_map(batch->obj); 1176 1177 intel_gt_chipset_flush(batch->vm->gt); 1178 1179 return 0; 1180 } 1181 1182 static int live_all_engines(void *arg) 1183 { 1184 struct drm_i915_private *i915 = arg; 1185 const unsigned int nengines = num_uabi_engines(i915); 1186 struct intel_engine_cs *engine; 1187 struct i915_request **request; 1188 struct igt_live_test t; 1189 struct i915_vma *batch; 1190 unsigned int idx; 1191 int err; 1192 1193 /* 1194 * Check we can submit requests to all engines simultaneously. We 1195 * send a recursive batch to each engine - checking that we don't 1196 * block doing so, and that they don't complete too soon. 1197 */ 1198 1199 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1200 if (!request) 1201 return -ENOMEM; 1202 1203 err = igt_live_test_begin(&t, i915, __func__, ""); 1204 if (err) 1205 goto out_free; 1206 1207 batch = recursive_batch(i915); 1208 if (IS_ERR(batch)) { 1209 err = PTR_ERR(batch); 1210 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 1211 goto out_free; 1212 } 1213 1214 i915_vma_lock(batch); 1215 1216 idx = 0; 1217 for_each_uabi_engine(engine, i915) { 1218 request[idx] = intel_engine_create_kernel_request(engine); 1219 if (IS_ERR(request[idx])) { 1220 err = PTR_ERR(request[idx]); 1221 pr_err("%s: Request allocation failed with err=%d\n", 1222 __func__, err); 1223 goto out_request; 1224 } 1225 1226 err = i915_request_await_object(request[idx], batch->obj, 0); 1227 if (err == 0) 1228 err = i915_vma_move_to_active(batch, request[idx], 0); 1229 GEM_BUG_ON(err); 1230 1231 err = engine->emit_bb_start(request[idx], 1232 batch->node.start, 1233 batch->node.size, 1234 0); 1235 GEM_BUG_ON(err); 1236 request[idx]->batch = batch; 1237 1238 i915_request_get(request[idx]); 1239 i915_request_add(request[idx]); 1240 idx++; 1241 } 1242 1243 i915_vma_unlock(batch); 1244 1245 idx = 0; 1246 for_each_uabi_engine(engine, i915) { 1247 if (i915_request_completed(request[idx])) { 1248 pr_err("%s(%s): request completed too early!\n", 1249 __func__, engine->name); 1250 err = -EINVAL; 1251 goto out_request; 1252 } 1253 idx++; 1254 } 1255 1256 err = recursive_batch_resolve(batch); 1257 if (err) { 1258 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 1259 goto out_request; 1260 } 1261 1262 idx = 0; 1263 for_each_uabi_engine(engine, i915) { 1264 long timeout; 1265 1266 timeout = i915_request_wait(request[idx], 0, 1267 MAX_SCHEDULE_TIMEOUT); 1268 if (timeout < 0) { 1269 err = timeout; 1270 pr_err("%s: error waiting for request on %s, err=%d\n", 1271 __func__, engine->name, err); 1272 goto out_request; 1273 } 1274 1275 GEM_BUG_ON(!i915_request_completed(request[idx])); 1276 i915_request_put(request[idx]); 1277 request[idx] = NULL; 1278 idx++; 1279 } 1280 1281 err = igt_live_test_end(&t); 1282 1283 out_request: 1284 idx = 0; 1285 for_each_uabi_engine(engine, i915) { 1286 if (request[idx]) 1287 i915_request_put(request[idx]); 1288 idx++; 1289 } 1290 i915_vma_unpin(batch); 1291 i915_vma_put(batch); 1292 out_free: 1293 kfree(request); 1294 return err; 1295 } 1296 1297 static int live_sequential_engines(void *arg) 1298 { 1299 struct drm_i915_private *i915 = arg; 1300 const unsigned int nengines = num_uabi_engines(i915); 1301 struct i915_request **request; 1302 struct i915_request *prev = NULL; 1303 struct intel_engine_cs *engine; 1304 struct igt_live_test t; 1305 unsigned int idx; 1306 int err; 1307 1308 /* 1309 * Check we can submit requests to all engines sequentially, such 1310 * that each successive request waits for the earlier ones. This 1311 * tests that we don't execute requests out of order, even though 1312 * they are running on independent engines. 1313 */ 1314 1315 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1316 if (!request) 1317 return -ENOMEM; 1318 1319 err = igt_live_test_begin(&t, i915, __func__, ""); 1320 if (err) 1321 goto out_free; 1322 1323 idx = 0; 1324 for_each_uabi_engine(engine, i915) { 1325 struct i915_vma *batch; 1326 1327 batch = recursive_batch(i915); 1328 if (IS_ERR(batch)) { 1329 err = PTR_ERR(batch); 1330 pr_err("%s: Unable to create batch for %s, err=%d\n", 1331 __func__, engine->name, err); 1332 goto out_free; 1333 } 1334 1335 i915_vma_lock(batch); 1336 request[idx] = intel_engine_create_kernel_request(engine); 1337 if (IS_ERR(request[idx])) { 1338 err = PTR_ERR(request[idx]); 1339 pr_err("%s: Request allocation failed for %s with err=%d\n", 1340 __func__, engine->name, err); 1341 goto out_unlock; 1342 } 1343 1344 if (prev) { 1345 err = i915_request_await_dma_fence(request[idx], 1346 &prev->fence); 1347 if (err) { 1348 i915_request_add(request[idx]); 1349 pr_err("%s: Request await failed for %s with err=%d\n", 1350 __func__, engine->name, err); 1351 goto out_unlock; 1352 } 1353 } 1354 1355 err = i915_request_await_object(request[idx], 1356 batch->obj, false); 1357 if (err == 0) 1358 err = i915_vma_move_to_active(batch, request[idx], 0); 1359 GEM_BUG_ON(err); 1360 1361 err = engine->emit_bb_start(request[idx], 1362 batch->node.start, 1363 batch->node.size, 1364 0); 1365 GEM_BUG_ON(err); 1366 request[idx]->batch = batch; 1367 1368 i915_request_get(request[idx]); 1369 i915_request_add(request[idx]); 1370 1371 prev = request[idx]; 1372 idx++; 1373 1374 out_unlock: 1375 i915_vma_unlock(batch); 1376 if (err) 1377 goto out_request; 1378 } 1379 1380 idx = 0; 1381 for_each_uabi_engine(engine, i915) { 1382 long timeout; 1383 1384 if (i915_request_completed(request[idx])) { 1385 pr_err("%s(%s): request completed too early!\n", 1386 __func__, engine->name); 1387 err = -EINVAL; 1388 goto out_request; 1389 } 1390 1391 err = recursive_batch_resolve(request[idx]->batch); 1392 if (err) { 1393 pr_err("%s: failed to resolve batch, err=%d\n", 1394 __func__, err); 1395 goto out_request; 1396 } 1397 1398 timeout = i915_request_wait(request[idx], 0, 1399 MAX_SCHEDULE_TIMEOUT); 1400 if (timeout < 0) { 1401 err = timeout; 1402 pr_err("%s: error waiting for request on %s, err=%d\n", 1403 __func__, engine->name, err); 1404 goto out_request; 1405 } 1406 1407 GEM_BUG_ON(!i915_request_completed(request[idx])); 1408 idx++; 1409 } 1410 1411 err = igt_live_test_end(&t); 1412 1413 out_request: 1414 idx = 0; 1415 for_each_uabi_engine(engine, i915) { 1416 u32 *cmd; 1417 1418 if (!request[idx]) 1419 break; 1420 1421 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1422 I915_MAP_WC); 1423 if (!IS_ERR(cmd)) { 1424 *cmd = MI_BATCH_BUFFER_END; 1425 1426 __i915_gem_object_flush_map(request[idx]->batch->obj, 1427 0, sizeof(*cmd)); 1428 i915_gem_object_unpin_map(request[idx]->batch->obj); 1429 1430 intel_gt_chipset_flush(engine->gt); 1431 } 1432 1433 i915_vma_put(request[idx]->batch); 1434 i915_request_put(request[idx]); 1435 idx++; 1436 } 1437 out_free: 1438 kfree(request); 1439 return err; 1440 } 1441 1442 struct parallel_thread { 1443 struct kthread_worker *worker; 1444 struct kthread_work work; 1445 struct intel_engine_cs *engine; 1446 int result; 1447 }; 1448 1449 static void __live_parallel_engine1(struct kthread_work *work) 1450 { 1451 struct parallel_thread *thread = 1452 container_of(work, typeof(*thread), work); 1453 struct intel_engine_cs *engine = thread->engine; 1454 IGT_TIMEOUT(end_time); 1455 unsigned long count; 1456 int err = 0; 1457 1458 count = 0; 1459 intel_engine_pm_get(engine); 1460 do { 1461 struct i915_request *rq; 1462 1463 rq = i915_request_create(engine->kernel_context); 1464 if (IS_ERR(rq)) { 1465 err = PTR_ERR(rq); 1466 break; 1467 } 1468 1469 i915_request_get(rq); 1470 i915_request_add(rq); 1471 1472 err = 0; 1473 if (i915_request_wait(rq, 0, HZ) < 0) 1474 err = -ETIME; 1475 i915_request_put(rq); 1476 if (err) 1477 break; 1478 1479 count++; 1480 } while (!__igt_timeout(end_time, NULL)); 1481 intel_engine_pm_put(engine); 1482 1483 pr_info("%s: %lu request + sync\n", engine->name, count); 1484 thread->result = err; 1485 } 1486 1487 static void __live_parallel_engineN(struct kthread_work *work) 1488 { 1489 struct parallel_thread *thread = 1490 container_of(work, typeof(*thread), work); 1491 struct intel_engine_cs *engine = thread->engine; 1492 IGT_TIMEOUT(end_time); 1493 unsigned long count; 1494 int err = 0; 1495 1496 count = 0; 1497 intel_engine_pm_get(engine); 1498 do { 1499 struct i915_request *rq; 1500 1501 rq = i915_request_create(engine->kernel_context); 1502 if (IS_ERR(rq)) { 1503 err = PTR_ERR(rq); 1504 break; 1505 } 1506 1507 i915_request_add(rq); 1508 count++; 1509 } while (!__igt_timeout(end_time, NULL)); 1510 intel_engine_pm_put(engine); 1511 1512 pr_info("%s: %lu requests\n", engine->name, count); 1513 thread->result = err; 1514 } 1515 1516 static bool wake_all(struct drm_i915_private *i915) 1517 { 1518 if (atomic_dec_and_test(&i915->selftest.counter)) { 1519 wake_up_var(&i915->selftest.counter); 1520 return true; 1521 } 1522 1523 return false; 1524 } 1525 1526 static int wait_for_all(struct drm_i915_private *i915) 1527 { 1528 if (wake_all(i915)) 1529 return 0; 1530 1531 if (wait_var_event_timeout(&i915->selftest.counter, 1532 !atomic_read(&i915->selftest.counter), 1533 i915_selftest.timeout_jiffies)) 1534 return 0; 1535 1536 return -ETIME; 1537 } 1538 1539 static void __live_parallel_spin(struct kthread_work *work) 1540 { 1541 struct parallel_thread *thread = 1542 container_of(work, typeof(*thread), work); 1543 struct intel_engine_cs *engine = thread->engine; 1544 struct igt_spinner spin; 1545 struct i915_request *rq; 1546 int err = 0; 1547 1548 /* 1549 * Create a spinner running for eternity on each engine. If a second 1550 * spinner is incorrectly placed on the same engine, it will not be 1551 * able to start in time. 1552 */ 1553 1554 if (igt_spinner_init(&spin, engine->gt)) { 1555 wake_all(engine->i915); 1556 thread->result = -ENOMEM; 1557 return; 1558 } 1559 1560 intel_engine_pm_get(engine); 1561 rq = igt_spinner_create_request(&spin, 1562 engine->kernel_context, 1563 MI_NOOP); /* no preemption */ 1564 intel_engine_pm_put(engine); 1565 if (IS_ERR(rq)) { 1566 err = PTR_ERR(rq); 1567 if (err == -ENODEV) 1568 err = 0; 1569 wake_all(engine->i915); 1570 goto out_spin; 1571 } 1572 1573 i915_request_get(rq); 1574 i915_request_add(rq); 1575 if (igt_wait_for_spinner(&spin, rq)) { 1576 /* Occupy this engine for the whole test */ 1577 err = wait_for_all(engine->i915); 1578 } else { 1579 pr_err("Failed to start spinner on %s\n", engine->name); 1580 err = -EINVAL; 1581 } 1582 igt_spinner_end(&spin); 1583 1584 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1585 err = -EIO; 1586 i915_request_put(rq); 1587 1588 out_spin: 1589 igt_spinner_fini(&spin); 1590 thread->result = err; 1591 } 1592 1593 static int live_parallel_engines(void *arg) 1594 { 1595 struct drm_i915_private *i915 = arg; 1596 static void (* const func[])(struct kthread_work *) = { 1597 __live_parallel_engine1, 1598 __live_parallel_engineN, 1599 __live_parallel_spin, 1600 NULL, 1601 }; 1602 const unsigned int nengines = num_uabi_engines(i915); 1603 struct parallel_thread *threads; 1604 struct intel_engine_cs *engine; 1605 void (* const *fn)(struct kthread_work *); 1606 int err = 0; 1607 1608 /* 1609 * Check we can submit requests to all engines concurrently. This 1610 * tests that we load up the system maximally. 1611 */ 1612 1613 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL); 1614 if (!threads) 1615 return -ENOMEM; 1616 1617 for (fn = func; !err && *fn; fn++) { 1618 char name[KSYM_NAME_LEN]; 1619 struct igt_live_test t; 1620 unsigned int idx; 1621 1622 snprintf(name, sizeof(name), "%ps", *fn); 1623 err = igt_live_test_begin(&t, i915, __func__, name); 1624 if (err) 1625 break; 1626 1627 atomic_set(&i915->selftest.counter, nengines); 1628 1629 idx = 0; 1630 for_each_uabi_engine(engine, i915) { 1631 struct kthread_worker *worker; 1632 1633 worker = kthread_create_worker(0, "igt/parallel:%s", 1634 engine->name); 1635 if (IS_ERR(worker)) { 1636 err = PTR_ERR(worker); 1637 break; 1638 } 1639 1640 threads[idx].worker = worker; 1641 threads[idx].result = 0; 1642 threads[idx].engine = engine; 1643 1644 kthread_init_work(&threads[idx].work, *fn); 1645 kthread_queue_work(worker, &threads[idx].work); 1646 idx++; 1647 } 1648 1649 idx = 0; 1650 for_each_uabi_engine(engine, i915) { 1651 int status; 1652 1653 if (!threads[idx].worker) 1654 break; 1655 1656 kthread_flush_work(&threads[idx].work); 1657 status = READ_ONCE(threads[idx].result); 1658 if (status && !err) 1659 err = status; 1660 1661 kthread_destroy_worker(threads[idx++].worker); 1662 } 1663 1664 if (igt_live_test_end(&t)) 1665 err = -EIO; 1666 } 1667 1668 kfree(threads); 1669 return err; 1670 } 1671 1672 static int 1673 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1674 { 1675 struct i915_request *rq; 1676 int ret; 1677 1678 /* 1679 * Before execlists, all contexts share the same ringbuffer. With 1680 * execlists, each context/engine has a separate ringbuffer and 1681 * for the purposes of this test, inexhaustible. 1682 * 1683 * For the global ringbuffer though, we have to be very careful 1684 * that we do not wrap while preventing the execution of requests 1685 * with a unsignaled fence. 1686 */ 1687 if (HAS_EXECLISTS(ctx->i915)) 1688 return INT_MAX; 1689 1690 rq = igt_request_alloc(ctx, engine); 1691 if (IS_ERR(rq)) { 1692 ret = PTR_ERR(rq); 1693 } else { 1694 int sz; 1695 1696 ret = rq->ring->size - rq->reserved_space; 1697 i915_request_add(rq); 1698 1699 sz = rq->ring->emit - rq->head; 1700 if (sz < 0) 1701 sz += rq->ring->size; 1702 ret /= sz; 1703 ret /= 2; /* leave half spare, in case of emergency! */ 1704 } 1705 1706 return ret; 1707 } 1708 1709 static int live_breadcrumbs_smoketest(void *arg) 1710 { 1711 struct drm_i915_private *i915 = arg; 1712 const unsigned int nengines = num_uabi_engines(i915); 1713 const unsigned int ncpus = num_online_cpus(); 1714 unsigned long num_waits, num_fences; 1715 struct intel_engine_cs *engine; 1716 struct smoke_thread *threads; 1717 struct igt_live_test live; 1718 intel_wakeref_t wakeref; 1719 struct smoketest *smoke; 1720 unsigned int n, idx; 1721 struct file *file; 1722 int ret = 0; 1723 1724 /* 1725 * Smoketest our breadcrumb/signal handling for requests across multiple 1726 * threads. A very simple test to only catch the most egregious of bugs. 1727 * See __igt_breadcrumbs_smoketest(); 1728 * 1729 * On real hardware this time. 1730 */ 1731 1732 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1733 1734 file = mock_file(i915); 1735 if (IS_ERR(file)) { 1736 ret = PTR_ERR(file); 1737 goto out_rpm; 1738 } 1739 1740 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1741 if (!smoke) { 1742 ret = -ENOMEM; 1743 goto out_file; 1744 } 1745 1746 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1747 if (!threads) { 1748 ret = -ENOMEM; 1749 goto out_smoke; 1750 } 1751 1752 smoke[0].request_alloc = __live_request_alloc; 1753 smoke[0].ncontexts = 64; 1754 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1755 sizeof(*smoke[0].contexts), 1756 GFP_KERNEL); 1757 if (!smoke[0].contexts) { 1758 ret = -ENOMEM; 1759 goto out_threads; 1760 } 1761 1762 for (n = 0; n < smoke[0].ncontexts; n++) { 1763 smoke[0].contexts[n] = live_context(i915, file); 1764 if (IS_ERR(smoke[0].contexts[n])) { 1765 ret = PTR_ERR(smoke[0].contexts[n]); 1766 goto out_contexts; 1767 } 1768 } 1769 1770 ret = igt_live_test_begin(&live, i915, __func__, ""); 1771 if (ret) 1772 goto out_contexts; 1773 1774 idx = 0; 1775 for_each_uabi_engine(engine, i915) { 1776 smoke[idx] = smoke[0]; 1777 smoke[idx].engine = engine; 1778 smoke[idx].max_batch = 1779 max_batches(smoke[0].contexts[0], engine); 1780 if (smoke[idx].max_batch < 0) { 1781 ret = smoke[idx].max_batch; 1782 goto out_flush; 1783 } 1784 /* One ring interleaved between requests from all cpus */ 1785 smoke[idx].max_batch /= num_online_cpus() + 1; 1786 pr_debug("Limiting batches to %d requests on %s\n", 1787 smoke[idx].max_batch, engine->name); 1788 1789 for (n = 0; n < ncpus; n++) { 1790 unsigned int i = idx * ncpus + n; 1791 struct kthread_worker *worker; 1792 1793 worker = kthread_create_worker(0, "igt/%d.%d", idx, n); 1794 if (IS_ERR(worker)) { 1795 ret = PTR_ERR(worker); 1796 goto out_flush; 1797 } 1798 1799 threads[i].worker = worker; 1800 threads[i].t = &smoke[idx]; 1801 1802 kthread_init_work(&threads[i].work, 1803 __igt_breadcrumbs_smoketest); 1804 kthread_queue_work(worker, &threads[i].work); 1805 } 1806 1807 idx++; 1808 } 1809 1810 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1811 1812 out_flush: 1813 idx = 0; 1814 num_waits = 0; 1815 num_fences = 0; 1816 for_each_uabi_engine(engine, i915) { 1817 for (n = 0; n < ncpus; n++) { 1818 unsigned int i = idx * ncpus + n; 1819 int err; 1820 1821 if (!threads[i].worker) 1822 continue; 1823 1824 WRITE_ONCE(threads[i].stop, true); 1825 kthread_flush_work(&threads[i].work); 1826 err = READ_ONCE(threads[i].result); 1827 if (err < 0 && !ret) 1828 ret = err; 1829 1830 kthread_destroy_worker(threads[i].worker); 1831 } 1832 1833 num_waits += atomic_long_read(&smoke[idx].num_waits); 1834 num_fences += atomic_long_read(&smoke[idx].num_fences); 1835 idx++; 1836 } 1837 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1838 num_waits, num_fences, idx, ncpus); 1839 1840 ret = igt_live_test_end(&live) ?: ret; 1841 out_contexts: 1842 kfree(smoke[0].contexts); 1843 out_threads: 1844 kfree(threads); 1845 out_smoke: 1846 kfree(smoke); 1847 out_file: 1848 fput(file); 1849 out_rpm: 1850 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1851 1852 return ret; 1853 } 1854 1855 int i915_request_live_selftests(struct drm_i915_private *i915) 1856 { 1857 static const struct i915_subtest tests[] = { 1858 SUBTEST(live_nop_request), 1859 SUBTEST(live_all_engines), 1860 SUBTEST(live_sequential_engines), 1861 SUBTEST(live_parallel_engines), 1862 SUBTEST(live_empty_request), 1863 SUBTEST(live_cancel_request), 1864 SUBTEST(live_breadcrumbs_smoketest), 1865 }; 1866 1867 if (intel_gt_is_wedged(to_gt(i915))) 1868 return 0; 1869 1870 return i915_live_subtests(tests, i915); 1871 } 1872 1873 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1874 { 1875 struct i915_request *rq; 1876 struct dma_fence *fence; 1877 1878 rq = intel_engine_create_kernel_request(ce->engine); 1879 if (IS_ERR(rq)) 1880 return PTR_ERR(rq); 1881 1882 fence = i915_active_fence_get(&ce->timeline->last_request); 1883 if (fence) { 1884 i915_request_await_dma_fence(rq, fence); 1885 dma_fence_put(fence); 1886 } 1887 1888 rq = i915_request_get(rq); 1889 i915_request_add(rq); 1890 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1891 err = -ETIME; 1892 i915_request_put(rq); 1893 1894 while (!err && !intel_engine_is_idle(ce->engine)) 1895 intel_engine_flush_submission(ce->engine); 1896 1897 return err; 1898 } 1899 1900 struct perf_stats { 1901 struct intel_engine_cs *engine; 1902 unsigned long count; 1903 ktime_t time; 1904 ktime_t busy; 1905 u64 runtime; 1906 }; 1907 1908 struct perf_series { 1909 struct drm_i915_private *i915; 1910 unsigned int nengines; 1911 struct intel_context *ce[]; 1912 }; 1913 1914 static int cmp_u32(const void *A, const void *B) 1915 { 1916 const u32 *a = A, *b = B; 1917 1918 return *a - *b; 1919 } 1920 1921 static u32 trifilter(u32 *a) 1922 { 1923 u64 sum; 1924 1925 #define TF_COUNT 5 1926 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1927 1928 sum = mul_u32_u32(a[2], 2); 1929 sum += a[1]; 1930 sum += a[3]; 1931 1932 GEM_BUG_ON(sum > U32_MAX); 1933 return sum; 1934 #define TF_BIAS 2 1935 } 1936 1937 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1938 { 1939 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1940 1941 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1942 } 1943 1944 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1945 { 1946 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1947 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1948 *cs++ = offset; 1949 *cs++ = 0; 1950 1951 return cs; 1952 } 1953 1954 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1955 { 1956 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1957 *cs++ = offset; 1958 *cs++ = 0; 1959 *cs++ = value; 1960 1961 return cs; 1962 } 1963 1964 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1965 { 1966 *cs++ = MI_SEMAPHORE_WAIT | 1967 MI_SEMAPHORE_GLOBAL_GTT | 1968 MI_SEMAPHORE_POLL | 1969 mode; 1970 *cs++ = value; 1971 *cs++ = offset; 1972 *cs++ = 0; 1973 1974 return cs; 1975 } 1976 1977 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1978 { 1979 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1980 } 1981 1982 static void semaphore_set(u32 *sema, u32 value) 1983 { 1984 WRITE_ONCE(*sema, value); 1985 wmb(); /* flush the update to the cache, and beyond */ 1986 } 1987 1988 static u32 *hwsp_scratch(const struct intel_context *ce) 1989 { 1990 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 1991 } 1992 1993 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 1994 { 1995 return (i915_ggtt_offset(ce->engine->status_page.vma) + 1996 offset_in_page(dw)); 1997 } 1998 1999 static int measure_semaphore_response(struct intel_context *ce) 2000 { 2001 u32 *sema = hwsp_scratch(ce); 2002 const u32 offset = hwsp_offset(ce, sema); 2003 u32 elapsed[TF_COUNT], cycles; 2004 struct i915_request *rq; 2005 u32 *cs; 2006 int err; 2007 int i; 2008 2009 /* 2010 * Measure how many cycles it takes for the HW to detect the change 2011 * in a semaphore value. 2012 * 2013 * A: read CS_TIMESTAMP from CPU 2014 * poke semaphore 2015 * B: read CS_TIMESTAMP on GPU 2016 * 2017 * Semaphore latency: B - A 2018 */ 2019 2020 semaphore_set(sema, -1); 2021 2022 rq = i915_request_create(ce); 2023 if (IS_ERR(rq)) 2024 return PTR_ERR(rq); 2025 2026 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 2027 if (IS_ERR(cs)) { 2028 i915_request_add(rq); 2029 err = PTR_ERR(cs); 2030 goto err; 2031 } 2032 2033 cs = emit_store_dw(cs, offset, 0); 2034 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2035 cs = emit_semaphore_poll_until(cs, offset, i); 2036 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2037 cs = emit_store_dw(cs, offset, 0); 2038 } 2039 2040 intel_ring_advance(rq, cs); 2041 i915_request_add(rq); 2042 2043 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2044 err = -EIO; 2045 goto err; 2046 } 2047 2048 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2049 preempt_disable(); 2050 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2051 semaphore_set(sema, i); 2052 preempt_enable(); 2053 2054 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2055 err = -EIO; 2056 goto err; 2057 } 2058 2059 elapsed[i - 1] = sema[i] - cycles; 2060 } 2061 2062 cycles = trifilter(elapsed); 2063 pr_info("%s: semaphore response %d cycles, %lluns\n", 2064 ce->engine->name, cycles >> TF_BIAS, 2065 cycles_to_ns(ce->engine, cycles)); 2066 2067 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2068 2069 err: 2070 intel_gt_set_wedged(ce->engine->gt); 2071 return err; 2072 } 2073 2074 static int measure_idle_dispatch(struct intel_context *ce) 2075 { 2076 u32 *sema = hwsp_scratch(ce); 2077 const u32 offset = hwsp_offset(ce, sema); 2078 u32 elapsed[TF_COUNT], cycles; 2079 u32 *cs; 2080 int err; 2081 int i; 2082 2083 /* 2084 * Measure how long it takes for us to submit a request while the 2085 * engine is idle, but is resting in our context. 2086 * 2087 * A: read CS_TIMESTAMP from CPU 2088 * submit request 2089 * B: read CS_TIMESTAMP on GPU 2090 * 2091 * Submission latency: B - A 2092 */ 2093 2094 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2095 struct i915_request *rq; 2096 2097 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2098 if (err) 2099 return err; 2100 2101 rq = i915_request_create(ce); 2102 if (IS_ERR(rq)) { 2103 err = PTR_ERR(rq); 2104 goto err; 2105 } 2106 2107 cs = intel_ring_begin(rq, 4); 2108 if (IS_ERR(cs)) { 2109 i915_request_add(rq); 2110 err = PTR_ERR(cs); 2111 goto err; 2112 } 2113 2114 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2115 2116 intel_ring_advance(rq, cs); 2117 2118 preempt_disable(); 2119 local_bh_disable(); 2120 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2121 i915_request_add(rq); 2122 local_bh_enable(); 2123 preempt_enable(); 2124 } 2125 2126 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2127 if (err) 2128 goto err; 2129 2130 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2131 elapsed[i] = sema[i] - elapsed[i]; 2132 2133 cycles = trifilter(elapsed); 2134 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2135 ce->engine->name, cycles >> TF_BIAS, 2136 cycles_to_ns(ce->engine, cycles)); 2137 2138 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2139 2140 err: 2141 intel_gt_set_wedged(ce->engine->gt); 2142 return err; 2143 } 2144 2145 static int measure_busy_dispatch(struct intel_context *ce) 2146 { 2147 u32 *sema = hwsp_scratch(ce); 2148 const u32 offset = hwsp_offset(ce, sema); 2149 u32 elapsed[TF_COUNT + 1], cycles; 2150 u32 *cs; 2151 int err; 2152 int i; 2153 2154 /* 2155 * Measure how long it takes for us to submit a request while the 2156 * engine is busy, polling on a semaphore in our context. With 2157 * direct submission, this will include the cost of a lite restore. 2158 * 2159 * A: read CS_TIMESTAMP from CPU 2160 * submit request 2161 * B: read CS_TIMESTAMP on GPU 2162 * 2163 * Submission latency: B - A 2164 */ 2165 2166 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2167 struct i915_request *rq; 2168 2169 rq = i915_request_create(ce); 2170 if (IS_ERR(rq)) { 2171 err = PTR_ERR(rq); 2172 goto err; 2173 } 2174 2175 cs = intel_ring_begin(rq, 12); 2176 if (IS_ERR(cs)) { 2177 i915_request_add(rq); 2178 err = PTR_ERR(cs); 2179 goto err; 2180 } 2181 2182 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2183 cs = emit_semaphore_poll_until(cs, offset, i); 2184 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2185 2186 intel_ring_advance(rq, cs); 2187 2188 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2189 err = -EIO; 2190 goto err; 2191 } 2192 2193 preempt_disable(); 2194 local_bh_disable(); 2195 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2196 i915_request_add(rq); 2197 local_bh_enable(); 2198 semaphore_set(sema, i - 1); 2199 preempt_enable(); 2200 } 2201 2202 wait_for(READ_ONCE(sema[i - 1]), 500); 2203 semaphore_set(sema, i - 1); 2204 2205 for (i = 1; i <= TF_COUNT; i++) { 2206 GEM_BUG_ON(sema[i] == -1); 2207 elapsed[i - 1] = sema[i] - elapsed[i]; 2208 } 2209 2210 cycles = trifilter(elapsed); 2211 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2212 ce->engine->name, cycles >> TF_BIAS, 2213 cycles_to_ns(ce->engine, cycles)); 2214 2215 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2216 2217 err: 2218 intel_gt_set_wedged(ce->engine->gt); 2219 return err; 2220 } 2221 2222 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2223 { 2224 const u32 offset = 2225 i915_ggtt_offset(engine->status_page.vma) + 2226 offset_in_page(sema); 2227 struct i915_request *rq; 2228 u32 *cs; 2229 2230 rq = i915_request_create(engine->kernel_context); 2231 if (IS_ERR(rq)) 2232 return PTR_ERR(rq); 2233 2234 cs = intel_ring_begin(rq, 4); 2235 if (IS_ERR(cs)) { 2236 i915_request_add(rq); 2237 return PTR_ERR(cs); 2238 } 2239 2240 cs = emit_semaphore_poll(cs, mode, value, offset); 2241 2242 intel_ring_advance(rq, cs); 2243 i915_request_add(rq); 2244 2245 return 0; 2246 } 2247 2248 static int measure_inter_request(struct intel_context *ce) 2249 { 2250 u32 *sema = hwsp_scratch(ce); 2251 const u32 offset = hwsp_offset(ce, sema); 2252 u32 elapsed[TF_COUNT + 1], cycles; 2253 struct i915_sw_fence *submit; 2254 int i, err; 2255 2256 /* 2257 * Measure how long it takes to advance from one request into the 2258 * next. Between each request we flush the GPU caches to memory, 2259 * update the breadcrumbs, and then invalidate those caches. 2260 * We queue up all the requests to be submitted in one batch so 2261 * it should be one set of contiguous measurements. 2262 * 2263 * A: read CS_TIMESTAMP on GPU 2264 * advance request 2265 * B: read CS_TIMESTAMP on GPU 2266 * 2267 * Request latency: B - A 2268 */ 2269 2270 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2271 if (err) 2272 return err; 2273 2274 submit = heap_fence_create(GFP_KERNEL); 2275 if (!submit) { 2276 semaphore_set(sema, 1); 2277 return -ENOMEM; 2278 } 2279 2280 intel_engine_flush_submission(ce->engine); 2281 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2282 struct i915_request *rq; 2283 u32 *cs; 2284 2285 rq = i915_request_create(ce); 2286 if (IS_ERR(rq)) { 2287 err = PTR_ERR(rq); 2288 goto err_submit; 2289 } 2290 2291 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 2292 submit, 2293 GFP_KERNEL); 2294 if (err < 0) { 2295 i915_request_add(rq); 2296 goto err_submit; 2297 } 2298 2299 cs = intel_ring_begin(rq, 4); 2300 if (IS_ERR(cs)) { 2301 i915_request_add(rq); 2302 err = PTR_ERR(cs); 2303 goto err_submit; 2304 } 2305 2306 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2307 2308 intel_ring_advance(rq, cs); 2309 i915_request_add(rq); 2310 } 2311 i915_sw_fence_commit(submit); 2312 intel_engine_flush_submission(ce->engine); 2313 heap_fence_put(submit); 2314 2315 semaphore_set(sema, 1); 2316 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2317 if (err) 2318 goto err; 2319 2320 for (i = 1; i <= TF_COUNT; i++) 2321 elapsed[i - 1] = sema[i + 1] - sema[i]; 2322 2323 cycles = trifilter(elapsed); 2324 pr_info("%s: inter-request latency %d cycles, %lluns\n", 2325 ce->engine->name, cycles >> TF_BIAS, 2326 cycles_to_ns(ce->engine, cycles)); 2327 2328 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2329 2330 err_submit: 2331 i915_sw_fence_commit(submit); 2332 heap_fence_put(submit); 2333 semaphore_set(sema, 1); 2334 err: 2335 intel_gt_set_wedged(ce->engine->gt); 2336 return err; 2337 } 2338 2339 static int measure_context_switch(struct intel_context *ce) 2340 { 2341 u32 *sema = hwsp_scratch(ce); 2342 const u32 offset = hwsp_offset(ce, sema); 2343 struct i915_request *fence = NULL; 2344 u32 elapsed[TF_COUNT + 1], cycles; 2345 int i, j, err; 2346 u32 *cs; 2347 2348 /* 2349 * Measure how long it takes to advance from one request in one 2350 * context to a request in another context. This allows us to 2351 * measure how long the context save/restore take, along with all 2352 * the inter-context setup we require. 2353 * 2354 * A: read CS_TIMESTAMP on GPU 2355 * switch context 2356 * B: read CS_TIMESTAMP on GPU 2357 * 2358 * Context switch latency: B - A 2359 */ 2360 2361 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2362 if (err) 2363 return err; 2364 2365 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2366 struct intel_context *arr[] = { 2367 ce, ce->engine->kernel_context 2368 }; 2369 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32); 2370 2371 for (j = 0; j < ARRAY_SIZE(arr); j++) { 2372 struct i915_request *rq; 2373 2374 rq = i915_request_create(arr[j]); 2375 if (IS_ERR(rq)) { 2376 err = PTR_ERR(rq); 2377 goto err_fence; 2378 } 2379 2380 if (fence) { 2381 err = i915_request_await_dma_fence(rq, 2382 &fence->fence); 2383 if (err) { 2384 i915_request_add(rq); 2385 goto err_fence; 2386 } 2387 } 2388 2389 cs = intel_ring_begin(rq, 4); 2390 if (IS_ERR(cs)) { 2391 i915_request_add(rq); 2392 err = PTR_ERR(cs); 2393 goto err_fence; 2394 } 2395 2396 cs = emit_timestamp_store(cs, ce, addr); 2397 addr += sizeof(u32); 2398 2399 intel_ring_advance(rq, cs); 2400 2401 i915_request_put(fence); 2402 fence = i915_request_get(rq); 2403 2404 i915_request_add(rq); 2405 } 2406 } 2407 i915_request_put(fence); 2408 intel_engine_flush_submission(ce->engine); 2409 2410 semaphore_set(sema, 1); 2411 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2412 if (err) 2413 goto err; 2414 2415 for (i = 1; i <= TF_COUNT; i++) 2416 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1]; 2417 2418 cycles = trifilter(elapsed); 2419 pr_info("%s: context switch latency %d cycles, %lluns\n", 2420 ce->engine->name, cycles >> TF_BIAS, 2421 cycles_to_ns(ce->engine, cycles)); 2422 2423 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2424 2425 err_fence: 2426 i915_request_put(fence); 2427 semaphore_set(sema, 1); 2428 err: 2429 intel_gt_set_wedged(ce->engine->gt); 2430 return err; 2431 } 2432 2433 static int measure_preemption(struct intel_context *ce) 2434 { 2435 u32 *sema = hwsp_scratch(ce); 2436 const u32 offset = hwsp_offset(ce, sema); 2437 u32 elapsed[TF_COUNT], cycles; 2438 u32 *cs; 2439 int err; 2440 int i; 2441 2442 /* 2443 * We measure two latencies while triggering preemption. The first 2444 * latency is how long it takes for us to submit a preempting request. 2445 * The second latency is how it takes for us to return from the 2446 * preemption back to the original context. 2447 * 2448 * A: read CS_TIMESTAMP from CPU 2449 * submit preemption 2450 * B: read CS_TIMESTAMP on GPU (in preempting context) 2451 * context switch 2452 * C: read CS_TIMESTAMP on GPU (in original context) 2453 * 2454 * Preemption dispatch latency: B - A 2455 * Preemption switch latency: C - B 2456 */ 2457 2458 if (!intel_engine_has_preemption(ce->engine)) 2459 return 0; 2460 2461 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2462 u32 addr = offset + 2 * i * sizeof(u32); 2463 struct i915_request *rq; 2464 2465 rq = i915_request_create(ce); 2466 if (IS_ERR(rq)) { 2467 err = PTR_ERR(rq); 2468 goto err; 2469 } 2470 2471 cs = intel_ring_begin(rq, 12); 2472 if (IS_ERR(cs)) { 2473 i915_request_add(rq); 2474 err = PTR_ERR(cs); 2475 goto err; 2476 } 2477 2478 cs = emit_store_dw(cs, addr, -1); 2479 cs = emit_semaphore_poll_until(cs, offset, i); 2480 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2481 2482 intel_ring_advance(rq, cs); 2483 i915_request_add(rq); 2484 2485 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2486 err = -EIO; 2487 goto err; 2488 } 2489 2490 rq = i915_request_create(ce->engine->kernel_context); 2491 if (IS_ERR(rq)) { 2492 err = PTR_ERR(rq); 2493 goto err; 2494 } 2495 2496 cs = intel_ring_begin(rq, 8); 2497 if (IS_ERR(cs)) { 2498 i915_request_add(rq); 2499 err = PTR_ERR(cs); 2500 goto err; 2501 } 2502 2503 cs = emit_timestamp_store(cs, ce, addr); 2504 cs = emit_store_dw(cs, offset, i); 2505 2506 intel_ring_advance(rq, cs); 2507 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2508 2509 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2510 i915_request_add(rq); 2511 } 2512 2513 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2514 err = -EIO; 2515 goto err; 2516 } 2517 2518 for (i = 1; i <= TF_COUNT; i++) 2519 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2520 2521 cycles = trifilter(elapsed); 2522 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2523 ce->engine->name, cycles >> TF_BIAS, 2524 cycles_to_ns(ce->engine, cycles)); 2525 2526 for (i = 1; i <= TF_COUNT; i++) 2527 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2528 2529 cycles = trifilter(elapsed); 2530 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2531 ce->engine->name, cycles >> TF_BIAS, 2532 cycles_to_ns(ce->engine, cycles)); 2533 2534 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2535 2536 err: 2537 intel_gt_set_wedged(ce->engine->gt); 2538 return err; 2539 } 2540 2541 struct signal_cb { 2542 struct dma_fence_cb base; 2543 bool seen; 2544 }; 2545 2546 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2547 { 2548 struct signal_cb *s = container_of(cb, typeof(*s), base); 2549 2550 smp_store_mb(s->seen, true); /* be safe, be strong */ 2551 } 2552 2553 static int measure_completion(struct intel_context *ce) 2554 { 2555 u32 *sema = hwsp_scratch(ce); 2556 const u32 offset = hwsp_offset(ce, sema); 2557 u32 elapsed[TF_COUNT], cycles; 2558 u32 *cs; 2559 int err; 2560 int i; 2561 2562 /* 2563 * Measure how long it takes for the signal (interrupt) to be 2564 * sent from the GPU to be processed by the CPU. 2565 * 2566 * A: read CS_TIMESTAMP on GPU 2567 * signal 2568 * B: read CS_TIMESTAMP from CPU 2569 * 2570 * Completion latency: B - A 2571 */ 2572 2573 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2574 struct signal_cb cb = { .seen = false }; 2575 struct i915_request *rq; 2576 2577 rq = i915_request_create(ce); 2578 if (IS_ERR(rq)) { 2579 err = PTR_ERR(rq); 2580 goto err; 2581 } 2582 2583 cs = intel_ring_begin(rq, 12); 2584 if (IS_ERR(cs)) { 2585 i915_request_add(rq); 2586 err = PTR_ERR(cs); 2587 goto err; 2588 } 2589 2590 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2591 cs = emit_semaphore_poll_until(cs, offset, i); 2592 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2593 2594 intel_ring_advance(rq, cs); 2595 2596 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); 2597 i915_request_add(rq); 2598 2599 intel_engine_flush_submission(ce->engine); 2600 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { 2601 err = -EIO; 2602 goto err; 2603 } 2604 2605 preempt_disable(); 2606 semaphore_set(sema, i); 2607 while (!READ_ONCE(cb.seen)) 2608 cpu_relax(); 2609 2610 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2611 preempt_enable(); 2612 } 2613 2614 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2615 if (err) 2616 goto err; 2617 2618 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2619 GEM_BUG_ON(sema[i + 1] == -1); 2620 elapsed[i] = elapsed[i] - sema[i + 1]; 2621 } 2622 2623 cycles = trifilter(elapsed); 2624 pr_info("%s: completion latency %d cycles, %lluns\n", 2625 ce->engine->name, cycles >> TF_BIAS, 2626 cycles_to_ns(ce->engine, cycles)); 2627 2628 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2629 2630 err: 2631 intel_gt_set_wedged(ce->engine->gt); 2632 return err; 2633 } 2634 2635 static void rps_pin(struct intel_gt *gt) 2636 { 2637 /* Pin the frequency to max */ 2638 atomic_inc(>->rps.num_waiters); 2639 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 2640 2641 mutex_lock(>->rps.lock); 2642 intel_rps_set(>->rps, gt->rps.max_freq); 2643 mutex_unlock(>->rps.lock); 2644 } 2645 2646 static void rps_unpin(struct intel_gt *gt) 2647 { 2648 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 2649 atomic_dec(>->rps.num_waiters); 2650 } 2651 2652 static int perf_request_latency(void *arg) 2653 { 2654 struct drm_i915_private *i915 = arg; 2655 struct intel_engine_cs *engine; 2656 struct pm_qos_request qos; 2657 int err = 0; 2658 2659 if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */ 2660 return 0; 2661 2662 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2663 2664 for_each_uabi_engine(engine, i915) { 2665 struct intel_context *ce; 2666 2667 ce = intel_context_create(engine); 2668 if (IS_ERR(ce)) { 2669 err = PTR_ERR(ce); 2670 goto out; 2671 } 2672 2673 err = intel_context_pin(ce); 2674 if (err) { 2675 intel_context_put(ce); 2676 goto out; 2677 } 2678 2679 st_engine_heartbeat_disable(engine); 2680 rps_pin(engine->gt); 2681 2682 if (err == 0) 2683 err = measure_semaphore_response(ce); 2684 if (err == 0) 2685 err = measure_idle_dispatch(ce); 2686 if (err == 0) 2687 err = measure_busy_dispatch(ce); 2688 if (err == 0) 2689 err = measure_inter_request(ce); 2690 if (err == 0) 2691 err = measure_context_switch(ce); 2692 if (err == 0) 2693 err = measure_preemption(ce); 2694 if (err == 0) 2695 err = measure_completion(ce); 2696 2697 rps_unpin(engine->gt); 2698 st_engine_heartbeat_enable(engine); 2699 2700 intel_context_unpin(ce); 2701 intel_context_put(ce); 2702 if (err) 2703 goto out; 2704 } 2705 2706 out: 2707 if (igt_flush_test(i915)) 2708 err = -EIO; 2709 2710 cpu_latency_qos_remove_request(&qos); 2711 return err; 2712 } 2713 2714 static int s_sync0(void *arg) 2715 { 2716 struct perf_series *ps = arg; 2717 IGT_TIMEOUT(end_time); 2718 unsigned int idx = 0; 2719 int err = 0; 2720 2721 GEM_BUG_ON(!ps->nengines); 2722 do { 2723 struct i915_request *rq; 2724 2725 rq = i915_request_create(ps->ce[idx]); 2726 if (IS_ERR(rq)) { 2727 err = PTR_ERR(rq); 2728 break; 2729 } 2730 2731 i915_request_get(rq); 2732 i915_request_add(rq); 2733 2734 if (i915_request_wait(rq, 0, HZ / 5) < 0) 2735 err = -ETIME; 2736 i915_request_put(rq); 2737 if (err) 2738 break; 2739 2740 if (++idx == ps->nengines) 2741 idx = 0; 2742 } while (!__igt_timeout(end_time, NULL)); 2743 2744 return err; 2745 } 2746 2747 static int s_sync1(void *arg) 2748 { 2749 struct perf_series *ps = arg; 2750 struct i915_request *prev = NULL; 2751 IGT_TIMEOUT(end_time); 2752 unsigned int idx = 0; 2753 int err = 0; 2754 2755 GEM_BUG_ON(!ps->nengines); 2756 do { 2757 struct i915_request *rq; 2758 2759 rq = i915_request_create(ps->ce[idx]); 2760 if (IS_ERR(rq)) { 2761 err = PTR_ERR(rq); 2762 break; 2763 } 2764 2765 i915_request_get(rq); 2766 i915_request_add(rq); 2767 2768 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0) 2769 err = -ETIME; 2770 i915_request_put(prev); 2771 prev = rq; 2772 if (err) 2773 break; 2774 2775 if (++idx == ps->nengines) 2776 idx = 0; 2777 } while (!__igt_timeout(end_time, NULL)); 2778 i915_request_put(prev); 2779 2780 return err; 2781 } 2782 2783 static int s_many(void *arg) 2784 { 2785 struct perf_series *ps = arg; 2786 IGT_TIMEOUT(end_time); 2787 unsigned int idx = 0; 2788 2789 GEM_BUG_ON(!ps->nengines); 2790 do { 2791 struct i915_request *rq; 2792 2793 rq = i915_request_create(ps->ce[idx]); 2794 if (IS_ERR(rq)) 2795 return PTR_ERR(rq); 2796 2797 i915_request_add(rq); 2798 2799 if (++idx == ps->nengines) 2800 idx = 0; 2801 } while (!__igt_timeout(end_time, NULL)); 2802 2803 return 0; 2804 } 2805 2806 static int perf_series_engines(void *arg) 2807 { 2808 struct drm_i915_private *i915 = arg; 2809 static int (* const func[])(void *arg) = { 2810 s_sync0, 2811 s_sync1, 2812 s_many, 2813 NULL, 2814 }; 2815 const unsigned int nengines = num_uabi_engines(i915); 2816 struct intel_engine_cs *engine; 2817 int (* const *fn)(void *arg); 2818 struct pm_qos_request qos; 2819 struct perf_stats *stats; 2820 struct perf_series *ps; 2821 unsigned int idx; 2822 int err = 0; 2823 2824 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL); 2825 if (!stats) 2826 return -ENOMEM; 2827 2828 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL); 2829 if (!ps) { 2830 kfree(stats); 2831 return -ENOMEM; 2832 } 2833 2834 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2835 2836 ps->i915 = i915; 2837 ps->nengines = nengines; 2838 2839 idx = 0; 2840 for_each_uabi_engine(engine, i915) { 2841 struct intel_context *ce; 2842 2843 ce = intel_context_create(engine); 2844 if (IS_ERR(ce)) { 2845 err = PTR_ERR(ce); 2846 goto out; 2847 } 2848 2849 err = intel_context_pin(ce); 2850 if (err) { 2851 intel_context_put(ce); 2852 goto out; 2853 } 2854 2855 ps->ce[idx++] = ce; 2856 } 2857 GEM_BUG_ON(idx != ps->nengines); 2858 2859 for (fn = func; *fn && !err; fn++) { 2860 char name[KSYM_NAME_LEN]; 2861 struct igt_live_test t; 2862 2863 snprintf(name, sizeof(name), "%ps", *fn); 2864 err = igt_live_test_begin(&t, i915, __func__, name); 2865 if (err) 2866 break; 2867 2868 for (idx = 0; idx < nengines; idx++) { 2869 struct perf_stats *p = 2870 memset(&stats[idx], 0, sizeof(stats[idx])); 2871 struct intel_context *ce = ps->ce[idx]; 2872 2873 p->engine = ps->ce[idx]->engine; 2874 intel_engine_pm_get(p->engine); 2875 2876 if (intel_engine_supports_stats(p->engine)) 2877 p->busy = intel_engine_get_busy_time(p->engine, 2878 &p->time) + 1; 2879 else 2880 p->time = ktime_get(); 2881 p->runtime = -intel_context_get_total_runtime_ns(ce); 2882 } 2883 2884 err = (*fn)(ps); 2885 if (igt_live_test_end(&t)) 2886 err = -EIO; 2887 2888 for (idx = 0; idx < nengines; idx++) { 2889 struct perf_stats *p = &stats[idx]; 2890 struct intel_context *ce = ps->ce[idx]; 2891 int integer, decimal; 2892 u64 busy, dt, now; 2893 2894 if (p->busy) 2895 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine, 2896 &now), 2897 p->busy - 1); 2898 else 2899 now = ktime_get(); 2900 p->time = ktime_sub(now, p->time); 2901 2902 err = switch_to_kernel_sync(ce, err); 2903 p->runtime += intel_context_get_total_runtime_ns(ce); 2904 intel_engine_pm_put(p->engine); 2905 2906 busy = 100 * ktime_to_ns(p->busy); 2907 dt = ktime_to_ns(p->time); 2908 if (dt) { 2909 integer = div64_u64(busy, dt); 2910 busy -= integer * dt; 2911 decimal = div64_u64(100 * busy, dt); 2912 } else { 2913 integer = 0; 2914 decimal = 0; 2915 } 2916 2917 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 2918 name, p->engine->name, ce->timeline->seqno, 2919 integer, decimal, 2920 div_u64(p->runtime, 1000 * 1000), 2921 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 2922 } 2923 } 2924 2925 out: 2926 for (idx = 0; idx < nengines; idx++) { 2927 if (IS_ERR_OR_NULL(ps->ce[idx])) 2928 break; 2929 2930 intel_context_unpin(ps->ce[idx]); 2931 intel_context_put(ps->ce[idx]); 2932 } 2933 kfree(ps); 2934 2935 cpu_latency_qos_remove_request(&qos); 2936 kfree(stats); 2937 return err; 2938 } 2939 2940 struct p_thread { 2941 struct perf_stats p; 2942 struct kthread_worker *worker; 2943 struct kthread_work work; 2944 struct intel_engine_cs *engine; 2945 int result; 2946 }; 2947 2948 static void p_sync0(struct kthread_work *work) 2949 { 2950 struct p_thread *thread = container_of(work, typeof(*thread), work); 2951 struct perf_stats *p = &thread->p; 2952 struct intel_engine_cs *engine = p->engine; 2953 struct intel_context *ce; 2954 IGT_TIMEOUT(end_time); 2955 unsigned long count; 2956 bool busy; 2957 int err = 0; 2958 2959 ce = intel_context_create(engine); 2960 if (IS_ERR(ce)) { 2961 thread->result = PTR_ERR(ce); 2962 return; 2963 } 2964 2965 err = intel_context_pin(ce); 2966 if (err) { 2967 intel_context_put(ce); 2968 thread->result = err; 2969 return; 2970 } 2971 2972 if (intel_engine_supports_stats(engine)) { 2973 p->busy = intel_engine_get_busy_time(engine, &p->time); 2974 busy = true; 2975 } else { 2976 p->time = ktime_get(); 2977 busy = false; 2978 } 2979 2980 count = 0; 2981 do { 2982 struct i915_request *rq; 2983 2984 rq = i915_request_create(ce); 2985 if (IS_ERR(rq)) { 2986 err = PTR_ERR(rq); 2987 break; 2988 } 2989 2990 i915_request_get(rq); 2991 i915_request_add(rq); 2992 2993 err = 0; 2994 if (i915_request_wait(rq, 0, HZ) < 0) 2995 err = -ETIME; 2996 i915_request_put(rq); 2997 if (err) 2998 break; 2999 3000 count++; 3001 } while (!__igt_timeout(end_time, NULL)); 3002 3003 if (busy) { 3004 ktime_t now; 3005 3006 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3007 p->busy); 3008 p->time = ktime_sub(now, p->time); 3009 } else { 3010 p->time = ktime_sub(ktime_get(), p->time); 3011 } 3012 3013 err = switch_to_kernel_sync(ce, err); 3014 p->runtime = intel_context_get_total_runtime_ns(ce); 3015 p->count = count; 3016 3017 intel_context_unpin(ce); 3018 intel_context_put(ce); 3019 thread->result = err; 3020 } 3021 3022 static void p_sync1(struct kthread_work *work) 3023 { 3024 struct p_thread *thread = container_of(work, typeof(*thread), work); 3025 struct perf_stats *p = &thread->p; 3026 struct intel_engine_cs *engine = p->engine; 3027 struct i915_request *prev = NULL; 3028 struct intel_context *ce; 3029 IGT_TIMEOUT(end_time); 3030 unsigned long count; 3031 bool busy; 3032 int err = 0; 3033 3034 ce = intel_context_create(engine); 3035 if (IS_ERR(ce)) { 3036 thread->result = PTR_ERR(ce); 3037 return; 3038 } 3039 3040 err = intel_context_pin(ce); 3041 if (err) { 3042 intel_context_put(ce); 3043 thread->result = err; 3044 return; 3045 } 3046 3047 if (intel_engine_supports_stats(engine)) { 3048 p->busy = intel_engine_get_busy_time(engine, &p->time); 3049 busy = true; 3050 } else { 3051 p->time = ktime_get(); 3052 busy = false; 3053 } 3054 3055 count = 0; 3056 do { 3057 struct i915_request *rq; 3058 3059 rq = i915_request_create(ce); 3060 if (IS_ERR(rq)) { 3061 err = PTR_ERR(rq); 3062 break; 3063 } 3064 3065 i915_request_get(rq); 3066 i915_request_add(rq); 3067 3068 err = 0; 3069 if (prev && i915_request_wait(prev, 0, HZ) < 0) 3070 err = -ETIME; 3071 i915_request_put(prev); 3072 prev = rq; 3073 if (err) 3074 break; 3075 3076 count++; 3077 } while (!__igt_timeout(end_time, NULL)); 3078 i915_request_put(prev); 3079 3080 if (busy) { 3081 ktime_t now; 3082 3083 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3084 p->busy); 3085 p->time = ktime_sub(now, p->time); 3086 } else { 3087 p->time = ktime_sub(ktime_get(), p->time); 3088 } 3089 3090 err = switch_to_kernel_sync(ce, err); 3091 p->runtime = intel_context_get_total_runtime_ns(ce); 3092 p->count = count; 3093 3094 intel_context_unpin(ce); 3095 intel_context_put(ce); 3096 thread->result = err; 3097 } 3098 3099 static void p_many(struct kthread_work *work) 3100 { 3101 struct p_thread *thread = container_of(work, typeof(*thread), work); 3102 struct perf_stats *p = &thread->p; 3103 struct intel_engine_cs *engine = p->engine; 3104 struct intel_context *ce; 3105 IGT_TIMEOUT(end_time); 3106 unsigned long count; 3107 int err = 0; 3108 bool busy; 3109 3110 ce = intel_context_create(engine); 3111 if (IS_ERR(ce)) { 3112 thread->result = PTR_ERR(ce); 3113 return; 3114 } 3115 3116 err = intel_context_pin(ce); 3117 if (err) { 3118 intel_context_put(ce); 3119 thread->result = err; 3120 return; 3121 } 3122 3123 if (intel_engine_supports_stats(engine)) { 3124 p->busy = intel_engine_get_busy_time(engine, &p->time); 3125 busy = true; 3126 } else { 3127 p->time = ktime_get(); 3128 busy = false; 3129 } 3130 3131 count = 0; 3132 do { 3133 struct i915_request *rq; 3134 3135 rq = i915_request_create(ce); 3136 if (IS_ERR(rq)) { 3137 err = PTR_ERR(rq); 3138 break; 3139 } 3140 3141 i915_request_add(rq); 3142 count++; 3143 } while (!__igt_timeout(end_time, NULL)); 3144 3145 if (busy) { 3146 ktime_t now; 3147 3148 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3149 p->busy); 3150 p->time = ktime_sub(now, p->time); 3151 } else { 3152 p->time = ktime_sub(ktime_get(), p->time); 3153 } 3154 3155 err = switch_to_kernel_sync(ce, err); 3156 p->runtime = intel_context_get_total_runtime_ns(ce); 3157 p->count = count; 3158 3159 intel_context_unpin(ce); 3160 intel_context_put(ce); 3161 thread->result = err; 3162 } 3163 3164 static int perf_parallel_engines(void *arg) 3165 { 3166 struct drm_i915_private *i915 = arg; 3167 static void (* const func[])(struct kthread_work *) = { 3168 p_sync0, 3169 p_sync1, 3170 p_many, 3171 NULL, 3172 }; 3173 const unsigned int nengines = num_uabi_engines(i915); 3174 void (* const *fn)(struct kthread_work *); 3175 struct intel_engine_cs *engine; 3176 struct pm_qos_request qos; 3177 struct p_thread *engines; 3178 int err = 0; 3179 3180 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL); 3181 if (!engines) 3182 return -ENOMEM; 3183 3184 cpu_latency_qos_add_request(&qos, 0); 3185 3186 for (fn = func; *fn; fn++) { 3187 char name[KSYM_NAME_LEN]; 3188 struct igt_live_test t; 3189 unsigned int idx; 3190 3191 snprintf(name, sizeof(name), "%ps", *fn); 3192 err = igt_live_test_begin(&t, i915, __func__, name); 3193 if (err) 3194 break; 3195 3196 atomic_set(&i915->selftest.counter, nengines); 3197 3198 idx = 0; 3199 for_each_uabi_engine(engine, i915) { 3200 struct kthread_worker *worker; 3201 3202 intel_engine_pm_get(engine); 3203 3204 memset(&engines[idx].p, 0, sizeof(engines[idx].p)); 3205 3206 worker = kthread_create_worker(0, "igt:%s", 3207 engine->name); 3208 if (IS_ERR(worker)) { 3209 err = PTR_ERR(worker); 3210 intel_engine_pm_put(engine); 3211 break; 3212 } 3213 engines[idx].worker = worker; 3214 engines[idx].result = 0; 3215 engines[idx].p.engine = engine; 3216 engines[idx].engine = engine; 3217 3218 kthread_init_work(&engines[idx].work, *fn); 3219 kthread_queue_work(worker, &engines[idx].work); 3220 idx++; 3221 } 3222 3223 idx = 0; 3224 for_each_uabi_engine(engine, i915) { 3225 int status; 3226 3227 if (!engines[idx].worker) 3228 break; 3229 3230 kthread_flush_work(&engines[idx].work); 3231 status = READ_ONCE(engines[idx].result); 3232 if (status && !err) 3233 err = status; 3234 3235 intel_engine_pm_put(engine); 3236 3237 kthread_destroy_worker(engines[idx].worker); 3238 idx++; 3239 } 3240 3241 if (igt_live_test_end(&t)) 3242 err = -EIO; 3243 if (err) 3244 break; 3245 3246 idx = 0; 3247 for_each_uabi_engine(engine, i915) { 3248 struct perf_stats *p = &engines[idx].p; 3249 u64 busy = 100 * ktime_to_ns(p->busy); 3250 u64 dt = ktime_to_ns(p->time); 3251 int integer, decimal; 3252 3253 if (dt) { 3254 integer = div64_u64(busy, dt); 3255 busy -= integer * dt; 3256 decimal = div64_u64(100 * busy, dt); 3257 } else { 3258 integer = 0; 3259 decimal = 0; 3260 } 3261 3262 GEM_BUG_ON(engine != p->engine); 3263 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 3264 name, engine->name, p->count, integer, decimal, 3265 div_u64(p->runtime, 1000 * 1000), 3266 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 3267 idx++; 3268 } 3269 } 3270 3271 cpu_latency_qos_remove_request(&qos); 3272 kfree(engines); 3273 return err; 3274 } 3275 3276 int i915_request_perf_selftests(struct drm_i915_private *i915) 3277 { 3278 static const struct i915_subtest tests[] = { 3279 SUBTEST(perf_request_latency), 3280 SUBTEST(perf_series_engines), 3281 SUBTEST(perf_parallel_engines), 3282 }; 3283 3284 if (intel_gt_is_wedged(to_gt(i915))) 3285 return 0; 3286 3287 return i915_subtests(tests, i915); 3288 } 3289