1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_internal.h" 30 #include "gem/i915_gem_pm.h" 31 #include "gem/selftests/mock_context.h" 32 33 #include "gt/intel_engine_heartbeat.h" 34 #include "gt/intel_engine_pm.h" 35 #include "gt/intel_engine_user.h" 36 #include "gt/intel_gt.h" 37 #include "gt/intel_gt_clock_utils.h" 38 #include "gt/intel_gt_requests.h" 39 #include "gt/selftest_engine_heartbeat.h" 40 41 #include "i915_random.h" 42 #include "i915_selftest.h" 43 #include "igt_flush_test.h" 44 #include "igt_live_test.h" 45 #include "igt_spinner.h" 46 #include "lib_sw_fence.h" 47 48 #include "mock_drm.h" 49 #include "mock_gem_device.h" 50 51 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 52 { 53 struct intel_engine_cs *engine; 54 unsigned int count; 55 56 count = 0; 57 for_each_uabi_engine(engine, i915) 58 count++; 59 60 return count; 61 } 62 63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 64 { 65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 66 } 67 68 static int igt_add_request(void *arg) 69 { 70 struct drm_i915_private *i915 = arg; 71 struct i915_request *request; 72 73 /* Basic preliminary test to create a request and let it loose! 
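 * The request is allocated from the mock rcs0 kernel context with a short
 * artificial delay before completion; the test only checks that allocation
 * and submission succeed, without ever waiting on the request itself.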
*/ 74 75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 76 if (!request) 77 return -ENOMEM; 78 79 i915_request_add(request); 80 81 return 0; 82 } 83 84 static int igt_wait_request(void *arg) 85 { 86 const long T = HZ / 4; 87 struct drm_i915_private *i915 = arg; 88 struct i915_request *request; 89 int err = -EINVAL; 90 91 /* Submit a request, then wait upon it */ 92 93 request = mock_request(rcs0(i915)->kernel_context, T); 94 if (!request) 95 return -ENOMEM; 96 97 i915_request_get(request); 98 99 if (i915_request_wait(request, 0, 0) != -ETIME) { 100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 101 goto out_request; 102 } 103 104 if (i915_request_wait(request, 0, T) != -ETIME) { 105 pr_err("request wait succeeded (expected timeout before submit!)\n"); 106 goto out_request; 107 } 108 109 if (i915_request_completed(request)) { 110 pr_err("request completed before submit!!\n"); 111 goto out_request; 112 } 113 114 i915_request_add(request); 115 116 if (i915_request_wait(request, 0, 0) != -ETIME) { 117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 118 goto out_request; 119 } 120 121 if (i915_request_completed(request)) { 122 pr_err("request completed immediately!\n"); 123 goto out_request; 124 } 125 126 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 127 pr_err("request wait succeeded (expected timeout!)\n"); 128 goto out_request; 129 } 130 131 if (i915_request_wait(request, 0, T) == -ETIME) { 132 pr_err("request wait timed out!\n"); 133 goto out_request; 134 } 135 136 if (!i915_request_completed(request)) { 137 pr_err("request not complete after waiting!\n"); 138 goto out_request; 139 } 140 141 if (i915_request_wait(request, 0, T) == -ETIME) { 142 pr_err("request wait timed out when already complete!\n"); 143 goto out_request; 144 } 145 146 err = 0; 147 out_request: 148 i915_request_put(request); 149 mock_device_flush(i915); 150 return err; 151 } 152 153 static int igt_fence_wait(void *arg) 154 { 155 const long T = HZ / 4; 156 struct drm_i915_private *i915 = arg; 157 struct i915_request *request; 158 int err = -EINVAL; 159 160 /* Submit a request, treat it as a fence and wait upon it */ 161 162 request = mock_request(rcs0(i915)->kernel_context, T); 163 if (!request) 164 return -ENOMEM; 165 166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 167 pr_err("fence wait success before submit (expected timeout)!\n"); 168 goto out; 169 } 170 171 i915_request_add(request); 172 173 if (dma_fence_is_signaled(&request->fence)) { 174 pr_err("fence signaled immediately!\n"); 175 goto out; 176 } 177 178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 179 pr_err("fence wait success after submit (expected timeout)!\n"); 180 goto out; 181 } 182 183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 184 pr_err("fence wait timed out (expected success)!\n"); 185 goto out; 186 } 187 188 if (!dma_fence_is_signaled(&request->fence)) { 189 pr_err("fence unsignaled after waiting!\n"); 190 goto out; 191 } 192 193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 194 pr_err("fence wait timed out when complete (expected success)!\n"); 195 goto out; 196 } 197 198 err = 0; 199 out: 200 mock_device_flush(i915); 201 return err; 202 } 203 204 static int igt_request_rewind(void *arg) 205 { 206 struct drm_i915_private *i915 = arg; 207 struct i915_request *request, *vip; 208 struct i915_gem_context *ctx[2]; 209 struct intel_context *ce; 210 int err = -EINVAL; 211 212 ctx[0] = 
mock_context(i915, "A"); 213 if (!ctx[0]) { 214 err = -ENOMEM; 215 goto err_ctx_0; 216 } 217 218 ce = i915_gem_context_get_engine(ctx[0], RCS0); 219 GEM_BUG_ON(IS_ERR(ce)); 220 request = mock_request(ce, 2 * HZ); 221 intel_context_put(ce); 222 if (!request) { 223 err = -ENOMEM; 224 goto err_context_0; 225 } 226 227 i915_request_get(request); 228 i915_request_add(request); 229 230 ctx[1] = mock_context(i915, "B"); 231 if (!ctx[1]) { 232 err = -ENOMEM; 233 goto err_ctx_1; 234 } 235 236 ce = i915_gem_context_get_engine(ctx[1], RCS0); 237 GEM_BUG_ON(IS_ERR(ce)); 238 vip = mock_request(ce, 0); 239 intel_context_put(ce); 240 if (!vip) { 241 err = -ENOMEM; 242 goto err_context_1; 243 } 244 245 /* Simulate preemption by manual reordering */ 246 if (!mock_cancel_request(request)) { 247 pr_err("failed to cancel request (already executed)!\n"); 248 i915_request_add(vip); 249 goto err_context_1; 250 } 251 i915_request_get(vip); 252 i915_request_add(vip); 253 rcu_read_lock(); 254 request->engine->submit_request(request); 255 rcu_read_unlock(); 256 257 258 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 259 pr_err("timed out waiting for high priority request\n"); 260 goto err; 261 } 262 263 if (i915_request_completed(request)) { 264 pr_err("low priority request already completed\n"); 265 goto err; 266 } 267 268 err = 0; 269 err: 270 i915_request_put(vip); 271 err_context_1: 272 mock_context_close(ctx[1]); 273 err_ctx_1: 274 i915_request_put(request); 275 err_context_0: 276 mock_context_close(ctx[0]); 277 err_ctx_0: 278 mock_device_flush(i915); 279 return err; 280 } 281 282 struct smoketest { 283 struct intel_engine_cs *engine; 284 struct i915_gem_context **contexts; 285 atomic_long_t num_waits, num_fences; 286 int ncontexts, max_batch; 287 struct i915_request *(*request_alloc)(struct intel_context *ce); 288 }; 289 290 static struct i915_request * 291 __mock_request_alloc(struct intel_context *ce) 292 { 293 return mock_request(ce, 0); 294 } 295 296 static struct i915_request * 297 __live_request_alloc(struct intel_context *ce) 298 { 299 return intel_context_create_request(ce); 300 } 301 302 struct smoke_thread { 303 struct kthread_worker *worker; 304 struct kthread_work work; 305 struct smoketest *t; 306 bool stop; 307 int result; 308 }; 309 310 static void __igt_breadcrumbs_smoketest(struct kthread_work *work) 311 { 312 struct smoke_thread *thread = container_of(work, typeof(*thread), work); 313 struct smoketest *t = thread->t; 314 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 315 const unsigned int total = 4 * t->ncontexts + 1; 316 unsigned int num_waits = 0, num_fences = 0; 317 struct i915_request **requests; 318 I915_RND_STATE(prng); 319 unsigned int *order; 320 int err = 0; 321 322 /* 323 * A very simple test to catch the most egregious of list handling bugs. 324 * 325 * At its heart, we simply create oodles of requests running across 326 * multiple kthreads and enable signaling on them, for the sole purpose 327 * of stressing our breadcrumb handling. The only inspection we do is 328 * that the fences were marked as signaled. 
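 *
 * Each loop iteration gates a random batch of requests behind a single
 * submit fence, chains all of their completions into one wait fence,
 * releases the whole batch at once and then verifies that every request
 * was signaled before the fences are dropped.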
329 */ 330 331 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 332 if (!requests) { 333 thread->result = -ENOMEM; 334 return; 335 } 336 337 order = i915_random_order(total, &prng); 338 if (!order) { 339 err = -ENOMEM; 340 goto out_requests; 341 } 342 343 while (!READ_ONCE(thread->stop)) { 344 struct i915_sw_fence *submit, *wait; 345 unsigned int n, count; 346 347 submit = heap_fence_create(GFP_KERNEL); 348 if (!submit) { 349 err = -ENOMEM; 350 break; 351 } 352 353 wait = heap_fence_create(GFP_KERNEL); 354 if (!wait) { 355 i915_sw_fence_commit(submit); 356 heap_fence_put(submit); 357 err = -ENOMEM; 358 break; 359 } 360 361 i915_random_reorder(order, total, &prng); 362 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 363 364 for (n = 0; n < count; n++) { 365 struct i915_gem_context *ctx = 366 t->contexts[order[n] % t->ncontexts]; 367 struct i915_request *rq; 368 struct intel_context *ce; 369 370 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 371 GEM_BUG_ON(IS_ERR(ce)); 372 rq = t->request_alloc(ce); 373 intel_context_put(ce); 374 if (IS_ERR(rq)) { 375 err = PTR_ERR(rq); 376 count = n; 377 break; 378 } 379 380 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 381 submit, 382 GFP_KERNEL); 383 384 requests[n] = i915_request_get(rq); 385 i915_request_add(rq); 386 387 if (err >= 0) 388 err = i915_sw_fence_await_dma_fence(wait, 389 &rq->fence, 390 0, 391 GFP_KERNEL); 392 393 if (err < 0) { 394 i915_request_put(rq); 395 count = n; 396 break; 397 } 398 } 399 400 i915_sw_fence_commit(submit); 401 i915_sw_fence_commit(wait); 402 403 if (!wait_event_timeout(wait->wait, 404 i915_sw_fence_done(wait), 405 5 * HZ)) { 406 struct i915_request *rq = requests[count - 1]; 407 408 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 409 atomic_read(&wait->pending), count, 410 rq->fence.context, rq->fence.seqno, 411 t->engine->name); 412 GEM_TRACE_DUMP(); 413 414 intel_gt_set_wedged(t->engine->gt); 415 GEM_BUG_ON(!i915_request_completed(rq)); 416 i915_sw_fence_wait(wait); 417 err = -EIO; 418 } 419 420 for (n = 0; n < count; n++) { 421 struct i915_request *rq = requests[n]; 422 423 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 424 &rq->fence.flags)) { 425 pr_err("%llu:%llu was not signaled!\n", 426 rq->fence.context, rq->fence.seqno); 427 err = -EINVAL; 428 } 429 430 i915_request_put(rq); 431 } 432 433 heap_fence_put(wait); 434 heap_fence_put(submit); 435 436 if (err < 0) 437 break; 438 439 num_fences += count; 440 num_waits++; 441 442 cond_resched(); 443 } 444 445 atomic_long_add(num_fences, &t->num_fences); 446 atomic_long_add(num_waits, &t->num_waits); 447 448 kfree(order); 449 out_requests: 450 kfree(requests); 451 thread->result = err; 452 } 453 454 static int mock_breadcrumbs_smoketest(void *arg) 455 { 456 struct drm_i915_private *i915 = arg; 457 struct smoketest t = { 458 .engine = rcs0(i915), 459 .ncontexts = 1024, 460 .max_batch = 1024, 461 .request_alloc = __mock_request_alloc 462 }; 463 unsigned int ncpus = num_online_cpus(); 464 struct smoke_thread *threads; 465 unsigned int n; 466 int ret = 0; 467 468 /* 469 * Smoketest our breadcrumb/signal handling for requests across multiple 470 * threads. A very simple test to only catch the most egregious of bugs. 
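 * One kthread worker is spawned per online CPU, with every worker sharing
 * the same mock engine and pool of mock contexts.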
471 * See __igt_breadcrumbs_smoketest(); 472 */ 473 474 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 475 if (!threads) 476 return -ENOMEM; 477 478 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 479 if (!t.contexts) { 480 ret = -ENOMEM; 481 goto out_threads; 482 } 483 484 for (n = 0; n < t.ncontexts; n++) { 485 t.contexts[n] = mock_context(t.engine->i915, "mock"); 486 if (!t.contexts[n]) { 487 ret = -ENOMEM; 488 goto out_contexts; 489 } 490 } 491 492 for (n = 0; n < ncpus; n++) { 493 struct kthread_worker *worker; 494 495 worker = kthread_create_worker(0, "igt/%d", n); 496 if (IS_ERR(worker)) { 497 ret = PTR_ERR(worker); 498 ncpus = n; 499 break; 500 } 501 502 threads[n].worker = worker; 503 threads[n].t = &t; 504 threads[n].stop = false; 505 threads[n].result = 0; 506 507 kthread_init_work(&threads[n].work, 508 __igt_breadcrumbs_smoketest); 509 kthread_queue_work(worker, &threads[n].work); 510 } 511 512 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 513 514 for (n = 0; n < ncpus; n++) { 515 int err; 516 517 WRITE_ONCE(threads[n].stop, true); 518 kthread_flush_work(&threads[n].work); 519 err = READ_ONCE(threads[n].result); 520 if (err < 0 && !ret) 521 ret = err; 522 523 kthread_destroy_worker(threads[n].worker); 524 } 525 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 526 atomic_long_read(&t.num_waits), 527 atomic_long_read(&t.num_fences), 528 ncpus); 529 530 out_contexts: 531 for (n = 0; n < t.ncontexts; n++) { 532 if (!t.contexts[n]) 533 break; 534 mock_context_close(t.contexts[n]); 535 } 536 kfree(t.contexts); 537 out_threads: 538 kfree(threads); 539 return ret; 540 } 541 542 int i915_request_mock_selftests(void) 543 { 544 static const struct i915_subtest tests[] = { 545 SUBTEST(igt_add_request), 546 SUBTEST(igt_wait_request), 547 SUBTEST(igt_fence_wait), 548 SUBTEST(igt_request_rewind), 549 SUBTEST(mock_breadcrumbs_smoketest), 550 }; 551 struct drm_i915_private *i915; 552 intel_wakeref_t wakeref; 553 int err = 0; 554 555 i915 = mock_gem_device(); 556 if (!i915) 557 return -ENOMEM; 558 559 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 560 err = i915_subtests(tests, i915); 561 562 mock_destroy_device(i915); 563 564 return err; 565 } 566 567 static int live_nop_request(void *arg) 568 { 569 struct drm_i915_private *i915 = arg; 570 struct intel_engine_cs *engine; 571 struct igt_live_test t; 572 int err = -ENODEV; 573 574 /* 575 * Submit various sized batches of empty requests, to each engine 576 * (individually), and wait for the batch to complete. We can check 577 * the overhead of submitting requests to the hardware. 578 */ 579 580 for_each_uabi_engine(engine, i915) { 581 unsigned long n, prime; 582 IGT_TIMEOUT(end_time); 583 ktime_t times[2] = {}; 584 585 err = igt_live_test_begin(&t, i915, __func__, engine->name); 586 if (err) 587 return err; 588 589 intel_engine_pm_get(engine); 590 for_each_prime_number_from(prime, 1, 8192) { 591 struct i915_request *request = NULL; 592 593 times[1] = ktime_get_raw(); 594 595 for (n = 0; n < prime; n++) { 596 i915_request_put(request); 597 request = i915_request_create(engine->kernel_context); 598 if (IS_ERR(request)) 599 return PTR_ERR(request); 600 601 /* 602 * This space is left intentionally blank. 603 * 604 * We do not actually want to perform any 605 * action with this request, we just want 606 * to measure the latency in allocation 607 * and submission of our breadcrumbs - 608 * ensuring that the bare request is sufficient 609 * for the system to work (i.e. 
proper HEAD 610 * tracking of the rings, interrupt handling, 611 * etc). It also gives us the lowest bounds 612 * for latency. 613 */ 614 615 i915_request_get(request); 616 i915_request_add(request); 617 } 618 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 619 i915_request_put(request); 620 621 times[1] = ktime_sub(ktime_get_raw(), times[1]); 622 if (prime == 1) 623 times[0] = times[1]; 624 625 if (__igt_timeout(end_time, NULL)) 626 break; 627 } 628 intel_engine_pm_put(engine); 629 630 err = igt_live_test_end(&t); 631 if (err) 632 return err; 633 634 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 635 engine->name, 636 ktime_to_ns(times[0]), 637 prime, div64_u64(ktime_to_ns(times[1]), prime)); 638 } 639 640 return err; 641 } 642 643 static int __cancel_inactive(struct intel_engine_cs *engine) 644 { 645 struct intel_context *ce; 646 struct igt_spinner spin; 647 struct i915_request *rq; 648 int err = 0; 649 650 if (igt_spinner_init(&spin, engine->gt)) 651 return -ENOMEM; 652 653 ce = intel_context_create(engine); 654 if (IS_ERR(ce)) { 655 err = PTR_ERR(ce); 656 goto out_spin; 657 } 658 659 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 660 if (IS_ERR(rq)) { 661 err = PTR_ERR(rq); 662 goto out_ce; 663 } 664 665 pr_debug("%s: Cancelling inactive request\n", engine->name); 666 i915_request_cancel(rq, -EINTR); 667 i915_request_get(rq); 668 i915_request_add(rq); 669 670 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 671 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 672 673 pr_err("%s: Failed to cancel inactive request\n", engine->name); 674 intel_engine_dump(engine, &p, "%s\n", engine->name); 675 err = -ETIME; 676 goto out_rq; 677 } 678 679 if (rq->fence.error != -EINTR) { 680 pr_err("%s: fence not cancelled (%u)\n", 681 engine->name, rq->fence.error); 682 err = -EINVAL; 683 } 684 685 out_rq: 686 i915_request_put(rq); 687 out_ce: 688 intel_context_put(ce); 689 out_spin: 690 igt_spinner_fini(&spin); 691 if (err) 692 pr_err("%s: %s error %d\n", __func__, engine->name, err); 693 return err; 694 } 695 696 static int __cancel_active(struct intel_engine_cs *engine) 697 { 698 struct intel_context *ce; 699 struct igt_spinner spin; 700 struct i915_request *rq; 701 int err = 0; 702 703 if (igt_spinner_init(&spin, engine->gt)) 704 return -ENOMEM; 705 706 ce = intel_context_create(engine); 707 if (IS_ERR(ce)) { 708 err = PTR_ERR(ce); 709 goto out_spin; 710 } 711 712 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 713 if (IS_ERR(rq)) { 714 err = PTR_ERR(rq); 715 goto out_ce; 716 } 717 718 pr_debug("%s: Cancelling active request\n", engine->name); 719 i915_request_get(rq); 720 i915_request_add(rq); 721 if (!igt_wait_for_spinner(&spin, rq)) { 722 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 723 724 pr_err("Failed to start spinner on %s\n", engine->name); 725 intel_engine_dump(engine, &p, "%s\n", engine->name); 726 err = -ETIME; 727 goto out_rq; 728 } 729 i915_request_cancel(rq, -EINTR); 730 731 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 732 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 733 734 pr_err("%s: Failed to cancel active request\n", engine->name); 735 intel_engine_dump(engine, &p, "%s\n", engine->name); 736 err = -ETIME; 737 goto out_rq; 738 } 739 740 if (rq->fence.error != -EINTR) { 741 pr_err("%s: fence not cancelled (%u)\n", 742 engine->name, rq->fence.error); 743 err = -EINVAL; 744 } 745 746 out_rq: 747 i915_request_put(rq); 748 out_ce: 749 intel_context_put(ce); 750 out_spin: 751 igt_spinner_fini(&spin); 
        if (err)
                pr_err("%s: %s error %d\n", __func__, engine->name, err);
        return err;
}

static int __cancel_completed(struct intel_engine_cs *engine)
{
        struct intel_context *ce;
        struct igt_spinner spin;
        struct i915_request *rq;
        int err = 0;

        if (igt_spinner_init(&spin, engine->gt))
                return -ENOMEM;

        ce = intel_context_create(engine);
        if (IS_ERR(ce)) {
                err = PTR_ERR(ce);
                goto out_spin;
        }

        rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto out_ce;
        }
        igt_spinner_end(&spin);
        i915_request_get(rq);
        i915_request_add(rq);

        if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                err = -ETIME;
                goto out_rq;
        }

        pr_debug("%s: Cancelling completed request\n", engine->name);
        i915_request_cancel(rq, -EINTR);
        if (rq->fence.error) {
                pr_err("%s: completed request unexpectedly errored (%u)\n",
                       engine->name, rq->fence.error);
                err = -EINVAL;
        }

out_rq:
        i915_request_put(rq);
out_ce:
        intel_context_put(ce);
out_spin:
        igt_spinner_fini(&spin);
        if (err)
                pr_err("%s: %s error %d\n", __func__, engine->name, err);
        return err;
}

/*
 * Test to prove that a non-preemptible request can be cancelled and that a
 * subsequent request on the same context can successfully complete after the
 * cancellation.
 *
 * The testing methodology is to create a non-preemptible request and submit
 * it, wait for the spinner to start, create a NOP request and submit it,
 * cancel the spinner, wait for the spinner to complete and verify that it
 * failed with an error, and finally wait for the NOP request to complete and
 * verify that it succeeded without an error. The preemption timeout is also
 * reduced, and later restored, so that the test runs in a timely manner.
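 *
 * Dropping the preemption timeout to 100ms means that the forced engine
 * reset kicks in promptly once the spinning, non-preemptible request is
 * cancelled, keeping the subtest comfortably within the selftest timeout.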
 */
static int __cancel_reset(struct drm_i915_private *i915,
                          struct intel_engine_cs *engine)
{
        struct intel_context *ce;
        struct igt_spinner spin;
        struct i915_request *rq, *nop;
        unsigned long preempt_timeout_ms;
        int err = 0;

        if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
            !intel_has_reset_engine(engine->gt))
                return 0;

        preempt_timeout_ms = engine->props.preempt_timeout_ms;
        engine->props.preempt_timeout_ms = 100;

        if (igt_spinner_init(&spin, engine->gt)) {
                err = -ENOMEM;
                goto out_restore;
        }

        ce = intel_context_create(engine);
        if (IS_ERR(ce)) {
                err = PTR_ERR(ce);
                goto out_spin;
        }

        rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto out_ce;
        }

        pr_debug("%s: Cancelling active non-preemptible request\n",
                 engine->name);
        i915_request_get(rq);
        i915_request_add(rq);
        if (!igt_wait_for_spinner(&spin, rq)) {
                struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

                pr_err("Failed to start spinner on %s\n", engine->name);
                intel_engine_dump(engine, &p, "%s\n", engine->name);
                err = -ETIME;
                goto out_rq;
        }

        nop = intel_context_create_request(ce);
        if (IS_ERR(nop)) {
                err = PTR_ERR(nop);
                goto out_rq;
        }
        i915_request_get(nop);
        i915_request_add(nop);

        i915_request_cancel(rq, -EINTR);

        if (i915_request_wait(rq, 0, HZ) < 0) {
                struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

                pr_err("%s: Failed to cancel hung request\n", engine->name);
                intel_engine_dump(engine, &p, "%s\n", engine->name);
                err = -ETIME;
                goto out_nop;
        }

        if (rq->fence.error != -EINTR) {
                pr_err("%s: fence not cancelled (%u)\n",
                       engine->name, rq->fence.error);
                err = -EINVAL;
                goto out_nop;
        }

        if (i915_request_wait(nop, 0, HZ) < 0) {
                struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

                pr_err("%s: Failed to complete nop request\n", engine->name);
                intel_engine_dump(engine, &p, "%s\n", engine->name);
                err = -ETIME;
                goto out_nop;
        }

        if (nop->fence.error != 0) {
                pr_err("%s: Nop request errored (%u)\n",
                       engine->name, nop->fence.error);
                err = -EINVAL;
        }

out_nop:
        i915_request_put(nop);
out_rq:
        i915_request_put(rq);
out_ce:
        intel_context_put(ce);
out_spin:
        igt_spinner_fini(&spin);
out_restore:
        engine->props.preempt_timeout_ms = preempt_timeout_ms;
        if (err)
                pr_err("%s: %s error %d\n", __func__, engine->name, err);
        return err;
}

static int live_cancel_request(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;

        /*
         * Check cancellation of requests. We expect to be able to immediately
         * cancel active requests, even if they are currently on the GPU.
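         * Each engine is exercised against inactive, active and already
         * completed requests, followed by a cancellation that relies on an
         * engine reset and is therefore run outside of the igt_live_test
         * bracket.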
923 */ 924 925 for_each_uabi_engine(engine, i915) { 926 struct igt_live_test t; 927 int err, err2; 928 929 if (!intel_engine_has_preemption(engine)) 930 continue; 931 932 err = igt_live_test_begin(&t, i915, __func__, engine->name); 933 if (err) 934 return err; 935 936 err = __cancel_inactive(engine); 937 if (err == 0) 938 err = __cancel_active(engine); 939 if (err == 0) 940 err = __cancel_completed(engine); 941 942 err2 = igt_live_test_end(&t); 943 if (err) 944 return err; 945 if (err2) 946 return err2; 947 948 /* Expects reset so call outside of igt_live_test_* */ 949 err = __cancel_reset(i915, engine); 950 if (err) 951 return err; 952 953 if (igt_flush_test(i915)) 954 return -EIO; 955 } 956 957 return 0; 958 } 959 960 static struct i915_vma *empty_batch(struct drm_i915_private *i915) 961 { 962 struct drm_i915_gem_object *obj; 963 struct i915_vma *vma; 964 u32 *cmd; 965 int err; 966 967 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 968 if (IS_ERR(obj)) 969 return ERR_CAST(obj); 970 971 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB); 972 if (IS_ERR(cmd)) { 973 err = PTR_ERR(cmd); 974 goto err; 975 } 976 977 *cmd = MI_BATCH_BUFFER_END; 978 979 __i915_gem_object_flush_map(obj, 0, 64); 980 i915_gem_object_unpin_map(obj); 981 982 intel_gt_chipset_flush(to_gt(i915)); 983 984 vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL); 985 if (IS_ERR(vma)) { 986 err = PTR_ERR(vma); 987 goto err; 988 } 989 990 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); 991 if (err) 992 goto err; 993 994 /* Force the wait now to avoid including it in the benchmark */ 995 err = i915_vma_sync(vma); 996 if (err) 997 goto err_pin; 998 999 return vma; 1000 1001 err_pin: 1002 i915_vma_unpin(vma); 1003 err: 1004 i915_gem_object_put(obj); 1005 return ERR_PTR(err); 1006 } 1007 1008 static struct i915_request * 1009 empty_request(struct intel_engine_cs *engine, 1010 struct i915_vma *batch) 1011 { 1012 struct i915_request *request; 1013 int err; 1014 1015 request = i915_request_create(engine->kernel_context); 1016 if (IS_ERR(request)) 1017 return request; 1018 1019 err = engine->emit_bb_start(request, 1020 batch->node.start, 1021 batch->node.size, 1022 I915_DISPATCH_SECURE); 1023 if (err) 1024 goto out_request; 1025 1026 i915_request_get(request); 1027 out_request: 1028 i915_request_add(request); 1029 return err ? ERR_PTR(err) : request; 1030 } 1031 1032 static int live_empty_request(void *arg) 1033 { 1034 struct drm_i915_private *i915 = arg; 1035 struct intel_engine_cs *engine; 1036 struct igt_live_test t; 1037 struct i915_vma *batch; 1038 int err = 0; 1039 1040 /* 1041 * Submit various sized batches of empty requests, to each engine 1042 * (individually), and wait for the batch to complete. We can check 1043 * the overhead of submitting requests to the hardware. 
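 * Unlike live_nop_request, each request here also dispatches a real (if
 * trivial) batch buffer, so the measured latency includes the cost of
 * emitting and executing MI_BATCH_BUFFER_START.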
1044 */ 1045 1046 batch = empty_batch(i915); 1047 if (IS_ERR(batch)) 1048 return PTR_ERR(batch); 1049 1050 for_each_uabi_engine(engine, i915) { 1051 IGT_TIMEOUT(end_time); 1052 struct i915_request *request; 1053 unsigned long n, prime; 1054 ktime_t times[2] = {}; 1055 1056 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1057 if (err) 1058 goto out_batch; 1059 1060 intel_engine_pm_get(engine); 1061 1062 /* Warmup / preload */ 1063 request = empty_request(engine, batch); 1064 if (IS_ERR(request)) { 1065 err = PTR_ERR(request); 1066 intel_engine_pm_put(engine); 1067 goto out_batch; 1068 } 1069 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1070 1071 for_each_prime_number_from(prime, 1, 8192) { 1072 times[1] = ktime_get_raw(); 1073 1074 for (n = 0; n < prime; n++) { 1075 i915_request_put(request); 1076 request = empty_request(engine, batch); 1077 if (IS_ERR(request)) { 1078 err = PTR_ERR(request); 1079 intel_engine_pm_put(engine); 1080 goto out_batch; 1081 } 1082 } 1083 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1084 1085 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1086 if (prime == 1) 1087 times[0] = times[1]; 1088 1089 if (__igt_timeout(end_time, NULL)) 1090 break; 1091 } 1092 i915_request_put(request); 1093 intel_engine_pm_put(engine); 1094 1095 err = igt_live_test_end(&t); 1096 if (err) 1097 goto out_batch; 1098 1099 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1100 engine->name, 1101 ktime_to_ns(times[0]), 1102 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1103 } 1104 1105 out_batch: 1106 i915_vma_unpin(batch); 1107 i915_vma_put(batch); 1108 return err; 1109 } 1110 1111 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 1112 { 1113 struct drm_i915_gem_object *obj; 1114 const int ver = GRAPHICS_VER(i915); 1115 struct i915_vma *vma; 1116 u32 *cmd; 1117 int err; 1118 1119 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 1120 if (IS_ERR(obj)) 1121 return ERR_CAST(obj); 1122 1123 vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL); 1124 if (IS_ERR(vma)) { 1125 err = PTR_ERR(vma); 1126 goto err; 1127 } 1128 1129 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1130 if (err) 1131 goto err; 1132 1133 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1134 if (IS_ERR(cmd)) { 1135 err = PTR_ERR(cmd); 1136 goto err; 1137 } 1138 1139 if (ver >= 8) { 1140 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1141 *cmd++ = lower_32_bits(vma->node.start); 1142 *cmd++ = upper_32_bits(vma->node.start); 1143 } else if (ver >= 6) { 1144 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1145 *cmd++ = lower_32_bits(vma->node.start); 1146 } else { 1147 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1148 *cmd++ = lower_32_bits(vma->node.start); 1149 } 1150 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1151 1152 __i915_gem_object_flush_map(obj, 0, 64); 1153 i915_gem_object_unpin_map(obj); 1154 1155 intel_gt_chipset_flush(to_gt(i915)); 1156 1157 return vma; 1158 1159 err: 1160 i915_gem_object_put(obj); 1161 return ERR_PTR(err); 1162 } 1163 1164 static int recursive_batch_resolve(struct i915_vma *batch) 1165 { 1166 u32 *cmd; 1167 1168 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1169 if (IS_ERR(cmd)) 1170 return PTR_ERR(cmd); 1171 1172 *cmd = MI_BATCH_BUFFER_END; 1173 1174 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1175 i915_gem_object_unpin_map(batch->obj); 1176 1177 intel_gt_chipset_flush(batch->vm->gt); 1178 1179 return 0; 1180 } 1181 1182 static int live_all_engines(void *arg) 1183 { 1184 
struct drm_i915_private *i915 = arg; 1185 const unsigned int nengines = num_uabi_engines(i915); 1186 struct intel_engine_cs *engine; 1187 struct i915_request **request; 1188 struct igt_live_test t; 1189 struct i915_vma *batch; 1190 unsigned int idx; 1191 int err; 1192 1193 /* 1194 * Check we can submit requests to all engines simultaneously. We 1195 * send a recursive batch to each engine - checking that we don't 1196 * block doing so, and that they don't complete too soon. 1197 */ 1198 1199 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1200 if (!request) 1201 return -ENOMEM; 1202 1203 err = igt_live_test_begin(&t, i915, __func__, ""); 1204 if (err) 1205 goto out_free; 1206 1207 batch = recursive_batch(i915); 1208 if (IS_ERR(batch)) { 1209 err = PTR_ERR(batch); 1210 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 1211 goto out_free; 1212 } 1213 1214 i915_vma_lock(batch); 1215 1216 idx = 0; 1217 for_each_uabi_engine(engine, i915) { 1218 request[idx] = intel_engine_create_kernel_request(engine); 1219 if (IS_ERR(request[idx])) { 1220 err = PTR_ERR(request[idx]); 1221 pr_err("%s: Request allocation failed with err=%d\n", 1222 __func__, err); 1223 goto out_request; 1224 } 1225 1226 err = i915_request_await_object(request[idx], batch->obj, 0); 1227 if (err == 0) 1228 err = i915_vma_move_to_active(batch, request[idx], 0); 1229 GEM_BUG_ON(err); 1230 1231 err = engine->emit_bb_start(request[idx], 1232 batch->node.start, 1233 batch->node.size, 1234 0); 1235 GEM_BUG_ON(err); 1236 request[idx]->batch = batch; 1237 1238 i915_request_get(request[idx]); 1239 i915_request_add(request[idx]); 1240 idx++; 1241 } 1242 1243 i915_vma_unlock(batch); 1244 1245 idx = 0; 1246 for_each_uabi_engine(engine, i915) { 1247 if (i915_request_completed(request[idx])) { 1248 pr_err("%s(%s): request completed too early!\n", 1249 __func__, engine->name); 1250 err = -EINVAL; 1251 goto out_request; 1252 } 1253 idx++; 1254 } 1255 1256 err = recursive_batch_resolve(batch); 1257 if (err) { 1258 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 1259 goto out_request; 1260 } 1261 1262 idx = 0; 1263 for_each_uabi_engine(engine, i915) { 1264 long timeout; 1265 1266 timeout = i915_request_wait(request[idx], 0, 1267 MAX_SCHEDULE_TIMEOUT); 1268 if (timeout < 0) { 1269 err = timeout; 1270 pr_err("%s: error waiting for request on %s, err=%d\n", 1271 __func__, engine->name, err); 1272 goto out_request; 1273 } 1274 1275 GEM_BUG_ON(!i915_request_completed(request[idx])); 1276 i915_request_put(request[idx]); 1277 request[idx] = NULL; 1278 idx++; 1279 } 1280 1281 err = igt_live_test_end(&t); 1282 1283 out_request: 1284 idx = 0; 1285 for_each_uabi_engine(engine, i915) { 1286 if (request[idx]) 1287 i915_request_put(request[idx]); 1288 idx++; 1289 } 1290 i915_vma_unpin(batch); 1291 i915_vma_put(batch); 1292 out_free: 1293 kfree(request); 1294 return err; 1295 } 1296 1297 static int live_sequential_engines(void *arg) 1298 { 1299 struct drm_i915_private *i915 = arg; 1300 const unsigned int nengines = num_uabi_engines(i915); 1301 struct i915_request **request; 1302 struct i915_request *prev = NULL; 1303 struct intel_engine_cs *engine; 1304 struct igt_live_test t; 1305 unsigned int idx; 1306 int err; 1307 1308 /* 1309 * Check we can submit requests to all engines sequentially, such 1310 * that each successive request waits for the earlier ones. This 1311 * tests that we don't execute requests out of order, even though 1312 * they are running on independent engines. 
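 * Each engine's request awaits the fence of the previous engine's request,
 * and every batch spins until it is explicitly resolved, so no request may
 * complete before all of its predecessors have been allowed to finish.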
1313 */ 1314 1315 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1316 if (!request) 1317 return -ENOMEM; 1318 1319 err = igt_live_test_begin(&t, i915, __func__, ""); 1320 if (err) 1321 goto out_free; 1322 1323 idx = 0; 1324 for_each_uabi_engine(engine, i915) { 1325 struct i915_vma *batch; 1326 1327 batch = recursive_batch(i915); 1328 if (IS_ERR(batch)) { 1329 err = PTR_ERR(batch); 1330 pr_err("%s: Unable to create batch for %s, err=%d\n", 1331 __func__, engine->name, err); 1332 goto out_free; 1333 } 1334 1335 i915_vma_lock(batch); 1336 request[idx] = intel_engine_create_kernel_request(engine); 1337 if (IS_ERR(request[idx])) { 1338 err = PTR_ERR(request[idx]); 1339 pr_err("%s: Request allocation failed for %s with err=%d\n", 1340 __func__, engine->name, err); 1341 goto out_unlock; 1342 } 1343 1344 if (prev) { 1345 err = i915_request_await_dma_fence(request[idx], 1346 &prev->fence); 1347 if (err) { 1348 i915_request_add(request[idx]); 1349 pr_err("%s: Request await failed for %s with err=%d\n", 1350 __func__, engine->name, err); 1351 goto out_unlock; 1352 } 1353 } 1354 1355 err = i915_request_await_object(request[idx], 1356 batch->obj, false); 1357 if (err == 0) 1358 err = i915_vma_move_to_active(batch, request[idx], 0); 1359 GEM_BUG_ON(err); 1360 1361 err = engine->emit_bb_start(request[idx], 1362 batch->node.start, 1363 batch->node.size, 1364 0); 1365 GEM_BUG_ON(err); 1366 request[idx]->batch = batch; 1367 1368 i915_request_get(request[idx]); 1369 i915_request_add(request[idx]); 1370 1371 prev = request[idx]; 1372 idx++; 1373 1374 out_unlock: 1375 i915_vma_unlock(batch); 1376 if (err) 1377 goto out_request; 1378 } 1379 1380 idx = 0; 1381 for_each_uabi_engine(engine, i915) { 1382 long timeout; 1383 1384 if (i915_request_completed(request[idx])) { 1385 pr_err("%s(%s): request completed too early!\n", 1386 __func__, engine->name); 1387 err = -EINVAL; 1388 goto out_request; 1389 } 1390 1391 err = recursive_batch_resolve(request[idx]->batch); 1392 if (err) { 1393 pr_err("%s: failed to resolve batch, err=%d\n", 1394 __func__, err); 1395 goto out_request; 1396 } 1397 1398 timeout = i915_request_wait(request[idx], 0, 1399 MAX_SCHEDULE_TIMEOUT); 1400 if (timeout < 0) { 1401 err = timeout; 1402 pr_err("%s: error waiting for request on %s, err=%d\n", 1403 __func__, engine->name, err); 1404 goto out_request; 1405 } 1406 1407 GEM_BUG_ON(!i915_request_completed(request[idx])); 1408 idx++; 1409 } 1410 1411 err = igt_live_test_end(&t); 1412 1413 out_request: 1414 idx = 0; 1415 for_each_uabi_engine(engine, i915) { 1416 u32 *cmd; 1417 1418 if (!request[idx]) 1419 break; 1420 1421 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1422 I915_MAP_WC); 1423 if (!IS_ERR(cmd)) { 1424 *cmd = MI_BATCH_BUFFER_END; 1425 1426 __i915_gem_object_flush_map(request[idx]->batch->obj, 1427 0, sizeof(*cmd)); 1428 i915_gem_object_unpin_map(request[idx]->batch->obj); 1429 1430 intel_gt_chipset_flush(engine->gt); 1431 } 1432 1433 i915_vma_put(request[idx]->batch); 1434 i915_request_put(request[idx]); 1435 idx++; 1436 } 1437 out_free: 1438 kfree(request); 1439 return err; 1440 } 1441 1442 struct parallel_thread { 1443 struct kthread_worker *worker; 1444 struct kthread_work work; 1445 struct intel_engine_cs *engine; 1446 int result; 1447 }; 1448 1449 static void __live_parallel_engine1(struct kthread_work *work) 1450 { 1451 struct parallel_thread *thread = 1452 container_of(work, typeof(*thread), work); 1453 struct intel_engine_cs *engine = thread->engine; 1454 IGT_TIMEOUT(end_time); 1455 unsigned long 
count; 1456 int err = 0; 1457 1458 count = 0; 1459 intel_engine_pm_get(engine); 1460 do { 1461 struct i915_request *rq; 1462 1463 rq = i915_request_create(engine->kernel_context); 1464 if (IS_ERR(rq)) { 1465 err = PTR_ERR(rq); 1466 break; 1467 } 1468 1469 i915_request_get(rq); 1470 i915_request_add(rq); 1471 1472 err = 0; 1473 if (i915_request_wait(rq, 0, HZ) < 0) 1474 err = -ETIME; 1475 i915_request_put(rq); 1476 if (err) 1477 break; 1478 1479 count++; 1480 } while (!__igt_timeout(end_time, NULL)); 1481 intel_engine_pm_put(engine); 1482 1483 pr_info("%s: %lu request + sync\n", engine->name, count); 1484 thread->result = err; 1485 } 1486 1487 static void __live_parallel_engineN(struct kthread_work *work) 1488 { 1489 struct parallel_thread *thread = 1490 container_of(work, typeof(*thread), work); 1491 struct intel_engine_cs *engine = thread->engine; 1492 IGT_TIMEOUT(end_time); 1493 unsigned long count; 1494 int err = 0; 1495 1496 count = 0; 1497 intel_engine_pm_get(engine); 1498 do { 1499 struct i915_request *rq; 1500 1501 rq = i915_request_create(engine->kernel_context); 1502 if (IS_ERR(rq)) { 1503 err = PTR_ERR(rq); 1504 break; 1505 } 1506 1507 i915_request_add(rq); 1508 count++; 1509 } while (!__igt_timeout(end_time, NULL)); 1510 intel_engine_pm_put(engine); 1511 1512 pr_info("%s: %lu requests\n", engine->name, count); 1513 thread->result = err; 1514 } 1515 1516 static bool wake_all(struct drm_i915_private *i915) 1517 { 1518 if (atomic_dec_and_test(&i915->selftest.counter)) { 1519 wake_up_var(&i915->selftest.counter); 1520 return true; 1521 } 1522 1523 return false; 1524 } 1525 1526 static int wait_for_all(struct drm_i915_private *i915) 1527 { 1528 if (wake_all(i915)) 1529 return 0; 1530 1531 if (wait_var_event_timeout(&i915->selftest.counter, 1532 !atomic_read(&i915->selftest.counter), 1533 i915_selftest.timeout_jiffies)) 1534 return 0; 1535 1536 return -ETIME; 1537 } 1538 1539 static void __live_parallel_spin(struct kthread_work *work) 1540 { 1541 struct parallel_thread *thread = 1542 container_of(work, typeof(*thread), work); 1543 struct intel_engine_cs *engine = thread->engine; 1544 struct igt_spinner spin; 1545 struct i915_request *rq; 1546 int err = 0; 1547 1548 /* 1549 * Create a spinner running for eternity on each engine. If a second 1550 * spinner is incorrectly placed on the same engine, it will not be 1551 * able to start in time. 
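 * Each worker occupies its engine with the spinner until wait_for_all()
 * sees every engine check in; if two uabi engines alias the same hardware,
 * the second spinner never starts and the test fails.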
1552 */ 1553 1554 if (igt_spinner_init(&spin, engine->gt)) { 1555 wake_all(engine->i915); 1556 thread->result = -ENOMEM; 1557 return; 1558 } 1559 1560 intel_engine_pm_get(engine); 1561 rq = igt_spinner_create_request(&spin, 1562 engine->kernel_context, 1563 MI_NOOP); /* no preemption */ 1564 intel_engine_pm_put(engine); 1565 if (IS_ERR(rq)) { 1566 err = PTR_ERR(rq); 1567 if (err == -ENODEV) 1568 err = 0; 1569 wake_all(engine->i915); 1570 goto out_spin; 1571 } 1572 1573 i915_request_get(rq); 1574 i915_request_add(rq); 1575 if (igt_wait_for_spinner(&spin, rq)) { 1576 /* Occupy this engine for the whole test */ 1577 err = wait_for_all(engine->i915); 1578 } else { 1579 pr_err("Failed to start spinner on %s\n", engine->name); 1580 err = -EINVAL; 1581 } 1582 igt_spinner_end(&spin); 1583 1584 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1585 err = -EIO; 1586 i915_request_put(rq); 1587 1588 out_spin: 1589 igt_spinner_fini(&spin); 1590 thread->result = err; 1591 } 1592 1593 static int live_parallel_engines(void *arg) 1594 { 1595 struct drm_i915_private *i915 = arg; 1596 static void (* const func[])(struct kthread_work *) = { 1597 __live_parallel_engine1, 1598 __live_parallel_engineN, 1599 __live_parallel_spin, 1600 NULL, 1601 }; 1602 const unsigned int nengines = num_uabi_engines(i915); 1603 struct parallel_thread *threads; 1604 struct intel_engine_cs *engine; 1605 void (* const *fn)(struct kthread_work *); 1606 int err = 0; 1607 1608 /* 1609 * Check we can submit requests to all engines concurrently. This 1610 * tests that we load up the system maximally. 1611 */ 1612 1613 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL); 1614 if (!threads) 1615 return -ENOMEM; 1616 1617 for (fn = func; !err && *fn; fn++) { 1618 char name[KSYM_NAME_LEN]; 1619 struct igt_live_test t; 1620 unsigned int idx; 1621 1622 snprintf(name, sizeof(name), "%ps", *fn); 1623 err = igt_live_test_begin(&t, i915, __func__, name); 1624 if (err) 1625 break; 1626 1627 atomic_set(&i915->selftest.counter, nengines); 1628 1629 idx = 0; 1630 for_each_uabi_engine(engine, i915) { 1631 struct kthread_worker *worker; 1632 1633 worker = kthread_create_worker(0, "igt/parallel:%s", 1634 engine->name); 1635 if (IS_ERR(worker)) { 1636 err = PTR_ERR(worker); 1637 break; 1638 } 1639 1640 threads[idx].worker = worker; 1641 threads[idx].result = 0; 1642 threads[idx].engine = engine; 1643 1644 kthread_init_work(&threads[idx].work, *fn); 1645 kthread_queue_work(worker, &threads[idx].work); 1646 idx++; 1647 } 1648 1649 idx = 0; 1650 for_each_uabi_engine(engine, i915) { 1651 int status; 1652 1653 if (!threads[idx].worker) 1654 break; 1655 1656 kthread_flush_work(&threads[idx].work); 1657 status = READ_ONCE(threads[idx].result); 1658 if (status && !err) 1659 err = status; 1660 1661 kthread_destroy_worker(threads[idx++].worker); 1662 } 1663 1664 if (igt_live_test_end(&t)) 1665 err = -EIO; 1666 } 1667 1668 kfree(threads); 1669 return err; 1670 } 1671 1672 static int 1673 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1674 { 1675 struct i915_request *rq; 1676 int ret; 1677 1678 /* 1679 * Before execlists, all contexts share the same ringbuffer. With 1680 * execlists, each context/engine has a separate ringbuffer and 1681 * for the purposes of this test, inexhaustible. 1682 * 1683 * For the global ringbuffer though, we have to be very careful 1684 * that we do not wrap while preventing the execution of requests 1685 * with a unsignaled fence. 
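 * The estimate below divides the available ring space by the footprint of
 * a single nop request and then halves the result to leave generous
 * emergency headroom.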
1686 */ 1687 if (HAS_EXECLISTS(ctx->i915)) 1688 return INT_MAX; 1689 1690 rq = igt_request_alloc(ctx, engine); 1691 if (IS_ERR(rq)) { 1692 ret = PTR_ERR(rq); 1693 } else { 1694 int sz; 1695 1696 ret = rq->ring->size - rq->reserved_space; 1697 i915_request_add(rq); 1698 1699 sz = rq->ring->emit - rq->head; 1700 if (sz < 0) 1701 sz += rq->ring->size; 1702 ret /= sz; 1703 ret /= 2; /* leave half spare, in case of emergency! */ 1704 } 1705 1706 return ret; 1707 } 1708 1709 static int live_breadcrumbs_smoketest(void *arg) 1710 { 1711 struct drm_i915_private *i915 = arg; 1712 const unsigned int nengines = num_uabi_engines(i915); 1713 const unsigned int ncpus = /* saturate with nengines * ncpus */ 1714 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines)); 1715 unsigned long num_waits, num_fences; 1716 struct intel_engine_cs *engine; 1717 struct smoke_thread *threads; 1718 struct igt_live_test live; 1719 intel_wakeref_t wakeref; 1720 struct smoketest *smoke; 1721 unsigned int n, idx; 1722 struct file *file; 1723 int ret = 0; 1724 1725 /* 1726 * Smoketest our breadcrumb/signal handling for requests across multiple 1727 * threads. A very simple test to only catch the most egregious of bugs. 1728 * See __igt_breadcrumbs_smoketest(); 1729 * 1730 * On real hardware this time. 1731 */ 1732 1733 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1734 1735 file = mock_file(i915); 1736 if (IS_ERR(file)) { 1737 ret = PTR_ERR(file); 1738 goto out_rpm; 1739 } 1740 1741 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1742 if (!smoke) { 1743 ret = -ENOMEM; 1744 goto out_file; 1745 } 1746 1747 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1748 if (!threads) { 1749 ret = -ENOMEM; 1750 goto out_smoke; 1751 } 1752 1753 smoke[0].request_alloc = __live_request_alloc; 1754 smoke[0].ncontexts = 64; 1755 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1756 sizeof(*smoke[0].contexts), 1757 GFP_KERNEL); 1758 if (!smoke[0].contexts) { 1759 ret = -ENOMEM; 1760 goto out_threads; 1761 } 1762 1763 for (n = 0; n < smoke[0].ncontexts; n++) { 1764 smoke[0].contexts[n] = live_context(i915, file); 1765 if (IS_ERR(smoke[0].contexts[n])) { 1766 ret = PTR_ERR(smoke[0].contexts[n]); 1767 goto out_contexts; 1768 } 1769 } 1770 1771 ret = igt_live_test_begin(&live, i915, __func__, ""); 1772 if (ret) 1773 goto out_contexts; 1774 1775 idx = 0; 1776 for_each_uabi_engine(engine, i915) { 1777 smoke[idx] = smoke[0]; 1778 smoke[idx].engine = engine; 1779 smoke[idx].max_batch = 1780 max_batches(smoke[0].contexts[0], engine); 1781 if (smoke[idx].max_batch < 0) { 1782 ret = smoke[idx].max_batch; 1783 goto out_flush; 1784 } 1785 /* One ring interleaved between requests from all cpus */ 1786 smoke[idx].max_batch /= ncpus + 1; 1787 pr_debug("Limiting batches to %d requests on %s\n", 1788 smoke[idx].max_batch, engine->name); 1789 1790 for (n = 0; n < ncpus; n++) { 1791 unsigned int i = idx * ncpus + n; 1792 struct kthread_worker *worker; 1793 1794 worker = kthread_create_worker(0, "igt/%d.%d", idx, n); 1795 if (IS_ERR(worker)) { 1796 ret = PTR_ERR(worker); 1797 goto out_flush; 1798 } 1799 1800 threads[i].worker = worker; 1801 threads[i].t = &smoke[idx]; 1802 1803 kthread_init_work(&threads[i].work, 1804 __igt_breadcrumbs_smoketest); 1805 kthread_queue_work(worker, &threads[i].work); 1806 } 1807 1808 idx++; 1809 } 1810 1811 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1812 1813 out_flush: 1814 idx = 0; 1815 num_waits = 0; 1816 num_fences = 0; 1817 for_each_uabi_engine(engine, i915) { 1818 for (n = 0; n < 
ncpus; n++) { 1819 unsigned int i = idx * ncpus + n; 1820 int err; 1821 1822 if (!threads[i].worker) 1823 continue; 1824 1825 WRITE_ONCE(threads[i].stop, true); 1826 kthread_flush_work(&threads[i].work); 1827 err = READ_ONCE(threads[i].result); 1828 if (err < 0 && !ret) 1829 ret = err; 1830 1831 kthread_destroy_worker(threads[i].worker); 1832 } 1833 1834 num_waits += atomic_long_read(&smoke[idx].num_waits); 1835 num_fences += atomic_long_read(&smoke[idx].num_fences); 1836 idx++; 1837 } 1838 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1839 num_waits, num_fences, idx, ncpus); 1840 1841 ret = igt_live_test_end(&live) ?: ret; 1842 out_contexts: 1843 kfree(smoke[0].contexts); 1844 out_threads: 1845 kfree(threads); 1846 out_smoke: 1847 kfree(smoke); 1848 out_file: 1849 fput(file); 1850 out_rpm: 1851 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1852 1853 return ret; 1854 } 1855 1856 int i915_request_live_selftests(struct drm_i915_private *i915) 1857 { 1858 static const struct i915_subtest tests[] = { 1859 SUBTEST(live_nop_request), 1860 SUBTEST(live_all_engines), 1861 SUBTEST(live_sequential_engines), 1862 SUBTEST(live_parallel_engines), 1863 SUBTEST(live_empty_request), 1864 SUBTEST(live_cancel_request), 1865 SUBTEST(live_breadcrumbs_smoketest), 1866 }; 1867 1868 if (intel_gt_is_wedged(to_gt(i915))) 1869 return 0; 1870 1871 return i915_live_subtests(tests, i915); 1872 } 1873 1874 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1875 { 1876 struct i915_request *rq; 1877 struct dma_fence *fence; 1878 1879 rq = intel_engine_create_kernel_request(ce->engine); 1880 if (IS_ERR(rq)) 1881 return PTR_ERR(rq); 1882 1883 fence = i915_active_fence_get(&ce->timeline->last_request); 1884 if (fence) { 1885 i915_request_await_dma_fence(rq, fence); 1886 dma_fence_put(fence); 1887 } 1888 1889 rq = i915_request_get(rq); 1890 i915_request_add(rq); 1891 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1892 err = -ETIME; 1893 i915_request_put(rq); 1894 1895 while (!err && !intel_engine_is_idle(ce->engine)) 1896 intel_engine_flush_submission(ce->engine); 1897 1898 return err; 1899 } 1900 1901 struct perf_stats { 1902 struct intel_engine_cs *engine; 1903 unsigned long count; 1904 ktime_t time; 1905 ktime_t busy; 1906 u64 runtime; 1907 }; 1908 1909 struct perf_series { 1910 struct drm_i915_private *i915; 1911 unsigned int nengines; 1912 struct intel_context *ce[]; 1913 }; 1914 1915 static int cmp_u32(const void *A, const void *B) 1916 { 1917 const u32 *a = A, *b = B; 1918 1919 return *a - *b; 1920 } 1921 1922 static u32 trifilter(u32 *a) 1923 { 1924 u64 sum; 1925 1926 #define TF_COUNT 5 1927 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1928 1929 sum = mul_u32_u32(a[2], 2); 1930 sum += a[1]; 1931 sum += a[3]; 1932 1933 GEM_BUG_ON(sum > U32_MAX); 1934 return sum; 1935 #define TF_BIAS 2 1936 } 1937 1938 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1939 { 1940 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1941 1942 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1943 } 1944 1945 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1946 { 1947 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1948 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1949 *cs++ = offset; 1950 *cs++ = 0; 1951 1952 return cs; 1953 } 1954 1955 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1956 { 1957 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1958 *cs++ = offset; 1959 *cs++ = 0; 1960 *cs++ = 
value; 1961 1962 return cs; 1963 } 1964 1965 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1966 { 1967 *cs++ = MI_SEMAPHORE_WAIT | 1968 MI_SEMAPHORE_GLOBAL_GTT | 1969 MI_SEMAPHORE_POLL | 1970 mode; 1971 *cs++ = value; 1972 *cs++ = offset; 1973 *cs++ = 0; 1974 1975 return cs; 1976 } 1977 1978 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1979 { 1980 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1981 } 1982 1983 static void semaphore_set(u32 *sema, u32 value) 1984 { 1985 WRITE_ONCE(*sema, value); 1986 wmb(); /* flush the update to the cache, and beyond */ 1987 } 1988 1989 static u32 *hwsp_scratch(const struct intel_context *ce) 1990 { 1991 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 1992 } 1993 1994 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 1995 { 1996 return (i915_ggtt_offset(ce->engine->status_page.vma) + 1997 offset_in_page(dw)); 1998 } 1999 2000 static int measure_semaphore_response(struct intel_context *ce) 2001 { 2002 u32 *sema = hwsp_scratch(ce); 2003 const u32 offset = hwsp_offset(ce, sema); 2004 u32 elapsed[TF_COUNT], cycles; 2005 struct i915_request *rq; 2006 u32 *cs; 2007 int err; 2008 int i; 2009 2010 /* 2011 * Measure how many cycles it takes for the HW to detect the change 2012 * in a semaphore value. 2013 * 2014 * A: read CS_TIMESTAMP from CPU 2015 * poke semaphore 2016 * B: read CS_TIMESTAMP on GPU 2017 * 2018 * Semaphore latency: B - A 2019 */ 2020 2021 semaphore_set(sema, -1); 2022 2023 rq = i915_request_create(ce); 2024 if (IS_ERR(rq)) 2025 return PTR_ERR(rq); 2026 2027 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 2028 if (IS_ERR(cs)) { 2029 i915_request_add(rq); 2030 err = PTR_ERR(cs); 2031 goto err; 2032 } 2033 2034 cs = emit_store_dw(cs, offset, 0); 2035 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2036 cs = emit_semaphore_poll_until(cs, offset, i); 2037 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2038 cs = emit_store_dw(cs, offset, 0); 2039 } 2040 2041 intel_ring_advance(rq, cs); 2042 i915_request_add(rq); 2043 2044 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2045 err = -EIO; 2046 goto err; 2047 } 2048 2049 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2050 preempt_disable(); 2051 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2052 semaphore_set(sema, i); 2053 preempt_enable(); 2054 2055 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2056 err = -EIO; 2057 goto err; 2058 } 2059 2060 elapsed[i - 1] = sema[i] - cycles; 2061 } 2062 2063 cycles = trifilter(elapsed); 2064 pr_info("%s: semaphore response %d cycles, %lluns\n", 2065 ce->engine->name, cycles >> TF_BIAS, 2066 cycles_to_ns(ce->engine, cycles)); 2067 2068 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2069 2070 err: 2071 intel_gt_set_wedged(ce->engine->gt); 2072 return err; 2073 } 2074 2075 static int measure_idle_dispatch(struct intel_context *ce) 2076 { 2077 u32 *sema = hwsp_scratch(ce); 2078 const u32 offset = hwsp_offset(ce, sema); 2079 u32 elapsed[TF_COUNT], cycles; 2080 u32 *cs; 2081 int err; 2082 int i; 2083 2084 /* 2085 * Measure how long it takes for us to submit a request while the 2086 * engine is idle, but is resting in our context. 
2087 * 2088 * A: read CS_TIMESTAMP from CPU 2089 * submit request 2090 * B: read CS_TIMESTAMP on GPU 2091 * 2092 * Submission latency: B - A 2093 */ 2094 2095 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2096 struct i915_request *rq; 2097 2098 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2099 if (err) 2100 return err; 2101 2102 rq = i915_request_create(ce); 2103 if (IS_ERR(rq)) { 2104 err = PTR_ERR(rq); 2105 goto err; 2106 } 2107 2108 cs = intel_ring_begin(rq, 4); 2109 if (IS_ERR(cs)) { 2110 i915_request_add(rq); 2111 err = PTR_ERR(cs); 2112 goto err; 2113 } 2114 2115 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2116 2117 intel_ring_advance(rq, cs); 2118 2119 preempt_disable(); 2120 local_bh_disable(); 2121 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2122 i915_request_add(rq); 2123 local_bh_enable(); 2124 preempt_enable(); 2125 } 2126 2127 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2128 if (err) 2129 goto err; 2130 2131 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2132 elapsed[i] = sema[i] - elapsed[i]; 2133 2134 cycles = trifilter(elapsed); 2135 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2136 ce->engine->name, cycles >> TF_BIAS, 2137 cycles_to_ns(ce->engine, cycles)); 2138 2139 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2140 2141 err: 2142 intel_gt_set_wedged(ce->engine->gt); 2143 return err; 2144 } 2145 2146 static int measure_busy_dispatch(struct intel_context *ce) 2147 { 2148 u32 *sema = hwsp_scratch(ce); 2149 const u32 offset = hwsp_offset(ce, sema); 2150 u32 elapsed[TF_COUNT + 1], cycles; 2151 u32 *cs; 2152 int err; 2153 int i; 2154 2155 /* 2156 * Measure how long it takes for us to submit a request while the 2157 * engine is busy, polling on a semaphore in our context. With 2158 * direct submission, this will include the cost of a lite restore. 
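 * Each preceding request is left spinning on the semaphore while the next
 * one is queued behind it and is only released after submission, so the
 * GPU timestamp is sampled as the engine flows from one request into the
 * next.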
2159 * 2160 * A: read CS_TIMESTAMP from CPU 2161 * submit request 2162 * B: read CS_TIMESTAMP on GPU 2163 * 2164 * Submission latency: B - A 2165 */ 2166 2167 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2168 struct i915_request *rq; 2169 2170 rq = i915_request_create(ce); 2171 if (IS_ERR(rq)) { 2172 err = PTR_ERR(rq); 2173 goto err; 2174 } 2175 2176 cs = intel_ring_begin(rq, 12); 2177 if (IS_ERR(cs)) { 2178 i915_request_add(rq); 2179 err = PTR_ERR(cs); 2180 goto err; 2181 } 2182 2183 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2184 cs = emit_semaphore_poll_until(cs, offset, i); 2185 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2186 2187 intel_ring_advance(rq, cs); 2188 2189 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2190 err = -EIO; 2191 goto err; 2192 } 2193 2194 preempt_disable(); 2195 local_bh_disable(); 2196 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2197 i915_request_add(rq); 2198 local_bh_enable(); 2199 semaphore_set(sema, i - 1); 2200 preempt_enable(); 2201 } 2202 2203 wait_for(READ_ONCE(sema[i - 1]), 500); 2204 semaphore_set(sema, i - 1); 2205 2206 for (i = 1; i <= TF_COUNT; i++) { 2207 GEM_BUG_ON(sema[i] == -1); 2208 elapsed[i - 1] = sema[i] - elapsed[i]; 2209 } 2210 2211 cycles = trifilter(elapsed); 2212 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2213 ce->engine->name, cycles >> TF_BIAS, 2214 cycles_to_ns(ce->engine, cycles)); 2215 2216 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2217 2218 err: 2219 intel_gt_set_wedged(ce->engine->gt); 2220 return err; 2221 } 2222 2223 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2224 { 2225 const u32 offset = 2226 i915_ggtt_offset(engine->status_page.vma) + 2227 offset_in_page(sema); 2228 struct i915_request *rq; 2229 u32 *cs; 2230 2231 rq = i915_request_create(engine->kernel_context); 2232 if (IS_ERR(rq)) 2233 return PTR_ERR(rq); 2234 2235 cs = intel_ring_begin(rq, 4); 2236 if (IS_ERR(cs)) { 2237 i915_request_add(rq); 2238 return PTR_ERR(cs); 2239 } 2240 2241 cs = emit_semaphore_poll(cs, mode, value, offset); 2242 2243 intel_ring_advance(rq, cs); 2244 i915_request_add(rq); 2245 2246 return 0; 2247 } 2248 2249 static int measure_inter_request(struct intel_context *ce) 2250 { 2251 u32 *sema = hwsp_scratch(ce); 2252 const u32 offset = hwsp_offset(ce, sema); 2253 u32 elapsed[TF_COUNT + 1], cycles; 2254 struct i915_sw_fence *submit; 2255 int i, err; 2256 2257 /* 2258 * Measure how long it takes to advance from one request into the 2259 * next. Between each request we flush the GPU caches to memory, 2260 * update the breadcrumbs, and then invalidate those caches. 2261 * We queue up all the requests to be submitted in one batch so 2262 * it should be one set of contiguous measurements. 
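 * The engine is first plugged by a kernel-context semaphore wait, and every
 * timestamp request is additionally held back by a single submit fence, so
 * the whole chain is queued before any of it is allowed to execute.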
2263 * 2264 * A: read CS_TIMESTAMP on GPU 2265 * advance request 2266 * B: read CS_TIMESTAMP on GPU 2267 * 2268 * Request latency: B - A 2269 */ 2270 2271 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2272 if (err) 2273 return err; 2274 2275 submit = heap_fence_create(GFP_KERNEL); 2276 if (!submit) { 2277 semaphore_set(sema, 1); 2278 return -ENOMEM; 2279 } 2280 2281 intel_engine_flush_submission(ce->engine); 2282 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2283 struct i915_request *rq; 2284 u32 *cs; 2285 2286 rq = i915_request_create(ce); 2287 if (IS_ERR(rq)) { 2288 err = PTR_ERR(rq); 2289 goto err_submit; 2290 } 2291 2292 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 2293 submit, 2294 GFP_KERNEL); 2295 if (err < 0) { 2296 i915_request_add(rq); 2297 goto err_submit; 2298 } 2299 2300 cs = intel_ring_begin(rq, 4); 2301 if (IS_ERR(cs)) { 2302 i915_request_add(rq); 2303 err = PTR_ERR(cs); 2304 goto err_submit; 2305 } 2306 2307 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2308 2309 intel_ring_advance(rq, cs); 2310 i915_request_add(rq); 2311 } 2312 i915_sw_fence_commit(submit); 2313 intel_engine_flush_submission(ce->engine); 2314 heap_fence_put(submit); 2315 2316 semaphore_set(sema, 1); 2317 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2318 if (err) 2319 goto err; 2320 2321 for (i = 1; i <= TF_COUNT; i++) 2322 elapsed[i - 1] = sema[i + 1] - sema[i]; 2323 2324 cycles = trifilter(elapsed); 2325 pr_info("%s: inter-request latency %d cycles, %lluns\n", 2326 ce->engine->name, cycles >> TF_BIAS, 2327 cycles_to_ns(ce->engine, cycles)); 2328 2329 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2330 2331 err_submit: 2332 i915_sw_fence_commit(submit); 2333 heap_fence_put(submit); 2334 semaphore_set(sema, 1); 2335 err: 2336 intel_gt_set_wedged(ce->engine->gt); 2337 return err; 2338 } 2339 2340 static int measure_context_switch(struct intel_context *ce) 2341 { 2342 u32 *sema = hwsp_scratch(ce); 2343 const u32 offset = hwsp_offset(ce, sema); 2344 struct i915_request *fence = NULL; 2345 u32 elapsed[TF_COUNT + 1], cycles; 2346 int i, j, err; 2347 u32 *cs; 2348 2349 /* 2350 * Measure how long it takes to advance from one request in one 2351 * context to a request in another context. This allows us to 2352 * measure how long the context save/restore take, along with all 2353 * the inter-context setup we require. 
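	 *
	 * Roughly: requests are queued alternately on this context and the
	 * engine's kernel context, each awaiting the previous request's
	 * fence, and each writing CS_TIMESTAMP into the next HWSP slot. The
	 * gap between the timestamp of one context's request and that of the
	 * following request on the other context is the switch cost.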
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    switch context
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
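	 *
	 * Roughly: a spinner request on this context marks its slot with -1
	 * and parks on the semaphore; a kernel-context request submitted at
	 * I915_PRIORITY_BARRIER then preempts it, overwrites that marker
	 * with its own CS_TIMESTAMP and releases the semaphore, after which
	 * the original context resumes and writes a second timestamp.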
2448 * 2449 * A: read CS_TIMESTAMP from CPU 2450 * submit preemption 2451 * B: read CS_TIMESTAMP on GPU (in preempting context) 2452 * context switch 2453 * C: read CS_TIMESTAMP on GPU (in original context) 2454 * 2455 * Preemption dispatch latency: B - A 2456 * Preemption switch latency: C - B 2457 */ 2458 2459 if (!intel_engine_has_preemption(ce->engine)) 2460 return 0; 2461 2462 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2463 u32 addr = offset + 2 * i * sizeof(u32); 2464 struct i915_request *rq; 2465 2466 rq = i915_request_create(ce); 2467 if (IS_ERR(rq)) { 2468 err = PTR_ERR(rq); 2469 goto err; 2470 } 2471 2472 cs = intel_ring_begin(rq, 12); 2473 if (IS_ERR(cs)) { 2474 i915_request_add(rq); 2475 err = PTR_ERR(cs); 2476 goto err; 2477 } 2478 2479 cs = emit_store_dw(cs, addr, -1); 2480 cs = emit_semaphore_poll_until(cs, offset, i); 2481 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2482 2483 intel_ring_advance(rq, cs); 2484 i915_request_add(rq); 2485 2486 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2487 err = -EIO; 2488 goto err; 2489 } 2490 2491 rq = i915_request_create(ce->engine->kernel_context); 2492 if (IS_ERR(rq)) { 2493 err = PTR_ERR(rq); 2494 goto err; 2495 } 2496 2497 cs = intel_ring_begin(rq, 8); 2498 if (IS_ERR(cs)) { 2499 i915_request_add(rq); 2500 err = PTR_ERR(cs); 2501 goto err; 2502 } 2503 2504 cs = emit_timestamp_store(cs, ce, addr); 2505 cs = emit_store_dw(cs, offset, i); 2506 2507 intel_ring_advance(rq, cs); 2508 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2509 2510 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2511 i915_request_add(rq); 2512 } 2513 2514 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2515 err = -EIO; 2516 goto err; 2517 } 2518 2519 for (i = 1; i <= TF_COUNT; i++) 2520 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2521 2522 cycles = trifilter(elapsed); 2523 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2524 ce->engine->name, cycles >> TF_BIAS, 2525 cycles_to_ns(ce->engine, cycles)); 2526 2527 for (i = 1; i <= TF_COUNT; i++) 2528 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2529 2530 cycles = trifilter(elapsed); 2531 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2532 ce->engine->name, cycles >> TF_BIAS, 2533 cycles_to_ns(ce->engine, cycles)); 2534 2535 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2536 2537 err: 2538 intel_gt_set_wedged(ce->engine->gt); 2539 return err; 2540 } 2541 2542 struct signal_cb { 2543 struct dma_fence_cb base; 2544 bool seen; 2545 }; 2546 2547 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2548 { 2549 struct signal_cb *s = container_of(cb, typeof(*s), base); 2550 2551 smp_store_mb(s->seen, true); /* be safe, be strong */ 2552 } 2553 2554 static int measure_completion(struct intel_context *ce) 2555 { 2556 u32 *sema = hwsp_scratch(ce); 2557 const u32 offset = hwsp_offset(ce, sema); 2558 u32 elapsed[TF_COUNT], cycles; 2559 u32 *cs; 2560 int err; 2561 int i; 2562 2563 /* 2564 * Measure how long it takes for the signal (interrupt) to be 2565 * sent from the GPU to be processed by the CPU. 
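	 *
	 * Roughly: each request parks on the semaphore just before its final
	 * CS_TIMESTAMP write; the CPU releases it and then busy-waits for
	 * the request's dma-fence callback, sampling RING_TIMESTAMP once the
	 * callback has run. The difference between that CPU sample and the
	 * GPU's last timestamp approximates the signal delivery latency.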
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    signal
	 * B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if
(igt_flush_test(i915)) 2709 err = -EIO; 2710 2711 cpu_latency_qos_remove_request(&qos); 2712 return err; 2713 } 2714 2715 static int s_sync0(void *arg) 2716 { 2717 struct perf_series *ps = arg; 2718 IGT_TIMEOUT(end_time); 2719 unsigned int idx = 0; 2720 int err = 0; 2721 2722 GEM_BUG_ON(!ps->nengines); 2723 do { 2724 struct i915_request *rq; 2725 2726 rq = i915_request_create(ps->ce[idx]); 2727 if (IS_ERR(rq)) { 2728 err = PTR_ERR(rq); 2729 break; 2730 } 2731 2732 i915_request_get(rq); 2733 i915_request_add(rq); 2734 2735 if (i915_request_wait(rq, 0, HZ / 5) < 0) 2736 err = -ETIME; 2737 i915_request_put(rq); 2738 if (err) 2739 break; 2740 2741 if (++idx == ps->nengines) 2742 idx = 0; 2743 } while (!__igt_timeout(end_time, NULL)); 2744 2745 return err; 2746 } 2747 2748 static int s_sync1(void *arg) 2749 { 2750 struct perf_series *ps = arg; 2751 struct i915_request *prev = NULL; 2752 IGT_TIMEOUT(end_time); 2753 unsigned int idx = 0; 2754 int err = 0; 2755 2756 GEM_BUG_ON(!ps->nengines); 2757 do { 2758 struct i915_request *rq; 2759 2760 rq = i915_request_create(ps->ce[idx]); 2761 if (IS_ERR(rq)) { 2762 err = PTR_ERR(rq); 2763 break; 2764 } 2765 2766 i915_request_get(rq); 2767 i915_request_add(rq); 2768 2769 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0) 2770 err = -ETIME; 2771 i915_request_put(prev); 2772 prev = rq; 2773 if (err) 2774 break; 2775 2776 if (++idx == ps->nengines) 2777 idx = 0; 2778 } while (!__igt_timeout(end_time, NULL)); 2779 i915_request_put(prev); 2780 2781 return err; 2782 } 2783 2784 static int s_many(void *arg) 2785 { 2786 struct perf_series *ps = arg; 2787 IGT_TIMEOUT(end_time); 2788 unsigned int idx = 0; 2789 2790 GEM_BUG_ON(!ps->nengines); 2791 do { 2792 struct i915_request *rq; 2793 2794 rq = i915_request_create(ps->ce[idx]); 2795 if (IS_ERR(rq)) 2796 return PTR_ERR(rq); 2797 2798 i915_request_add(rq); 2799 2800 if (++idx == ps->nengines) 2801 idx = 0; 2802 } while (!__igt_timeout(end_time, NULL)); 2803 2804 return 0; 2805 } 2806 2807 static int perf_series_engines(void *arg) 2808 { 2809 struct drm_i915_private *i915 = arg; 2810 static int (* const func[])(void *arg) = { 2811 s_sync0, 2812 s_sync1, 2813 s_many, 2814 NULL, 2815 }; 2816 const unsigned int nengines = num_uabi_engines(i915); 2817 struct intel_engine_cs *engine; 2818 int (* const *fn)(void *arg); 2819 struct pm_qos_request qos; 2820 struct perf_stats *stats; 2821 struct perf_series *ps; 2822 unsigned int idx; 2823 int err = 0; 2824 2825 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL); 2826 if (!stats) 2827 return -ENOMEM; 2828 2829 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL); 2830 if (!ps) { 2831 kfree(stats); 2832 return -ENOMEM; 2833 } 2834 2835 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2836 2837 ps->i915 = i915; 2838 ps->nengines = nengines; 2839 2840 idx = 0; 2841 for_each_uabi_engine(engine, i915) { 2842 struct intel_context *ce; 2843 2844 ce = intel_context_create(engine); 2845 if (IS_ERR(ce)) { 2846 err = PTR_ERR(ce); 2847 goto out; 2848 } 2849 2850 err = intel_context_pin(ce); 2851 if (err) { 2852 intel_context_put(ce); 2853 goto out; 2854 } 2855 2856 ps->ce[idx++] = ce; 2857 } 2858 GEM_BUG_ON(idx != ps->nengines); 2859 2860 for (fn = func; *fn && !err; fn++) { 2861 char name[KSYM_NAME_LEN]; 2862 struct igt_live_test t; 2863 2864 snprintf(name, sizeof(name), "%ps", *fn); 2865 err = igt_live_test_begin(&t, i915, __func__, name); 2866 if (err) 2867 break; 2868 2869 for (idx = 0; idx < nengines; idx++) { 2870 struct perf_stats *p = 2871 memset(&stats[idx], 0, 
sizeof(stats[idx])); 2872 struct intel_context *ce = ps->ce[idx]; 2873 2874 p->engine = ps->ce[idx]->engine; 2875 intel_engine_pm_get(p->engine); 2876 2877 if (intel_engine_supports_stats(p->engine)) 2878 p->busy = intel_engine_get_busy_time(p->engine, 2879 &p->time) + 1; 2880 else 2881 p->time = ktime_get(); 2882 p->runtime = -intel_context_get_total_runtime_ns(ce); 2883 } 2884 2885 err = (*fn)(ps); 2886 if (igt_live_test_end(&t)) 2887 err = -EIO; 2888 2889 for (idx = 0; idx < nengines; idx++) { 2890 struct perf_stats *p = &stats[idx]; 2891 struct intel_context *ce = ps->ce[idx]; 2892 int integer, decimal; 2893 u64 busy, dt, now; 2894 2895 if (p->busy) 2896 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine, 2897 &now), 2898 p->busy - 1); 2899 else 2900 now = ktime_get(); 2901 p->time = ktime_sub(now, p->time); 2902 2903 err = switch_to_kernel_sync(ce, err); 2904 p->runtime += intel_context_get_total_runtime_ns(ce); 2905 intel_engine_pm_put(p->engine); 2906 2907 busy = 100 * ktime_to_ns(p->busy); 2908 dt = ktime_to_ns(p->time); 2909 if (dt) { 2910 integer = div64_u64(busy, dt); 2911 busy -= integer * dt; 2912 decimal = div64_u64(100 * busy, dt); 2913 } else { 2914 integer = 0; 2915 decimal = 0; 2916 } 2917 2918 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 2919 name, p->engine->name, ce->timeline->seqno, 2920 integer, decimal, 2921 div_u64(p->runtime, 1000 * 1000), 2922 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 2923 } 2924 } 2925 2926 out: 2927 for (idx = 0; idx < nengines; idx++) { 2928 if (IS_ERR_OR_NULL(ps->ce[idx])) 2929 break; 2930 2931 intel_context_unpin(ps->ce[idx]); 2932 intel_context_put(ps->ce[idx]); 2933 } 2934 kfree(ps); 2935 2936 cpu_latency_qos_remove_request(&qos); 2937 kfree(stats); 2938 return err; 2939 } 2940 2941 struct p_thread { 2942 struct perf_stats p; 2943 struct kthread_worker *worker; 2944 struct kthread_work work; 2945 struct intel_engine_cs *engine; 2946 int result; 2947 }; 2948 2949 static void p_sync0(struct kthread_work *work) 2950 { 2951 struct p_thread *thread = container_of(work, typeof(*thread), work); 2952 struct perf_stats *p = &thread->p; 2953 struct intel_engine_cs *engine = p->engine; 2954 struct intel_context *ce; 2955 IGT_TIMEOUT(end_time); 2956 unsigned long count; 2957 bool busy; 2958 int err = 0; 2959 2960 ce = intel_context_create(engine); 2961 if (IS_ERR(ce)) { 2962 thread->result = PTR_ERR(ce); 2963 return; 2964 } 2965 2966 err = intel_context_pin(ce); 2967 if (err) { 2968 intel_context_put(ce); 2969 thread->result = err; 2970 return; 2971 } 2972 2973 if (intel_engine_supports_stats(engine)) { 2974 p->busy = intel_engine_get_busy_time(engine, &p->time); 2975 busy = true; 2976 } else { 2977 p->time = ktime_get(); 2978 busy = false; 2979 } 2980 2981 count = 0; 2982 do { 2983 struct i915_request *rq; 2984 2985 rq = i915_request_create(ce); 2986 if (IS_ERR(rq)) { 2987 err = PTR_ERR(rq); 2988 break; 2989 } 2990 2991 i915_request_get(rq); 2992 i915_request_add(rq); 2993 2994 err = 0; 2995 if (i915_request_wait(rq, 0, HZ) < 0) 2996 err = -ETIME; 2997 i915_request_put(rq); 2998 if (err) 2999 break; 3000 3001 count++; 3002 } while (!__igt_timeout(end_time, NULL)); 3003 3004 if (busy) { 3005 ktime_t now; 3006 3007 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3008 p->busy); 3009 p->time = ktime_sub(now, p->time); 3010 } else { 3011 p->time = ktime_sub(ktime_get(), p->time); 3012 } 3013 3014 err = switch_to_kernel_sync(ce, err); 3015 p->runtime = intel_context_get_total_runtime_ns(ce); 3016 
p->count = count; 3017 3018 intel_context_unpin(ce); 3019 intel_context_put(ce); 3020 thread->result = err; 3021 } 3022 3023 static void p_sync1(struct kthread_work *work) 3024 { 3025 struct p_thread *thread = container_of(work, typeof(*thread), work); 3026 struct perf_stats *p = &thread->p; 3027 struct intel_engine_cs *engine = p->engine; 3028 struct i915_request *prev = NULL; 3029 struct intel_context *ce; 3030 IGT_TIMEOUT(end_time); 3031 unsigned long count; 3032 bool busy; 3033 int err = 0; 3034 3035 ce = intel_context_create(engine); 3036 if (IS_ERR(ce)) { 3037 thread->result = PTR_ERR(ce); 3038 return; 3039 } 3040 3041 err = intel_context_pin(ce); 3042 if (err) { 3043 intel_context_put(ce); 3044 thread->result = err; 3045 return; 3046 } 3047 3048 if (intel_engine_supports_stats(engine)) { 3049 p->busy = intel_engine_get_busy_time(engine, &p->time); 3050 busy = true; 3051 } else { 3052 p->time = ktime_get(); 3053 busy = false; 3054 } 3055 3056 count = 0; 3057 do { 3058 struct i915_request *rq; 3059 3060 rq = i915_request_create(ce); 3061 if (IS_ERR(rq)) { 3062 err = PTR_ERR(rq); 3063 break; 3064 } 3065 3066 i915_request_get(rq); 3067 i915_request_add(rq); 3068 3069 err = 0; 3070 if (prev && i915_request_wait(prev, 0, HZ) < 0) 3071 err = -ETIME; 3072 i915_request_put(prev); 3073 prev = rq; 3074 if (err) 3075 break; 3076 3077 count++; 3078 } while (!__igt_timeout(end_time, NULL)); 3079 i915_request_put(prev); 3080 3081 if (busy) { 3082 ktime_t now; 3083 3084 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3085 p->busy); 3086 p->time = ktime_sub(now, p->time); 3087 } else { 3088 p->time = ktime_sub(ktime_get(), p->time); 3089 } 3090 3091 err = switch_to_kernel_sync(ce, err); 3092 p->runtime = intel_context_get_total_runtime_ns(ce); 3093 p->count = count; 3094 3095 intel_context_unpin(ce); 3096 intel_context_put(ce); 3097 thread->result = err; 3098 } 3099 3100 static void p_many(struct kthread_work *work) 3101 { 3102 struct p_thread *thread = container_of(work, typeof(*thread), work); 3103 struct perf_stats *p = &thread->p; 3104 struct intel_engine_cs *engine = p->engine; 3105 struct intel_context *ce; 3106 IGT_TIMEOUT(end_time); 3107 unsigned long count; 3108 int err = 0; 3109 bool busy; 3110 3111 ce = intel_context_create(engine); 3112 if (IS_ERR(ce)) { 3113 thread->result = PTR_ERR(ce); 3114 return; 3115 } 3116 3117 err = intel_context_pin(ce); 3118 if (err) { 3119 intel_context_put(ce); 3120 thread->result = err; 3121 return; 3122 } 3123 3124 if (intel_engine_supports_stats(engine)) { 3125 p->busy = intel_engine_get_busy_time(engine, &p->time); 3126 busy = true; 3127 } else { 3128 p->time = ktime_get(); 3129 busy = false; 3130 } 3131 3132 count = 0; 3133 do { 3134 struct i915_request *rq; 3135 3136 rq = i915_request_create(ce); 3137 if (IS_ERR(rq)) { 3138 err = PTR_ERR(rq); 3139 break; 3140 } 3141 3142 i915_request_add(rq); 3143 count++; 3144 } while (!__igt_timeout(end_time, NULL)); 3145 3146 if (busy) { 3147 ktime_t now; 3148 3149 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3150 p->busy); 3151 p->time = ktime_sub(now, p->time); 3152 } else { 3153 p->time = ktime_sub(ktime_get(), p->time); 3154 } 3155 3156 err = switch_to_kernel_sync(ce, err); 3157 p->runtime = intel_context_get_total_runtime_ns(ce); 3158 p->count = count; 3159 3160 intel_context_unpin(ce); 3161 intel_context_put(ce); 3162 thread->result = err; 3163 } 3164 3165 static int perf_parallel_engines(void *arg) 3166 { 3167 struct drm_i915_private *i915 = arg; 3168 static void (* const 
func[])(struct kthread_work *) = { 3169 p_sync0, 3170 p_sync1, 3171 p_many, 3172 NULL, 3173 }; 3174 const unsigned int nengines = num_uabi_engines(i915); 3175 void (* const *fn)(struct kthread_work *); 3176 struct intel_engine_cs *engine; 3177 struct pm_qos_request qos; 3178 struct p_thread *engines; 3179 int err = 0; 3180 3181 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL); 3182 if (!engines) 3183 return -ENOMEM; 3184 3185 cpu_latency_qos_add_request(&qos, 0); 3186 3187 for (fn = func; *fn; fn++) { 3188 char name[KSYM_NAME_LEN]; 3189 struct igt_live_test t; 3190 unsigned int idx; 3191 3192 snprintf(name, sizeof(name), "%ps", *fn); 3193 err = igt_live_test_begin(&t, i915, __func__, name); 3194 if (err) 3195 break; 3196 3197 atomic_set(&i915->selftest.counter, nengines); 3198 3199 idx = 0; 3200 for_each_uabi_engine(engine, i915) { 3201 struct kthread_worker *worker; 3202 3203 intel_engine_pm_get(engine); 3204 3205 memset(&engines[idx].p, 0, sizeof(engines[idx].p)); 3206 3207 worker = kthread_create_worker(0, "igt:%s", 3208 engine->name); 3209 if (IS_ERR(worker)) { 3210 err = PTR_ERR(worker); 3211 intel_engine_pm_put(engine); 3212 break; 3213 } 3214 engines[idx].worker = worker; 3215 engines[idx].result = 0; 3216 engines[idx].p.engine = engine; 3217 engines[idx].engine = engine; 3218 3219 kthread_init_work(&engines[idx].work, *fn); 3220 kthread_queue_work(worker, &engines[idx].work); 3221 idx++; 3222 } 3223 3224 idx = 0; 3225 for_each_uabi_engine(engine, i915) { 3226 int status; 3227 3228 if (!engines[idx].worker) 3229 break; 3230 3231 kthread_flush_work(&engines[idx].work); 3232 status = READ_ONCE(engines[idx].result); 3233 if (status && !err) 3234 err = status; 3235 3236 intel_engine_pm_put(engine); 3237 3238 kthread_destroy_worker(engines[idx].worker); 3239 idx++; 3240 } 3241 3242 if (igt_live_test_end(&t)) 3243 err = -EIO; 3244 if (err) 3245 break; 3246 3247 idx = 0; 3248 for_each_uabi_engine(engine, i915) { 3249 struct perf_stats *p = &engines[idx].p; 3250 u64 busy = 100 * ktime_to_ns(p->busy); 3251 u64 dt = ktime_to_ns(p->time); 3252 int integer, decimal; 3253 3254 if (dt) { 3255 integer = div64_u64(busy, dt); 3256 busy -= integer * dt; 3257 decimal = div64_u64(100 * busy, dt); 3258 } else { 3259 integer = 0; 3260 decimal = 0; 3261 } 3262 3263 GEM_BUG_ON(engine != p->engine); 3264 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 3265 name, engine->name, p->count, integer, decimal, 3266 div_u64(p->runtime, 1000 * 1000), 3267 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 3268 idx++; 3269 } 3270 } 3271 3272 cpu_latency_qos_remove_request(&qos); 3273 kfree(engines); 3274 return err; 3275 } 3276 3277 int i915_request_perf_selftests(struct drm_i915_private *i915) 3278 { 3279 static const struct i915_subtest tests[] = { 3280 SUBTEST(perf_request_latency), 3281 SUBTEST(perf_series_engines), 3282 SUBTEST(perf_parallel_engines), 3283 }; 3284 3285 if (intel_gt_is_wedged(to_gt(i915))) 3286 return 0; 3287 3288 return i915_subtests(tests, i915); 3289 } 3290
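
/*
 * Note on the busy% arithmetic used in the reports above: the busy and wall
 * times are scaled to give two decimal places without floating point. As an
 * illustrative example (made-up numbers, not from any real run), 123ms busy
 * over a 400ms wall clock gives busy = 100 * 123000000,
 * integer = busy / 400000000 = 30, remainder = 300000000 and
 * decimal = 100 * remainder / 400000000 = 75, printed as "30.75%".
 */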