1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_pm.h" 30 #include "gem/selftests/mock_context.h" 31 32 #include "gt/intel_engine_heartbeat.h" 33 #include "gt/intel_engine_pm.h" 34 #include "gt/intel_engine_user.h" 35 #include "gt/intel_gt.h" 36 #include "gt/intel_gt_clock_utils.h" 37 #include "gt/intel_gt_requests.h" 38 #include "gt/selftest_engine_heartbeat.h" 39 40 #include "i915_random.h" 41 #include "i915_selftest.h" 42 #include "igt_flush_test.h" 43 #include "igt_live_test.h" 44 #include "igt_spinner.h" 45 #include "lib_sw_fence.h" 46 47 #include "mock_drm.h" 48 #include "mock_gem_device.h" 49 50 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 51 { 52 struct intel_engine_cs *engine; 53 unsigned int count; 54 55 count = 0; 56 for_each_uabi_engine(engine, i915) 57 count++; 58 59 return count; 60 } 61 62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 63 { 64 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 65 } 66 67 static int igt_add_request(void *arg) 68 { 69 struct drm_i915_private *i915 = arg; 70 struct i915_request *request; 71 72 /* Basic preliminary test to create a request and let it loose! 
*/ 73 74 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 75 if (!request) 76 return -ENOMEM; 77 78 i915_request_add(request); 79 80 return 0; 81 } 82 83 static int igt_wait_request(void *arg) 84 { 85 const long T = HZ / 4; 86 struct drm_i915_private *i915 = arg; 87 struct i915_request *request; 88 int err = -EINVAL; 89 90 /* Submit a request, then wait upon it */ 91 92 request = mock_request(rcs0(i915)->kernel_context, T); 93 if (!request) 94 return -ENOMEM; 95 96 i915_request_get(request); 97 98 if (i915_request_wait(request, 0, 0) != -ETIME) { 99 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 100 goto out_request; 101 } 102 103 if (i915_request_wait(request, 0, T) != -ETIME) { 104 pr_err("request wait succeeded (expected timeout before submit!)\n"); 105 goto out_request; 106 } 107 108 if (i915_request_completed(request)) { 109 pr_err("request completed before submit!!\n"); 110 goto out_request; 111 } 112 113 i915_request_add(request); 114 115 if (i915_request_wait(request, 0, 0) != -ETIME) { 116 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 117 goto out_request; 118 } 119 120 if (i915_request_completed(request)) { 121 pr_err("request completed immediately!\n"); 122 goto out_request; 123 } 124 125 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 126 pr_err("request wait succeeded (expected timeout!)\n"); 127 goto out_request; 128 } 129 130 if (i915_request_wait(request, 0, T) == -ETIME) { 131 pr_err("request wait timed out!\n"); 132 goto out_request; 133 } 134 135 if (!i915_request_completed(request)) { 136 pr_err("request not complete after waiting!\n"); 137 goto out_request; 138 } 139 140 if (i915_request_wait(request, 0, T) == -ETIME) { 141 pr_err("request wait timed out when already complete!\n"); 142 goto out_request; 143 } 144 145 err = 0; 146 out_request: 147 i915_request_put(request); 148 mock_device_flush(i915); 149 return err; 150 } 151 152 static int igt_fence_wait(void *arg) 153 { 154 const long T = HZ / 4; 155 struct drm_i915_private *i915 = arg; 156 struct i915_request *request; 157 int err = -EINVAL; 158 159 /* Submit a request, treat it as a fence and wait upon it */ 160 161 request = mock_request(rcs0(i915)->kernel_context, T); 162 if (!request) 163 return -ENOMEM; 164 165 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 166 pr_err("fence wait success before submit (expected timeout)!\n"); 167 goto out; 168 } 169 170 i915_request_add(request); 171 172 if (dma_fence_is_signaled(&request->fence)) { 173 pr_err("fence signaled immediately!\n"); 174 goto out; 175 } 176 177 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 178 pr_err("fence wait success after submit (expected timeout)!\n"); 179 goto out; 180 } 181 182 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 183 pr_err("fence wait timed out (expected success)!\n"); 184 goto out; 185 } 186 187 if (!dma_fence_is_signaled(&request->fence)) { 188 pr_err("fence unsignaled after waiting!\n"); 189 goto out; 190 } 191 192 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 193 pr_err("fence wait timed out when complete (expected success)!\n"); 194 goto out; 195 } 196 197 err = 0; 198 out: 199 mock_device_flush(i915); 200 return err; 201 } 202 203 static int igt_request_rewind(void *arg) 204 { 205 struct drm_i915_private *i915 = arg; 206 struct i915_request *request, *vip; 207 struct i915_gem_context *ctx[2]; 208 struct intel_context *ce; 209 int err = -EINVAL; 210 211 ctx[0] = 
mock_context(i915, "A"); 212 213 ce = i915_gem_context_get_engine(ctx[0], RCS0); 214 GEM_BUG_ON(IS_ERR(ce)); 215 request = mock_request(ce, 2 * HZ); 216 intel_context_put(ce); 217 if (!request) { 218 err = -ENOMEM; 219 goto err_context_0; 220 } 221 222 i915_request_get(request); 223 i915_request_add(request); 224 225 ctx[1] = mock_context(i915, "B"); 226 227 ce = i915_gem_context_get_engine(ctx[1], RCS0); 228 GEM_BUG_ON(IS_ERR(ce)); 229 vip = mock_request(ce, 0); 230 intel_context_put(ce); 231 if (!vip) { 232 err = -ENOMEM; 233 goto err_context_1; 234 } 235 236 /* Simulate preemption by manual reordering */ 237 if (!mock_cancel_request(request)) { 238 pr_err("failed to cancel request (already executed)!\n"); 239 i915_request_add(vip); 240 goto err_context_1; 241 } 242 i915_request_get(vip); 243 i915_request_add(vip); 244 rcu_read_lock(); 245 request->engine->submit_request(request); 246 rcu_read_unlock(); 247 248 249 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 250 pr_err("timed out waiting for high priority request\n"); 251 goto err; 252 } 253 254 if (i915_request_completed(request)) { 255 pr_err("low priority request already completed\n"); 256 goto err; 257 } 258 259 err = 0; 260 err: 261 i915_request_put(vip); 262 err_context_1: 263 mock_context_close(ctx[1]); 264 i915_request_put(request); 265 err_context_0: 266 mock_context_close(ctx[0]); 267 mock_device_flush(i915); 268 return err; 269 } 270 271 struct smoketest { 272 struct intel_engine_cs *engine; 273 struct i915_gem_context **contexts; 274 atomic_long_t num_waits, num_fences; 275 int ncontexts, max_batch; 276 struct i915_request *(*request_alloc)(struct intel_context *ce); 277 }; 278 279 static struct i915_request * 280 __mock_request_alloc(struct intel_context *ce) 281 { 282 return mock_request(ce, 0); 283 } 284 285 static struct i915_request * 286 __live_request_alloc(struct intel_context *ce) 287 { 288 return intel_context_create_request(ce); 289 } 290 291 static int __igt_breadcrumbs_smoketest(void *arg) 292 { 293 struct smoketest *t = arg; 294 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 295 const unsigned int total = 4 * t->ncontexts + 1; 296 unsigned int num_waits = 0, num_fences = 0; 297 struct i915_request **requests; 298 I915_RND_STATE(prng); 299 unsigned int *order; 300 int err = 0; 301 302 /* 303 * A very simple test to catch the most egregious of list handling bugs. 304 * 305 * At its heart, we simply create oodles of requests running across 306 * multiple kthreads and enable signaling on them, for the sole purpose 307 * of stressing our breadcrumb handling. The only inspection we do is 308 * that the fences were marked as signaled. 
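	 * Each batch of requests is gated on a single 'submit' fence so that
	 * they are all queued before any of them can execute, and their
	 * completions are gathered into one composite 'wait' fence that is
	 * expected to signal within 5s.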
309 */ 310 311 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 312 if (!requests) 313 return -ENOMEM; 314 315 order = i915_random_order(total, &prng); 316 if (!order) { 317 err = -ENOMEM; 318 goto out_requests; 319 } 320 321 while (!kthread_should_stop()) { 322 struct i915_sw_fence *submit, *wait; 323 unsigned int n, count; 324 325 submit = heap_fence_create(GFP_KERNEL); 326 if (!submit) { 327 err = -ENOMEM; 328 break; 329 } 330 331 wait = heap_fence_create(GFP_KERNEL); 332 if (!wait) { 333 i915_sw_fence_commit(submit); 334 heap_fence_put(submit); 335 err = -ENOMEM; 336 break; 337 } 338 339 i915_random_reorder(order, total, &prng); 340 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 341 342 for (n = 0; n < count; n++) { 343 struct i915_gem_context *ctx = 344 t->contexts[order[n] % t->ncontexts]; 345 struct i915_request *rq; 346 struct intel_context *ce; 347 348 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 349 GEM_BUG_ON(IS_ERR(ce)); 350 rq = t->request_alloc(ce); 351 intel_context_put(ce); 352 if (IS_ERR(rq)) { 353 err = PTR_ERR(rq); 354 count = n; 355 break; 356 } 357 358 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 359 submit, 360 GFP_KERNEL); 361 362 requests[n] = i915_request_get(rq); 363 i915_request_add(rq); 364 365 if (err >= 0) 366 err = i915_sw_fence_await_dma_fence(wait, 367 &rq->fence, 368 0, 369 GFP_KERNEL); 370 371 if (err < 0) { 372 i915_request_put(rq); 373 count = n; 374 break; 375 } 376 } 377 378 i915_sw_fence_commit(submit); 379 i915_sw_fence_commit(wait); 380 381 if (!wait_event_timeout(wait->wait, 382 i915_sw_fence_done(wait), 383 5 * HZ)) { 384 struct i915_request *rq = requests[count - 1]; 385 386 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 387 atomic_read(&wait->pending), count, 388 rq->fence.context, rq->fence.seqno, 389 t->engine->name); 390 GEM_TRACE_DUMP(); 391 392 intel_gt_set_wedged(t->engine->gt); 393 GEM_BUG_ON(!i915_request_completed(rq)); 394 i915_sw_fence_wait(wait); 395 err = -EIO; 396 } 397 398 for (n = 0; n < count; n++) { 399 struct i915_request *rq = requests[n]; 400 401 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 402 &rq->fence.flags)) { 403 pr_err("%llu:%llu was not signaled!\n", 404 rq->fence.context, rq->fence.seqno); 405 err = -EINVAL; 406 } 407 408 i915_request_put(rq); 409 } 410 411 heap_fence_put(wait); 412 heap_fence_put(submit); 413 414 if (err < 0) 415 break; 416 417 num_fences += count; 418 num_waits++; 419 420 cond_resched(); 421 } 422 423 atomic_long_add(num_fences, &t->num_fences); 424 atomic_long_add(num_waits, &t->num_waits); 425 426 kfree(order); 427 out_requests: 428 kfree(requests); 429 return err; 430 } 431 432 static int mock_breadcrumbs_smoketest(void *arg) 433 { 434 struct drm_i915_private *i915 = arg; 435 struct smoketest t = { 436 .engine = rcs0(i915), 437 .ncontexts = 1024, 438 .max_batch = 1024, 439 .request_alloc = __mock_request_alloc 440 }; 441 unsigned int ncpus = num_online_cpus(); 442 struct task_struct **threads; 443 unsigned int n; 444 int ret = 0; 445 446 /* 447 * Smoketest our breadcrumb/signal handling for requests across multiple 448 * threads. A very simple test to only catch the most egregious of bugs. 
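	 * One kthread per online CPU hammers a shared pool of 1024 mock
	 * contexts on rcs0 until the selftest timeout expires.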
449 * See __igt_breadcrumbs_smoketest(); 450 */ 451 452 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 453 if (!threads) 454 return -ENOMEM; 455 456 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 457 if (!t.contexts) { 458 ret = -ENOMEM; 459 goto out_threads; 460 } 461 462 for (n = 0; n < t.ncontexts; n++) { 463 t.contexts[n] = mock_context(t.engine->i915, "mock"); 464 if (!t.contexts[n]) { 465 ret = -ENOMEM; 466 goto out_contexts; 467 } 468 } 469 470 for (n = 0; n < ncpus; n++) { 471 threads[n] = kthread_run(__igt_breadcrumbs_smoketest, 472 &t, "igt/%d", n); 473 if (IS_ERR(threads[n])) { 474 ret = PTR_ERR(threads[n]); 475 ncpus = n; 476 break; 477 } 478 479 get_task_struct(threads[n]); 480 } 481 482 yield(); /* start all threads before we begin */ 483 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 484 485 for (n = 0; n < ncpus; n++) { 486 int err; 487 488 err = kthread_stop(threads[n]); 489 if (err < 0 && !ret) 490 ret = err; 491 492 put_task_struct(threads[n]); 493 } 494 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 495 atomic_long_read(&t.num_waits), 496 atomic_long_read(&t.num_fences), 497 ncpus); 498 499 out_contexts: 500 for (n = 0; n < t.ncontexts; n++) { 501 if (!t.contexts[n]) 502 break; 503 mock_context_close(t.contexts[n]); 504 } 505 kfree(t.contexts); 506 out_threads: 507 kfree(threads); 508 return ret; 509 } 510 511 int i915_request_mock_selftests(void) 512 { 513 static const struct i915_subtest tests[] = { 514 SUBTEST(igt_add_request), 515 SUBTEST(igt_wait_request), 516 SUBTEST(igt_fence_wait), 517 SUBTEST(igt_request_rewind), 518 SUBTEST(mock_breadcrumbs_smoketest), 519 }; 520 struct drm_i915_private *i915; 521 intel_wakeref_t wakeref; 522 int err = 0; 523 524 i915 = mock_gem_device(); 525 if (!i915) 526 return -ENOMEM; 527 528 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 529 err = i915_subtests(tests, i915); 530 531 mock_destroy_device(i915); 532 533 return err; 534 } 535 536 static int live_nop_request(void *arg) 537 { 538 struct drm_i915_private *i915 = arg; 539 struct intel_engine_cs *engine; 540 struct igt_live_test t; 541 int err = -ENODEV; 542 543 /* 544 * Submit various sized batches of empty requests, to each engine 545 * (individually), and wait for the batch to complete. We can check 546 * the overhead of submitting requests to the hardware. 547 */ 548 549 for_each_uabi_engine(engine, i915) { 550 unsigned long n, prime; 551 IGT_TIMEOUT(end_time); 552 ktime_t times[2] = {}; 553 554 err = igt_live_test_begin(&t, i915, __func__, engine->name); 555 if (err) 556 return err; 557 558 intel_engine_pm_get(engine); 559 for_each_prime_number_from(prime, 1, 8192) { 560 struct i915_request *request = NULL; 561 562 times[1] = ktime_get_raw(); 563 564 for (n = 0; n < prime; n++) { 565 i915_request_put(request); 566 request = i915_request_create(engine->kernel_context); 567 if (IS_ERR(request)) 568 return PTR_ERR(request); 569 570 /* 571 * This space is left intentionally blank. 572 * 573 * We do not actually want to perform any 574 * action with this request, we just want 575 * to measure the latency in allocation 576 * and submission of our breadcrumbs - 577 * ensuring that the bare request is sufficient 578 * for the system to work (i.e. proper HEAD 579 * tracking of the rings, interrupt handling, 580 * etc). It also gives us the lowest bounds 581 * for latency. 
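				 * Each pass therefore times 'prime'
				 * back-to-back submissions to the engine's
				 * kernel context, waiting only on the final
				 * request of the batch.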
				 */

				i915_request_get(request);
				i915_request_add(request);
			}
			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
			i915_request_put(request);

			times[1] = ktime_sub(ktime_get_raw(), times[1]);
			if (prime == 1)
				times[0] = times[1];

			if (__igt_timeout(end_time, NULL))
				break;
		}
		intel_engine_pm_put(engine);

		err = igt_live_test_end(&t);
		if (err)
			return err;

		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
			engine->name,
			ktime_to_ns(times[0]),
			prime, div64_u64(ktime_to_ns(times[1]), prime));
	}

	return err;
}

static struct i915_vma *empty_batch(struct drm_i915_private *i915)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 *cmd;
	int err;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto err;
	}

	*cmd = MI_BATCH_BUFFER_END;

	__i915_gem_object_flush_map(obj, 0, 64);
	i915_gem_object_unpin_map(obj);

	intel_gt_chipset_flush(&i915->gt);

	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
	if (err)
		goto err;

	/* Force the wait now to avoid including it in the benchmark */
	err = i915_vma_sync(vma);
	if (err)
		goto err_pin;

	return vma;

err_pin:
	i915_vma_unpin(vma);
err:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}

static struct i915_request *
empty_request(struct intel_engine_cs *engine,
	      struct i915_vma *batch)
{
	struct i915_request *request;
	int err;

	request = i915_request_create(engine->kernel_context);
	if (IS_ERR(request))
		return request;

	err = engine->emit_bb_start(request,
				    batch->node.start,
				    batch->node.size,
				    I915_DISPATCH_SECURE);
	if (err)
		goto out_request;

	i915_request_get(request);
out_request:
	i915_request_add(request);
	return err ? ERR_PTR(err) : request;
}

static int live_empty_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	struct i915_vma *batch;
	int err = 0;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
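	 * Unlike live_nop_request, each request here also dispatches a real
	 * (MI_BATCH_BUFFER_END only) batch buffer via emit_bb_start, so the
	 * cost of batch dispatch is included in the measurement.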
696 */ 697 698 batch = empty_batch(i915); 699 if (IS_ERR(batch)) 700 return PTR_ERR(batch); 701 702 for_each_uabi_engine(engine, i915) { 703 IGT_TIMEOUT(end_time); 704 struct i915_request *request; 705 unsigned long n, prime; 706 ktime_t times[2] = {}; 707 708 err = igt_live_test_begin(&t, i915, __func__, engine->name); 709 if (err) 710 goto out_batch; 711 712 intel_engine_pm_get(engine); 713 714 /* Warmup / preload */ 715 request = empty_request(engine, batch); 716 if (IS_ERR(request)) { 717 err = PTR_ERR(request); 718 intel_engine_pm_put(engine); 719 goto out_batch; 720 } 721 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 722 723 for_each_prime_number_from(prime, 1, 8192) { 724 times[1] = ktime_get_raw(); 725 726 for (n = 0; n < prime; n++) { 727 i915_request_put(request); 728 request = empty_request(engine, batch); 729 if (IS_ERR(request)) { 730 err = PTR_ERR(request); 731 intel_engine_pm_put(engine); 732 goto out_batch; 733 } 734 } 735 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 736 737 times[1] = ktime_sub(ktime_get_raw(), times[1]); 738 if (prime == 1) 739 times[0] = times[1]; 740 741 if (__igt_timeout(end_time, NULL)) 742 break; 743 } 744 i915_request_put(request); 745 intel_engine_pm_put(engine); 746 747 err = igt_live_test_end(&t); 748 if (err) 749 goto out_batch; 750 751 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 752 engine->name, 753 ktime_to_ns(times[0]), 754 prime, div64_u64(ktime_to_ns(times[1]), prime)); 755 } 756 757 out_batch: 758 i915_vma_unpin(batch); 759 i915_vma_put(batch); 760 return err; 761 } 762 763 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 764 { 765 struct drm_i915_gem_object *obj; 766 const int gen = INTEL_GEN(i915); 767 struct i915_vma *vma; 768 u32 *cmd; 769 int err; 770 771 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 772 if (IS_ERR(obj)) 773 return ERR_CAST(obj); 774 775 vma = i915_vma_instance(obj, i915->gt.vm, NULL); 776 if (IS_ERR(vma)) { 777 err = PTR_ERR(vma); 778 goto err; 779 } 780 781 err = i915_vma_pin(vma, 0, 0, PIN_USER); 782 if (err) 783 goto err; 784 785 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC); 786 if (IS_ERR(cmd)) { 787 err = PTR_ERR(cmd); 788 goto err; 789 } 790 791 if (gen >= 8) { 792 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 793 *cmd++ = lower_32_bits(vma->node.start); 794 *cmd++ = upper_32_bits(vma->node.start); 795 } else if (gen >= 6) { 796 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 797 *cmd++ = lower_32_bits(vma->node.start); 798 } else { 799 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 800 *cmd++ = lower_32_bits(vma->node.start); 801 } 802 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 803 804 __i915_gem_object_flush_map(obj, 0, 64); 805 i915_gem_object_unpin_map(obj); 806 807 intel_gt_chipset_flush(&i915->gt); 808 809 return vma; 810 811 err: 812 i915_gem_object_put(obj); 813 return ERR_PTR(err); 814 } 815 816 static int recursive_batch_resolve(struct i915_vma *batch) 817 { 818 u32 *cmd; 819 820 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); 821 if (IS_ERR(cmd)) 822 return PTR_ERR(cmd); 823 824 *cmd = MI_BATCH_BUFFER_END; 825 826 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 827 i915_gem_object_unpin_map(batch->obj); 828 829 intel_gt_chipset_flush(batch->vm->gt); 830 831 return 0; 832 } 833 834 static int live_all_engines(void *arg) 835 { 836 struct drm_i915_private *i915 = arg; 837 const unsigned int nengines = num_uabi_engines(i915); 838 struct intel_engine_cs *engine; 839 struct i915_request **request; 840 
struct igt_live_test t; 841 struct i915_vma *batch; 842 unsigned int idx; 843 int err; 844 845 /* 846 * Check we can submit requests to all engines simultaneously. We 847 * send a recursive batch to each engine - checking that we don't 848 * block doing so, and that they don't complete too soon. 849 */ 850 851 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 852 if (!request) 853 return -ENOMEM; 854 855 err = igt_live_test_begin(&t, i915, __func__, ""); 856 if (err) 857 goto out_free; 858 859 batch = recursive_batch(i915); 860 if (IS_ERR(batch)) { 861 err = PTR_ERR(batch); 862 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 863 goto out_free; 864 } 865 866 i915_vma_lock(batch); 867 868 idx = 0; 869 for_each_uabi_engine(engine, i915) { 870 request[idx] = intel_engine_create_kernel_request(engine); 871 if (IS_ERR(request[idx])) { 872 err = PTR_ERR(request[idx]); 873 pr_err("%s: Request allocation failed with err=%d\n", 874 __func__, err); 875 goto out_request; 876 } 877 878 err = i915_request_await_object(request[idx], batch->obj, 0); 879 if (err == 0) 880 err = i915_vma_move_to_active(batch, request[idx], 0); 881 GEM_BUG_ON(err); 882 883 err = engine->emit_bb_start(request[idx], 884 batch->node.start, 885 batch->node.size, 886 0); 887 GEM_BUG_ON(err); 888 request[idx]->batch = batch; 889 890 i915_request_get(request[idx]); 891 i915_request_add(request[idx]); 892 idx++; 893 } 894 895 i915_vma_unlock(batch); 896 897 idx = 0; 898 for_each_uabi_engine(engine, i915) { 899 if (i915_request_completed(request[idx])) { 900 pr_err("%s(%s): request completed too early!\n", 901 __func__, engine->name); 902 err = -EINVAL; 903 goto out_request; 904 } 905 idx++; 906 } 907 908 err = recursive_batch_resolve(batch); 909 if (err) { 910 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 911 goto out_request; 912 } 913 914 idx = 0; 915 for_each_uabi_engine(engine, i915) { 916 long timeout; 917 918 timeout = i915_request_wait(request[idx], 0, 919 MAX_SCHEDULE_TIMEOUT); 920 if (timeout < 0) { 921 err = timeout; 922 pr_err("%s: error waiting for request on %s, err=%d\n", 923 __func__, engine->name, err); 924 goto out_request; 925 } 926 927 GEM_BUG_ON(!i915_request_completed(request[idx])); 928 i915_request_put(request[idx]); 929 request[idx] = NULL; 930 idx++; 931 } 932 933 err = igt_live_test_end(&t); 934 935 out_request: 936 idx = 0; 937 for_each_uabi_engine(engine, i915) { 938 if (request[idx]) 939 i915_request_put(request[idx]); 940 idx++; 941 } 942 i915_vma_unpin(batch); 943 i915_vma_put(batch); 944 out_free: 945 kfree(request); 946 return err; 947 } 948 949 static int live_sequential_engines(void *arg) 950 { 951 struct drm_i915_private *i915 = arg; 952 const unsigned int nengines = num_uabi_engines(i915); 953 struct i915_request **request; 954 struct i915_request *prev = NULL; 955 struct intel_engine_cs *engine; 956 struct igt_live_test t; 957 unsigned int idx; 958 int err; 959 960 /* 961 * Check we can submit requests to all engines sequentially, such 962 * that each successive request waits for the earlier ones. This 963 * tests that we don't execute requests out of order, even though 964 * they are running on independent engines. 
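	 * Each engine's request awaits the fence of the previous engine's
	 * request, so the self-spinning batches form a single chain that is
	 * only resolved after every request has been checked as still busy.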
965 */ 966 967 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 968 if (!request) 969 return -ENOMEM; 970 971 err = igt_live_test_begin(&t, i915, __func__, ""); 972 if (err) 973 goto out_free; 974 975 idx = 0; 976 for_each_uabi_engine(engine, i915) { 977 struct i915_vma *batch; 978 979 batch = recursive_batch(i915); 980 if (IS_ERR(batch)) { 981 err = PTR_ERR(batch); 982 pr_err("%s: Unable to create batch for %s, err=%d\n", 983 __func__, engine->name, err); 984 goto out_free; 985 } 986 987 i915_vma_lock(batch); 988 request[idx] = intel_engine_create_kernel_request(engine); 989 if (IS_ERR(request[idx])) { 990 err = PTR_ERR(request[idx]); 991 pr_err("%s: Request allocation failed for %s with err=%d\n", 992 __func__, engine->name, err); 993 goto out_unlock; 994 } 995 996 if (prev) { 997 err = i915_request_await_dma_fence(request[idx], 998 &prev->fence); 999 if (err) { 1000 i915_request_add(request[idx]); 1001 pr_err("%s: Request await failed for %s with err=%d\n", 1002 __func__, engine->name, err); 1003 goto out_unlock; 1004 } 1005 } 1006 1007 err = i915_request_await_object(request[idx], 1008 batch->obj, false); 1009 if (err == 0) 1010 err = i915_vma_move_to_active(batch, request[idx], 0); 1011 GEM_BUG_ON(err); 1012 1013 err = engine->emit_bb_start(request[idx], 1014 batch->node.start, 1015 batch->node.size, 1016 0); 1017 GEM_BUG_ON(err); 1018 request[idx]->batch = batch; 1019 1020 i915_request_get(request[idx]); 1021 i915_request_add(request[idx]); 1022 1023 prev = request[idx]; 1024 idx++; 1025 1026 out_unlock: 1027 i915_vma_unlock(batch); 1028 if (err) 1029 goto out_request; 1030 } 1031 1032 idx = 0; 1033 for_each_uabi_engine(engine, i915) { 1034 long timeout; 1035 1036 if (i915_request_completed(request[idx])) { 1037 pr_err("%s(%s): request completed too early!\n", 1038 __func__, engine->name); 1039 err = -EINVAL; 1040 goto out_request; 1041 } 1042 1043 err = recursive_batch_resolve(request[idx]->batch); 1044 if (err) { 1045 pr_err("%s: failed to resolve batch, err=%d\n", 1046 __func__, err); 1047 goto out_request; 1048 } 1049 1050 timeout = i915_request_wait(request[idx], 0, 1051 MAX_SCHEDULE_TIMEOUT); 1052 if (timeout < 0) { 1053 err = timeout; 1054 pr_err("%s: error waiting for request on %s, err=%d\n", 1055 __func__, engine->name, err); 1056 goto out_request; 1057 } 1058 1059 GEM_BUG_ON(!i915_request_completed(request[idx])); 1060 idx++; 1061 } 1062 1063 err = igt_live_test_end(&t); 1064 1065 out_request: 1066 idx = 0; 1067 for_each_uabi_engine(engine, i915) { 1068 u32 *cmd; 1069 1070 if (!request[idx]) 1071 break; 1072 1073 cmd = i915_gem_object_pin_map(request[idx]->batch->obj, 1074 I915_MAP_WC); 1075 if (!IS_ERR(cmd)) { 1076 *cmd = MI_BATCH_BUFFER_END; 1077 1078 __i915_gem_object_flush_map(request[idx]->batch->obj, 1079 0, sizeof(*cmd)); 1080 i915_gem_object_unpin_map(request[idx]->batch->obj); 1081 1082 intel_gt_chipset_flush(engine->gt); 1083 } 1084 1085 i915_vma_put(request[idx]->batch); 1086 i915_request_put(request[idx]); 1087 idx++; 1088 } 1089 out_free: 1090 kfree(request); 1091 return err; 1092 } 1093 1094 static int __live_parallel_engine1(void *arg) 1095 { 1096 struct intel_engine_cs *engine = arg; 1097 IGT_TIMEOUT(end_time); 1098 unsigned long count; 1099 int err = 0; 1100 1101 count = 0; 1102 intel_engine_pm_get(engine); 1103 do { 1104 struct i915_request *rq; 1105 1106 rq = i915_request_create(engine->kernel_context); 1107 if (IS_ERR(rq)) { 1108 err = PTR_ERR(rq); 1109 break; 1110 } 1111 1112 i915_request_get(rq); 1113 i915_request_add(rq); 1114 1115 err = 0; 
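		/*
		 * Wait for each request in turn so that every iteration
		 * measures a complete submit + sync cycle.
		 */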
1116 if (i915_request_wait(rq, 0, HZ / 5) < 0) 1117 err = -ETIME; 1118 i915_request_put(rq); 1119 if (err) 1120 break; 1121 1122 count++; 1123 } while (!__igt_timeout(end_time, NULL)); 1124 intel_engine_pm_put(engine); 1125 1126 pr_info("%s: %lu request + sync\n", engine->name, count); 1127 return err; 1128 } 1129 1130 static int __live_parallel_engineN(void *arg) 1131 { 1132 struct intel_engine_cs *engine = arg; 1133 IGT_TIMEOUT(end_time); 1134 unsigned long count; 1135 int err = 0; 1136 1137 count = 0; 1138 intel_engine_pm_get(engine); 1139 do { 1140 struct i915_request *rq; 1141 1142 rq = i915_request_create(engine->kernel_context); 1143 if (IS_ERR(rq)) { 1144 err = PTR_ERR(rq); 1145 break; 1146 } 1147 1148 i915_request_add(rq); 1149 count++; 1150 } while (!__igt_timeout(end_time, NULL)); 1151 intel_engine_pm_put(engine); 1152 1153 pr_info("%s: %lu requests\n", engine->name, count); 1154 return err; 1155 } 1156 1157 static bool wake_all(struct drm_i915_private *i915) 1158 { 1159 if (atomic_dec_and_test(&i915->selftest.counter)) { 1160 wake_up_var(&i915->selftest.counter); 1161 return true; 1162 } 1163 1164 return false; 1165 } 1166 1167 static int wait_for_all(struct drm_i915_private *i915) 1168 { 1169 if (wake_all(i915)) 1170 return 0; 1171 1172 if (wait_var_event_timeout(&i915->selftest.counter, 1173 !atomic_read(&i915->selftest.counter), 1174 i915_selftest.timeout_jiffies)) 1175 return 0; 1176 1177 return -ETIME; 1178 } 1179 1180 static int __live_parallel_spin(void *arg) 1181 { 1182 struct intel_engine_cs *engine = arg; 1183 struct igt_spinner spin; 1184 struct i915_request *rq; 1185 int err = 0; 1186 1187 /* 1188 * Create a spinner running for eternity on each engine. If a second 1189 * spinner is incorrectly placed on the same engine, it will not be 1190 * able to start in time. 1191 */ 1192 1193 if (igt_spinner_init(&spin, engine->gt)) { 1194 wake_all(engine->i915); 1195 return -ENOMEM; 1196 } 1197 1198 intel_engine_pm_get(engine); 1199 rq = igt_spinner_create_request(&spin, 1200 engine->kernel_context, 1201 MI_NOOP); /* no preemption */ 1202 intel_engine_pm_put(engine); 1203 if (IS_ERR(rq)) { 1204 err = PTR_ERR(rq); 1205 if (err == -ENODEV) 1206 err = 0; 1207 wake_all(engine->i915); 1208 goto out_spin; 1209 } 1210 1211 i915_request_get(rq); 1212 i915_request_add(rq); 1213 if (igt_wait_for_spinner(&spin, rq)) { 1214 /* Occupy this engine for the whole test */ 1215 err = wait_for_all(engine->i915); 1216 } else { 1217 pr_err("Failed to start spinner on %s\n", engine->name); 1218 err = -EINVAL; 1219 } 1220 igt_spinner_end(&spin); 1221 1222 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0) 1223 err = -EIO; 1224 i915_request_put(rq); 1225 1226 out_spin: 1227 igt_spinner_fini(&spin); 1228 return err; 1229 } 1230 1231 static int live_parallel_engines(void *arg) 1232 { 1233 struct drm_i915_private *i915 = arg; 1234 static int (* const func[])(void *arg) = { 1235 __live_parallel_engine1, 1236 __live_parallel_engineN, 1237 __live_parallel_spin, 1238 NULL, 1239 }; 1240 const unsigned int nengines = num_uabi_engines(i915); 1241 struct intel_engine_cs *engine; 1242 int (* const *fn)(void *arg); 1243 struct task_struct **tsk; 1244 int err = 0; 1245 1246 /* 1247 * Check we can submit requests to all engines concurrently. This 1248 * tests that we load up the system maximally. 
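	 * Three passes are made per engine, each on its own kthread:
	 * synchronous submit-and-wait, unthrottled submission, and a spinner
	 * that must keep its engine exclusively busy for the whole test.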
1249 */ 1250 1251 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL); 1252 if (!tsk) 1253 return -ENOMEM; 1254 1255 for (fn = func; !err && *fn; fn++) { 1256 char name[KSYM_NAME_LEN]; 1257 struct igt_live_test t; 1258 unsigned int idx; 1259 1260 snprintf(name, sizeof(name), "%ps", *fn); 1261 err = igt_live_test_begin(&t, i915, __func__, name); 1262 if (err) 1263 break; 1264 1265 atomic_set(&i915->selftest.counter, nengines); 1266 1267 idx = 0; 1268 for_each_uabi_engine(engine, i915) { 1269 tsk[idx] = kthread_run(*fn, engine, 1270 "igt/parallel:%s", 1271 engine->name); 1272 if (IS_ERR(tsk[idx])) { 1273 err = PTR_ERR(tsk[idx]); 1274 break; 1275 } 1276 get_task_struct(tsk[idx++]); 1277 } 1278 1279 yield(); /* start all threads before we kthread_stop() */ 1280 1281 idx = 0; 1282 for_each_uabi_engine(engine, i915) { 1283 int status; 1284 1285 if (IS_ERR(tsk[idx])) 1286 break; 1287 1288 status = kthread_stop(tsk[idx]); 1289 if (status && !err) 1290 err = status; 1291 1292 put_task_struct(tsk[idx++]); 1293 } 1294 1295 if (igt_live_test_end(&t)) 1296 err = -EIO; 1297 } 1298 1299 kfree(tsk); 1300 return err; 1301 } 1302 1303 static int 1304 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1305 { 1306 struct i915_request *rq; 1307 int ret; 1308 1309 /* 1310 * Before execlists, all contexts share the same ringbuffer. With 1311 * execlists, each context/engine has a separate ringbuffer and 1312 * for the purposes of this test, inexhaustible. 1313 * 1314 * For the global ringbuffer though, we have to be very careful 1315 * that we do not wrap while preventing the execution of requests 1316 * with a unsignaled fence. 1317 */ 1318 if (HAS_EXECLISTS(ctx->i915)) 1319 return INT_MAX; 1320 1321 rq = igt_request_alloc(ctx, engine); 1322 if (IS_ERR(rq)) { 1323 ret = PTR_ERR(rq); 1324 } else { 1325 int sz; 1326 1327 ret = rq->ring->size - rq->reserved_space; 1328 i915_request_add(rq); 1329 1330 sz = rq->ring->emit - rq->head; 1331 if (sz < 0) 1332 sz += rq->ring->size; 1333 ret /= sz; 1334 ret /= 2; /* leave half spare, in case of emergency! */ 1335 } 1336 1337 return ret; 1338 } 1339 1340 static int live_breadcrumbs_smoketest(void *arg) 1341 { 1342 struct drm_i915_private *i915 = arg; 1343 const unsigned int nengines = num_uabi_engines(i915); 1344 const unsigned int ncpus = num_online_cpus(); 1345 unsigned long num_waits, num_fences; 1346 struct intel_engine_cs *engine; 1347 struct task_struct **threads; 1348 struct igt_live_test live; 1349 intel_wakeref_t wakeref; 1350 struct smoketest *smoke; 1351 unsigned int n, idx; 1352 struct file *file; 1353 int ret = 0; 1354 1355 /* 1356 * Smoketest our breadcrumb/signal handling for requests across multiple 1357 * threads. A very simple test to only catch the most egregious of bugs. 1358 * See __igt_breadcrumbs_smoketest(); 1359 * 1360 * On real hardware this time. 
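	 * max_batches() bounds the number of requests in flight per engine so
	 * that a legacy (shared) ringbuffer cannot wrap while requests are
	 * held back by an unsignaled fence.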
1361 */ 1362 1363 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1364 1365 file = mock_file(i915); 1366 if (IS_ERR(file)) { 1367 ret = PTR_ERR(file); 1368 goto out_rpm; 1369 } 1370 1371 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1372 if (!smoke) { 1373 ret = -ENOMEM; 1374 goto out_file; 1375 } 1376 1377 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1378 if (!threads) { 1379 ret = -ENOMEM; 1380 goto out_smoke; 1381 } 1382 1383 smoke[0].request_alloc = __live_request_alloc; 1384 smoke[0].ncontexts = 64; 1385 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1386 sizeof(*smoke[0].contexts), 1387 GFP_KERNEL); 1388 if (!smoke[0].contexts) { 1389 ret = -ENOMEM; 1390 goto out_threads; 1391 } 1392 1393 for (n = 0; n < smoke[0].ncontexts; n++) { 1394 smoke[0].contexts[n] = live_context(i915, file); 1395 if (!smoke[0].contexts[n]) { 1396 ret = -ENOMEM; 1397 goto out_contexts; 1398 } 1399 } 1400 1401 ret = igt_live_test_begin(&live, i915, __func__, ""); 1402 if (ret) 1403 goto out_contexts; 1404 1405 idx = 0; 1406 for_each_uabi_engine(engine, i915) { 1407 smoke[idx] = smoke[0]; 1408 smoke[idx].engine = engine; 1409 smoke[idx].max_batch = 1410 max_batches(smoke[0].contexts[0], engine); 1411 if (smoke[idx].max_batch < 0) { 1412 ret = smoke[idx].max_batch; 1413 goto out_flush; 1414 } 1415 /* One ring interleaved between requests from all cpus */ 1416 smoke[idx].max_batch /= num_online_cpus() + 1; 1417 pr_debug("Limiting batches to %d requests on %s\n", 1418 smoke[idx].max_batch, engine->name); 1419 1420 for (n = 0; n < ncpus; n++) { 1421 struct task_struct *tsk; 1422 1423 tsk = kthread_run(__igt_breadcrumbs_smoketest, 1424 &smoke[idx], "igt/%d.%d", idx, n); 1425 if (IS_ERR(tsk)) { 1426 ret = PTR_ERR(tsk); 1427 goto out_flush; 1428 } 1429 1430 get_task_struct(tsk); 1431 threads[idx * ncpus + n] = tsk; 1432 } 1433 1434 idx++; 1435 } 1436 1437 yield(); /* start all threads before we begin */ 1438 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1439 1440 out_flush: 1441 idx = 0; 1442 num_waits = 0; 1443 num_fences = 0; 1444 for_each_uabi_engine(engine, i915) { 1445 for (n = 0; n < ncpus; n++) { 1446 struct task_struct *tsk = threads[idx * ncpus + n]; 1447 int err; 1448 1449 if (!tsk) 1450 continue; 1451 1452 err = kthread_stop(tsk); 1453 if (err < 0 && !ret) 1454 ret = err; 1455 1456 put_task_struct(tsk); 1457 } 1458 1459 num_waits += atomic_long_read(&smoke[idx].num_waits); 1460 num_fences += atomic_long_read(&smoke[idx].num_fences); 1461 idx++; 1462 } 1463 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1464 num_waits, num_fences, idx, ncpus); 1465 1466 ret = igt_live_test_end(&live) ?: ret; 1467 out_contexts: 1468 kfree(smoke[0].contexts); 1469 out_threads: 1470 kfree(threads); 1471 out_smoke: 1472 kfree(smoke); 1473 out_file: 1474 fput(file); 1475 out_rpm: 1476 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1477 1478 return ret; 1479 } 1480 1481 int i915_request_live_selftests(struct drm_i915_private *i915) 1482 { 1483 static const struct i915_subtest tests[] = { 1484 SUBTEST(live_nop_request), 1485 SUBTEST(live_all_engines), 1486 SUBTEST(live_sequential_engines), 1487 SUBTEST(live_parallel_engines), 1488 SUBTEST(live_empty_request), 1489 SUBTEST(live_breadcrumbs_smoketest), 1490 }; 1491 1492 if (intel_gt_is_wedged(&i915->gt)) 1493 return 0; 1494 1495 return i915_subtests(tests, i915); 1496 } 1497 1498 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1499 { 1500 struct i915_request *rq; 1501 struct dma_fence 
*fence; 1502 1503 rq = intel_engine_create_kernel_request(ce->engine); 1504 if (IS_ERR(rq)) 1505 return PTR_ERR(rq); 1506 1507 fence = i915_active_fence_get(&ce->timeline->last_request); 1508 if (fence) { 1509 i915_request_await_dma_fence(rq, fence); 1510 dma_fence_put(fence); 1511 } 1512 1513 rq = i915_request_get(rq); 1514 i915_request_add(rq); 1515 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1516 err = -ETIME; 1517 i915_request_put(rq); 1518 1519 while (!err && !intel_engine_is_idle(ce->engine)) 1520 intel_engine_flush_submission(ce->engine); 1521 1522 return err; 1523 } 1524 1525 struct perf_stats { 1526 struct intel_engine_cs *engine; 1527 unsigned long count; 1528 ktime_t time; 1529 ktime_t busy; 1530 u64 runtime; 1531 }; 1532 1533 struct perf_series { 1534 struct drm_i915_private *i915; 1535 unsigned int nengines; 1536 struct intel_context *ce[]; 1537 }; 1538 1539 static int cmp_u32(const void *A, const void *B) 1540 { 1541 const u32 *a = A, *b = B; 1542 1543 return *a - *b; 1544 } 1545 1546 static u32 trifilter(u32 *a) 1547 { 1548 u64 sum; 1549 1550 #define TF_COUNT 5 1551 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1552 1553 sum = mul_u32_u32(a[2], 2); 1554 sum += a[1]; 1555 sum += a[3]; 1556 1557 GEM_BUG_ON(sum > U32_MAX); 1558 return sum; 1559 #define TF_BIAS 2 1560 } 1561 1562 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1563 { 1564 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1565 1566 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1567 } 1568 1569 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1570 { 1571 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1572 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1573 *cs++ = offset; 1574 *cs++ = 0; 1575 1576 return cs; 1577 } 1578 1579 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1580 { 1581 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1582 *cs++ = offset; 1583 *cs++ = 0; 1584 *cs++ = value; 1585 1586 return cs; 1587 } 1588 1589 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1590 { 1591 *cs++ = MI_SEMAPHORE_WAIT | 1592 MI_SEMAPHORE_GLOBAL_GTT | 1593 MI_SEMAPHORE_POLL | 1594 mode; 1595 *cs++ = value; 1596 *cs++ = offset; 1597 *cs++ = 0; 1598 1599 return cs; 1600 } 1601 1602 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1603 { 1604 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1605 } 1606 1607 static void semaphore_set(u32 *sema, u32 value) 1608 { 1609 WRITE_ONCE(*sema, value); 1610 wmb(); /* flush the update to the cache, and beyond */ 1611 } 1612 1613 static u32 *hwsp_scratch(const struct intel_context *ce) 1614 { 1615 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 1616 } 1617 1618 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 1619 { 1620 return (i915_ggtt_offset(ce->engine->status_page.vma) + 1621 offset_in_page(dw)); 1622 } 1623 1624 static int measure_semaphore_response(struct intel_context *ce) 1625 { 1626 u32 *sema = hwsp_scratch(ce); 1627 const u32 offset = hwsp_offset(ce, sema); 1628 u32 elapsed[TF_COUNT], cycles; 1629 struct i915_request *rq; 1630 u32 *cs; 1631 int err; 1632 int i; 1633 1634 /* 1635 * Measure how many cycles it takes for the HW to detect the change 1636 * in a semaphore value. 
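	 * Scratch layout (sema = hwsp_scratch(ce)):
	 *   sema[0]   - semaphore polled by the GPU
	 *   sema[1..] - RING_TIMESTAMP written after each poll completes
	 * so each sample is elapsed[i - 1] = sema[i] - cycles, where cycles
	 * is the CPU timestamp taken just before the semaphore is poked.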
1637 * 1638 * A: read CS_TIMESTAMP from CPU 1639 * poke semaphore 1640 * B: read CS_TIMESTAMP on GPU 1641 * 1642 * Semaphore latency: B - A 1643 */ 1644 1645 semaphore_set(sema, -1); 1646 1647 rq = i915_request_create(ce); 1648 if (IS_ERR(rq)) 1649 return PTR_ERR(rq); 1650 1651 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 1652 if (IS_ERR(cs)) { 1653 i915_request_add(rq); 1654 err = PTR_ERR(cs); 1655 goto err; 1656 } 1657 1658 cs = emit_store_dw(cs, offset, 0); 1659 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1660 cs = emit_semaphore_poll_until(cs, offset, i); 1661 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1662 cs = emit_store_dw(cs, offset, 0); 1663 } 1664 1665 intel_ring_advance(rq, cs); 1666 i915_request_add(rq); 1667 1668 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1669 err = -EIO; 1670 goto err; 1671 } 1672 1673 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1674 preempt_disable(); 1675 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1676 semaphore_set(sema, i); 1677 preempt_enable(); 1678 1679 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1680 err = -EIO; 1681 goto err; 1682 } 1683 1684 elapsed[i - 1] = sema[i] - cycles; 1685 } 1686 1687 cycles = trifilter(elapsed); 1688 pr_info("%s: semaphore response %d cycles, %lluns\n", 1689 ce->engine->name, cycles >> TF_BIAS, 1690 cycles_to_ns(ce->engine, cycles)); 1691 1692 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1693 1694 err: 1695 intel_gt_set_wedged(ce->engine->gt); 1696 return err; 1697 } 1698 1699 static int measure_idle_dispatch(struct intel_context *ce) 1700 { 1701 u32 *sema = hwsp_scratch(ce); 1702 const u32 offset = hwsp_offset(ce, sema); 1703 u32 elapsed[TF_COUNT], cycles; 1704 u32 *cs; 1705 int err; 1706 int i; 1707 1708 /* 1709 * Measure how long it takes for us to submit a request while the 1710 * engine is idle, but is resting in our context. 
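	 * The GT is flushed to idle before every sample so that each request
	 * really is dispatched from an idle engine rather than appended to an
	 * already-running context.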
1711 * 1712 * A: read CS_TIMESTAMP from CPU 1713 * submit request 1714 * B: read CS_TIMESTAMP on GPU 1715 * 1716 * Submission latency: B - A 1717 */ 1718 1719 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 1720 struct i915_request *rq; 1721 1722 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1723 if (err) 1724 return err; 1725 1726 rq = i915_request_create(ce); 1727 if (IS_ERR(rq)) { 1728 err = PTR_ERR(rq); 1729 goto err; 1730 } 1731 1732 cs = intel_ring_begin(rq, 4); 1733 if (IS_ERR(cs)) { 1734 i915_request_add(rq); 1735 err = PTR_ERR(cs); 1736 goto err; 1737 } 1738 1739 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1740 1741 intel_ring_advance(rq, cs); 1742 1743 preempt_disable(); 1744 local_bh_disable(); 1745 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1746 i915_request_add(rq); 1747 local_bh_enable(); 1748 preempt_enable(); 1749 } 1750 1751 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1752 if (err) 1753 goto err; 1754 1755 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 1756 elapsed[i] = sema[i] - elapsed[i]; 1757 1758 cycles = trifilter(elapsed); 1759 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 1760 ce->engine->name, cycles >> TF_BIAS, 1761 cycles_to_ns(ce->engine, cycles)); 1762 1763 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1764 1765 err: 1766 intel_gt_set_wedged(ce->engine->gt); 1767 return err; 1768 } 1769 1770 static int measure_busy_dispatch(struct intel_context *ce) 1771 { 1772 u32 *sema = hwsp_scratch(ce); 1773 const u32 offset = hwsp_offset(ce, sema); 1774 u32 elapsed[TF_COUNT + 1], cycles; 1775 u32 *cs; 1776 int err; 1777 int i; 1778 1779 /* 1780 * Measure how long it takes for us to submit a request while the 1781 * engine is busy, polling on a semaphore in our context. With 1782 * direct submission, this will include the cost of a lite restore. 
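	 * Each request is submitted while its predecessor is still spinning
	 * on the semaphore, so the new request is folded into the running
	 * context (the lite restore) instead of being dispatched from idle.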
1783 * 1784 * A: read CS_TIMESTAMP from CPU 1785 * submit request 1786 * B: read CS_TIMESTAMP on GPU 1787 * 1788 * Submission latency: B - A 1789 */ 1790 1791 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1792 struct i915_request *rq; 1793 1794 rq = i915_request_create(ce); 1795 if (IS_ERR(rq)) { 1796 err = PTR_ERR(rq); 1797 goto err; 1798 } 1799 1800 cs = intel_ring_begin(rq, 12); 1801 if (IS_ERR(cs)) { 1802 i915_request_add(rq); 1803 err = PTR_ERR(cs); 1804 goto err; 1805 } 1806 1807 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 1808 cs = emit_semaphore_poll_until(cs, offset, i); 1809 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1810 1811 intel_ring_advance(rq, cs); 1812 1813 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 1814 err = -EIO; 1815 goto err; 1816 } 1817 1818 preempt_disable(); 1819 local_bh_disable(); 1820 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1821 i915_request_add(rq); 1822 local_bh_enable(); 1823 semaphore_set(sema, i - 1); 1824 preempt_enable(); 1825 } 1826 1827 wait_for(READ_ONCE(sema[i - 1]), 500); 1828 semaphore_set(sema, i - 1); 1829 1830 for (i = 1; i <= TF_COUNT; i++) { 1831 GEM_BUG_ON(sema[i] == -1); 1832 elapsed[i - 1] = sema[i] - elapsed[i]; 1833 } 1834 1835 cycles = trifilter(elapsed); 1836 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 1837 ce->engine->name, cycles >> TF_BIAS, 1838 cycles_to_ns(ce->engine, cycles)); 1839 1840 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1841 1842 err: 1843 intel_gt_set_wedged(ce->engine->gt); 1844 return err; 1845 } 1846 1847 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 1848 { 1849 const u32 offset = 1850 i915_ggtt_offset(engine->status_page.vma) + 1851 offset_in_page(sema); 1852 struct i915_request *rq; 1853 u32 *cs; 1854 1855 rq = i915_request_create(engine->kernel_context); 1856 if (IS_ERR(rq)) 1857 return PTR_ERR(rq); 1858 1859 cs = intel_ring_begin(rq, 4); 1860 if (IS_ERR(cs)) { 1861 i915_request_add(rq); 1862 return PTR_ERR(cs); 1863 } 1864 1865 cs = emit_semaphore_poll(cs, mode, value, offset); 1866 1867 intel_ring_advance(rq, cs); 1868 i915_request_add(rq); 1869 1870 return 0; 1871 } 1872 1873 static int measure_inter_request(struct intel_context *ce) 1874 { 1875 u32 *sema = hwsp_scratch(ce); 1876 const u32 offset = hwsp_offset(ce, sema); 1877 u32 elapsed[TF_COUNT + 1], cycles; 1878 struct i915_sw_fence *submit; 1879 int i, err; 1880 1881 /* 1882 * Measure how long it takes to advance from one request into the 1883 * next. Between each request we flush the GPU caches to memory, 1884 * update the breadcrumbs, and then invalidate those caches. 1885 * We queue up all the requests to be submitted in one batch so 1886 * it should be one set of contiguous measurements. 
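	 * The engine is first plugged by a kernel-context request spinning on
	 * a semaphore, and every measured request is additionally gated on a
	 * common submit fence, so the whole queue is released in one go.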
1887 * 1888 * A: read CS_TIMESTAMP on GPU 1889 * advance request 1890 * B: read CS_TIMESTAMP on GPU 1891 * 1892 * Request latency: B - A 1893 */ 1894 1895 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 1896 if (err) 1897 return err; 1898 1899 submit = heap_fence_create(GFP_KERNEL); 1900 if (!submit) { 1901 semaphore_set(sema, 1); 1902 return -ENOMEM; 1903 } 1904 1905 intel_engine_flush_submission(ce->engine); 1906 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1907 struct i915_request *rq; 1908 u32 *cs; 1909 1910 rq = i915_request_create(ce); 1911 if (IS_ERR(rq)) { 1912 err = PTR_ERR(rq); 1913 goto err_submit; 1914 } 1915 1916 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 1917 submit, 1918 GFP_KERNEL); 1919 if (err < 0) { 1920 i915_request_add(rq); 1921 goto err_submit; 1922 } 1923 1924 cs = intel_ring_begin(rq, 4); 1925 if (IS_ERR(cs)) { 1926 i915_request_add(rq); 1927 err = PTR_ERR(cs); 1928 goto err_submit; 1929 } 1930 1931 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1932 1933 intel_ring_advance(rq, cs); 1934 i915_request_add(rq); 1935 } 1936 i915_sw_fence_commit(submit); 1937 intel_engine_flush_submission(ce->engine); 1938 heap_fence_put(submit); 1939 1940 semaphore_set(sema, 1); 1941 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1942 if (err) 1943 goto err; 1944 1945 for (i = 1; i <= TF_COUNT; i++) 1946 elapsed[i - 1] = sema[i + 1] - sema[i]; 1947 1948 cycles = trifilter(elapsed); 1949 pr_info("%s: inter-request latency %d cycles, %lluns\n", 1950 ce->engine->name, cycles >> TF_BIAS, 1951 cycles_to_ns(ce->engine, cycles)); 1952 1953 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1954 1955 err_submit: 1956 i915_sw_fence_commit(submit); 1957 heap_fence_put(submit); 1958 semaphore_set(sema, 1); 1959 err: 1960 intel_gt_set_wedged(ce->engine->gt); 1961 return err; 1962 } 1963 1964 static int measure_context_switch(struct intel_context *ce) 1965 { 1966 u32 *sema = hwsp_scratch(ce); 1967 const u32 offset = hwsp_offset(ce, sema); 1968 struct i915_request *fence = NULL; 1969 u32 elapsed[TF_COUNT + 1], cycles; 1970 int i, j, err; 1971 u32 *cs; 1972 1973 /* 1974 * Measure how long it takes to advance from one request in one 1975 * context to a request in another context. This allows us to 1976 * measure how long the context save/restore take, along with all 1977 * the inter-context setup we require. 
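	 * Requests alternate between this context and the engine's kernel
	 * context, each awaiting the previous request's fence, so the delta
	 * between paired timestamps brackets a complete context switch.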
	 *
	 * A: read CS_TIMESTAMP on GPU
	 * switch context
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
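	 * The preempting request runs on the kernel context at
	 * I915_PRIORITY_BARRIER while the original context is blocked polling
	 * a semaphore, so it can only run by preempting that spinning context.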
2072 * 2073 * A: read CS_TIMESTAMP from CPU 2074 * submit preemption 2075 * B: read CS_TIMESTAMP on GPU (in preempting context) 2076 * context switch 2077 * C: read CS_TIMESTAMP on GPU (in original context) 2078 * 2079 * Preemption dispatch latency: B - A 2080 * Preemption switch latency: C - B 2081 */ 2082 2083 if (!intel_engine_has_preemption(ce->engine)) 2084 return 0; 2085 2086 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2087 u32 addr = offset + 2 * i * sizeof(u32); 2088 struct i915_request *rq; 2089 2090 rq = i915_request_create(ce); 2091 if (IS_ERR(rq)) { 2092 err = PTR_ERR(rq); 2093 goto err; 2094 } 2095 2096 cs = intel_ring_begin(rq, 12); 2097 if (IS_ERR(cs)) { 2098 i915_request_add(rq); 2099 err = PTR_ERR(cs); 2100 goto err; 2101 } 2102 2103 cs = emit_store_dw(cs, addr, -1); 2104 cs = emit_semaphore_poll_until(cs, offset, i); 2105 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2106 2107 intel_ring_advance(rq, cs); 2108 i915_request_add(rq); 2109 2110 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2111 err = -EIO; 2112 goto err; 2113 } 2114 2115 rq = i915_request_create(ce->engine->kernel_context); 2116 if (IS_ERR(rq)) { 2117 err = PTR_ERR(rq); 2118 goto err; 2119 } 2120 2121 cs = intel_ring_begin(rq, 8); 2122 if (IS_ERR(cs)) { 2123 i915_request_add(rq); 2124 err = PTR_ERR(cs); 2125 goto err; 2126 } 2127 2128 cs = emit_timestamp_store(cs, ce, addr); 2129 cs = emit_store_dw(cs, offset, i); 2130 2131 intel_ring_advance(rq, cs); 2132 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2133 2134 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2135 i915_request_add(rq); 2136 } 2137 2138 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2139 err = -EIO; 2140 goto err; 2141 } 2142 2143 for (i = 1; i <= TF_COUNT; i++) 2144 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2145 2146 cycles = trifilter(elapsed); 2147 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2148 ce->engine->name, cycles >> TF_BIAS, 2149 cycles_to_ns(ce->engine, cycles)); 2150 2151 for (i = 1; i <= TF_COUNT; i++) 2152 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2153 2154 cycles = trifilter(elapsed); 2155 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2156 ce->engine->name, cycles >> TF_BIAS, 2157 cycles_to_ns(ce->engine, cycles)); 2158 2159 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2160 2161 err: 2162 intel_gt_set_wedged(ce->engine->gt); 2163 return err; 2164 } 2165 2166 struct signal_cb { 2167 struct dma_fence_cb base; 2168 bool seen; 2169 }; 2170 2171 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2172 { 2173 struct signal_cb *s = container_of(cb, typeof(*s), base); 2174 2175 smp_store_mb(s->seen, true); /* be safe, be strong */ 2176 } 2177 2178 static int measure_completion(struct intel_context *ce) 2179 { 2180 u32 *sema = hwsp_scratch(ce); 2181 const u32 offset = hwsp_offset(ce, sema); 2182 u32 elapsed[TF_COUNT], cycles; 2183 u32 *cs; 2184 int err; 2185 int i; 2186 2187 /* 2188 * Measure how long it takes for the signal (interrupt) to be 2189 * sent from the GPU to be processed by the CPU. 
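	 * The CPU timestamp is read only after the dma-fence callback has run
	 * (cb.seen is set from the signaling path), so each sample covers
	 * interrupt delivery and breadcrumb processing.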
2190 * 2191 * A: read CS_TIMESTAMP on GPU 2192 * signal 2193 * B: read CS_TIMESTAMP from CPU 2194 * 2195 * Completion latency: B - A 2196 */ 2197 2198 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2199 struct signal_cb cb = { .seen = false }; 2200 struct i915_request *rq; 2201 2202 rq = i915_request_create(ce); 2203 if (IS_ERR(rq)) { 2204 err = PTR_ERR(rq); 2205 goto err; 2206 } 2207 2208 cs = intel_ring_begin(rq, 12); 2209 if (IS_ERR(cs)) { 2210 i915_request_add(rq); 2211 err = PTR_ERR(cs); 2212 goto err; 2213 } 2214 2215 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2216 cs = emit_semaphore_poll_until(cs, offset, i); 2217 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2218 2219 intel_ring_advance(rq, cs); 2220 2221 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); 2222 i915_request_add(rq); 2223 2224 intel_engine_flush_submission(ce->engine); 2225 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { 2226 err = -EIO; 2227 goto err; 2228 } 2229 2230 preempt_disable(); 2231 semaphore_set(sema, i); 2232 while (!READ_ONCE(cb.seen)) 2233 cpu_relax(); 2234 2235 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2236 preempt_enable(); 2237 } 2238 2239 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2240 if (err) 2241 goto err; 2242 2243 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2244 GEM_BUG_ON(sema[i + 1] == -1); 2245 elapsed[i] = elapsed[i] - sema[i + 1]; 2246 } 2247 2248 cycles = trifilter(elapsed); 2249 pr_info("%s: completion latency %d cycles, %lluns\n", 2250 ce->engine->name, cycles >> TF_BIAS, 2251 cycles_to_ns(ce->engine, cycles)); 2252 2253 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2254 2255 err: 2256 intel_gt_set_wedged(ce->engine->gt); 2257 return err; 2258 } 2259 2260 static void rps_pin(struct intel_gt *gt) 2261 { 2262 /* Pin the frequency to max */ 2263 atomic_inc(>->rps.num_waiters); 2264 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 2265 2266 mutex_lock(>->rps.lock); 2267 intel_rps_set(>->rps, gt->rps.max_freq); 2268 mutex_unlock(>->rps.lock); 2269 } 2270 2271 static void rps_unpin(struct intel_gt *gt) 2272 { 2273 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 2274 atomic_dec(>->rps.num_waiters); 2275 } 2276 2277 static int perf_request_latency(void *arg) 2278 { 2279 struct drm_i915_private *i915 = arg; 2280 struct intel_engine_cs *engine; 2281 struct pm_qos_request qos; 2282 int err = 0; 2283 2284 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */ 2285 return 0; 2286 2287 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2288 2289 for_each_uabi_engine(engine, i915) { 2290 struct intel_context *ce; 2291 2292 ce = intel_context_create(engine); 2293 if (IS_ERR(ce)) { 2294 err = PTR_ERR(ce); 2295 goto out; 2296 } 2297 2298 err = intel_context_pin(ce); 2299 if (err) { 2300 intel_context_put(ce); 2301 goto out; 2302 } 2303 2304 st_engine_heartbeat_disable(engine); 2305 rps_pin(engine->gt); 2306 2307 if (err == 0) 2308 err = measure_semaphore_response(ce); 2309 if (err == 0) 2310 err = measure_idle_dispatch(ce); 2311 if (err == 0) 2312 err = measure_busy_dispatch(ce); 2313 if (err == 0) 2314 err = measure_inter_request(ce); 2315 if (err == 0) 2316 err = measure_context_switch(ce); 2317 if (err == 0) 2318 err = measure_preemption(ce); 2319 if (err == 0) 2320 err = measure_completion(ce); 2321 2322 rps_unpin(engine->gt); 2323 st_engine_heartbeat_enable(engine); 2324 2325 intel_context_unpin(ce); 2326 intel_context_put(ce); 2327 if (err) 2328 goto out; 2329 } 2330 2331 out: 2332 if 
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);
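
			/*
			 * Sample the busy-time before running the workload;
			 * +1 marks the sample as valid since the raw busy
			 * time may legitimately be zero. The runtime
			 * accumulator starts out negative so that adding the
			 * post-run total yields the delta for this pass.
			 */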
			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}
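
/*
 * p_sync1: as p_sync0, but pipelined - only the previous request is waited
 * upon while the next one is already submitted, so the engine always has
 * one request in flight.
 */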
static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}