/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
#include <linux/sort.h>

#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"

#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_requests.h"
#include "gt/selftest_engine_heartbeat.h"

#include "i915_random.h"
#include "i915_selftest.h"
#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"

#include "mock_drm.h"
#include "mock_gem_device.h"

static unsigned int num_uabi_engines(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	unsigned int count;

	count = 0;
	for_each_uabi_engine(engine, i915)
		count++;

	return count;
}

static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
{
	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
}

static int igt_add_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;

	/* Basic preliminary test to create a request and let it loose! */

	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
	if (!request)
		return -ENOMEM;

	i915_request_add(request);

	return 0;
}

static int igt_wait_request(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, then wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (!request)
		return -ENOMEM;

	i915_request_get(request);

	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) != -ETIME) {
		pr_err("request wait succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed before submit!!\n");
		goto out_request;
	}

	i915_request_add(request);

	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed immediately!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
		pr_err("request wait succeeded (expected timeout!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out!\n");
		goto out_request;
	}

	if (!i915_request_completed(request)) {
		pr_err("request not complete after waiting!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out when already complete!\n");
		goto out_request;
	}

	err = 0;
out_request:
	i915_request_put(request);
	mock_device_flush(i915);
	return err;
}

static int igt_fence_wait(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, treat it as a fence and wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (!request)
		return -ENOMEM;

	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
		pr_err("fence wait success before submit (expected timeout)!\n");
		goto out;
	}

	i915_request_add(request);

	if (dma_fence_is_signaled(&request->fence)) {
		pr_err("fence signaled immediately!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
		pr_err("fence wait success after submit (expected timeout)!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out (expected success)!\n");
		goto out;
	}

	if (!dma_fence_is_signaled(&request->fence)) {
		pr_err("fence unsignaled after waiting!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out when complete (expected success)!\n");
		goto out;
	}

	err = 0;
out:
	mock_device_flush(i915);
	return err;
}

static int igt_request_rewind(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request, *vip;
	struct i915_gem_context *ctx[2];
	struct intel_context *ce;
	int err = -EINVAL;

	ctx[0] = mock_context(i915, "A");

	ce = i915_gem_context_get_engine(ctx[0], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	request = mock_request(ce, 2 * HZ);
	intel_context_put(ce);
	if (!request) {
		err = -ENOMEM;
		goto err_context_0;
	}

	i915_request_get(request);
	i915_request_add(request);

	ctx[1] = mock_context(i915, "B");

	ce = i915_gem_context_get_engine(ctx[1], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	vip = mock_request(ce, 0);
	intel_context_put(ce);
	if (!vip) {
		err = -ENOMEM;
		goto err_context_1;
	}

	/* Simulate preemption by manual reordering */
	if (!mock_cancel_request(request)) {
		pr_err("failed to cancel request (already executed)!\n");
		i915_request_add(vip);
		goto err_context_1;
	}
	i915_request_get(vip);
	i915_request_add(vip);
	rcu_read_lock();
	request->engine->submit_request(request);
	rcu_read_unlock();

	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
		pr_err("timed out waiting for high priority request\n");
		goto err;
	}

	if (i915_request_completed(request)) {
		pr_err("low priority request already completed\n");
		goto err;
	}

	err = 0;
err:
	i915_request_put(vip);
err_context_1:
	mock_context_close(ctx[1]);
	i915_request_put(request);
err_context_0:
	mock_context_close(ctx[0]);
	mock_device_flush(i915);
	return err;
}

struct smoketest {
	struct intel_engine_cs *engine;
	struct i915_gem_context **contexts;
	atomic_long_t num_waits, num_fences;
	int ncontexts, max_batch;
	struct i915_request *(*request_alloc)(struct intel_context *ce);
};

static struct i915_request *
__mock_request_alloc(struct intel_context *ce)
{
	return mock_request(ce, 0);
}

static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	return intel_context_create_request(ce);
}

static int __igt_breadcrumbs_smoketest(void *arg)
{
	struct smoketest *t = arg;
	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
	const unsigned int total = 4 * t->ncontexts + 1;
	unsigned int num_waits = 0, num_fences = 0;
	struct i915_request **requests;
	I915_RND_STATE(prng);
	unsigned int *order;
	int err = 0;

	/*
	 * A very simple test to catch the most egregious of list handling bugs.
	 *
	 * At its heart, we simply create oodles of requests running across
	 * multiple kthreads and enable signaling on them, for the sole purpose
	 * of stressing our breadcrumb handling. The only inspection we do is
	 * that the fences were marked as signaled.
308 */ 309 310 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 311 if (!requests) 312 return -ENOMEM; 313 314 order = i915_random_order(total, &prng); 315 if (!order) { 316 err = -ENOMEM; 317 goto out_requests; 318 } 319 320 while (!kthread_should_stop()) { 321 struct i915_sw_fence *submit, *wait; 322 unsigned int n, count; 323 324 submit = heap_fence_create(GFP_KERNEL); 325 if (!submit) { 326 err = -ENOMEM; 327 break; 328 } 329 330 wait = heap_fence_create(GFP_KERNEL); 331 if (!wait) { 332 i915_sw_fence_commit(submit); 333 heap_fence_put(submit); 334 err = ENOMEM; 335 break; 336 } 337 338 i915_random_reorder(order, total, &prng); 339 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 340 341 for (n = 0; n < count; n++) { 342 struct i915_gem_context *ctx = 343 t->contexts[order[n] % t->ncontexts]; 344 struct i915_request *rq; 345 struct intel_context *ce; 346 347 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 348 GEM_BUG_ON(IS_ERR(ce)); 349 rq = t->request_alloc(ce); 350 intel_context_put(ce); 351 if (IS_ERR(rq)) { 352 err = PTR_ERR(rq); 353 count = n; 354 break; 355 } 356 357 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 358 submit, 359 GFP_KERNEL); 360 361 requests[n] = i915_request_get(rq); 362 i915_request_add(rq); 363 364 if (err >= 0) 365 err = i915_sw_fence_await_dma_fence(wait, 366 &rq->fence, 367 0, 368 GFP_KERNEL); 369 370 if (err < 0) { 371 i915_request_put(rq); 372 count = n; 373 break; 374 } 375 } 376 377 i915_sw_fence_commit(submit); 378 i915_sw_fence_commit(wait); 379 380 if (!wait_event_timeout(wait->wait, 381 i915_sw_fence_done(wait), 382 5 * HZ)) { 383 struct i915_request *rq = requests[count - 1]; 384 385 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 386 atomic_read(&wait->pending), count, 387 rq->fence.context, rq->fence.seqno, 388 t->engine->name); 389 GEM_TRACE_DUMP(); 390 391 intel_gt_set_wedged(t->engine->gt); 392 GEM_BUG_ON(!i915_request_completed(rq)); 393 i915_sw_fence_wait(wait); 394 err = -EIO; 395 } 396 397 for (n = 0; n < count; n++) { 398 struct i915_request *rq = requests[n]; 399 400 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 401 &rq->fence.flags)) { 402 pr_err("%llu:%llu was not signaled!\n", 403 rq->fence.context, rq->fence.seqno); 404 err = -EINVAL; 405 } 406 407 i915_request_put(rq); 408 } 409 410 heap_fence_put(wait); 411 heap_fence_put(submit); 412 413 if (err < 0) 414 break; 415 416 num_fences += count; 417 num_waits++; 418 419 cond_resched(); 420 } 421 422 atomic_long_add(num_fences, &t->num_fences); 423 atomic_long_add(num_waits, &t->num_waits); 424 425 kfree(order); 426 out_requests: 427 kfree(requests); 428 return err; 429 } 430 431 static int mock_breadcrumbs_smoketest(void *arg) 432 { 433 struct drm_i915_private *i915 = arg; 434 struct smoketest t = { 435 .engine = rcs0(i915), 436 .ncontexts = 1024, 437 .max_batch = 1024, 438 .request_alloc = __mock_request_alloc 439 }; 440 unsigned int ncpus = num_online_cpus(); 441 struct task_struct **threads; 442 unsigned int n; 443 int ret = 0; 444 445 /* 446 * Smoketest our breadcrumb/signal handling for requests across multiple 447 * threads. A very simple test to only catch the most egregious of bugs. 
	 * See __igt_breadcrumbs_smoketest();
	 */

	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
	if (!t.contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < t.ncontexts; n++) {
		t.contexts[n] = mock_context(t.engine->i915, "mock");
		if (!t.contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	for (n = 0; n < ncpus; n++) {
		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
					 &t, "igt/%d", n);
		if (IS_ERR(threads[n])) {
			ret = PTR_ERR(threads[n]);
			ncpus = n;
			break;
		}

		get_task_struct(threads[n]);
	}

	yield(); /* start all threads before we begin */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

	for (n = 0; n < ncpus; n++) {
		int err;

		err = kthread_stop(threads[n]);
		if (err < 0 && !ret)
			ret = err;

		put_task_struct(threads[n]);
	}
	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
		atomic_long_read(&t.num_waits),
		atomic_long_read(&t.num_fences),
		ncpus);

out_contexts:
	for (n = 0; n < t.ncontexts; n++) {
		if (!t.contexts[n])
			break;
		mock_context_close(t.contexts[n]);
	}
	kfree(t.contexts);
out_threads:
	kfree(threads);
	return ret;
}

int i915_request_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_add_request),
		SUBTEST(igt_wait_request),
		SUBTEST(igt_fence_wait),
		SUBTEST(igt_request_rewind),
		SUBTEST(mock_breadcrumbs_smoketest),
	};
	struct drm_i915_private *i915;
	intel_wakeref_t wakeref;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		err = i915_subtests(tests, i915);

	drm_dev_put(&i915->drm);

	return err;
}

static int live_nop_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	int err = -ENODEV;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	for_each_uabi_engine(engine, i915) {
		unsigned long n, prime;
		IGT_TIMEOUT(end_time);
		ktime_t times[2] = {};

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			return err;

		intel_engine_pm_get(engine);
		for_each_prime_number_from(prime, 1, 8192) {
			struct i915_request *request = NULL;

			times[1] = ktime_get_raw();

			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = i915_request_create(engine->kernel_context);
				if (IS_ERR(request))
					return PTR_ERR(request);

				/*
				 * This space is left intentionally blank.
				 *
				 * We do not actually want to perform any
				 * action with this request, we just want
				 * to measure the latency in allocation
				 * and submission of our breadcrumbs -
				 * ensuring that the bare request is sufficient
				 * for the system to work (i.e. proper HEAD
				 * tracking of the rings, interrupt handling,
				 * etc). It also gives us the lowest bounds
				 * for latency.
581 */ 582 583 i915_request_get(request); 584 i915_request_add(request); 585 } 586 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 587 i915_request_put(request); 588 589 times[1] = ktime_sub(ktime_get_raw(), times[1]); 590 if (prime == 1) 591 times[0] = times[1]; 592 593 if (__igt_timeout(end_time, NULL)) 594 break; 595 } 596 intel_engine_pm_put(engine); 597 598 err = igt_live_test_end(&t); 599 if (err) 600 return err; 601 602 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 603 engine->name, 604 ktime_to_ns(times[0]), 605 prime, div64_u64(ktime_to_ns(times[1]), prime)); 606 } 607 608 return err; 609 } 610 611 static struct i915_vma *empty_batch(struct drm_i915_private *i915) 612 { 613 struct drm_i915_gem_object *obj; 614 struct i915_vma *vma; 615 u32 *cmd; 616 int err; 617 618 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 619 if (IS_ERR(obj)) 620 return ERR_CAST(obj); 621 622 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB); 623 if (IS_ERR(cmd)) { 624 err = PTR_ERR(cmd); 625 goto err; 626 } 627 628 *cmd = MI_BATCH_BUFFER_END; 629 630 __i915_gem_object_flush_map(obj, 0, 64); 631 i915_gem_object_unpin_map(obj); 632 633 intel_gt_chipset_flush(&i915->gt); 634 635 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); 636 if (IS_ERR(vma)) { 637 err = PTR_ERR(vma); 638 goto err; 639 } 640 641 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); 642 if (err) 643 goto err; 644 645 /* Force the wait wait now to avoid including it in the benchmark */ 646 err = i915_vma_sync(vma); 647 if (err) 648 goto err_pin; 649 650 return vma; 651 652 err_pin: 653 i915_vma_unpin(vma); 654 err: 655 i915_gem_object_put(obj); 656 return ERR_PTR(err); 657 } 658 659 static struct i915_request * 660 empty_request(struct intel_engine_cs *engine, 661 struct i915_vma *batch) 662 { 663 struct i915_request *request; 664 int err; 665 666 request = i915_request_create(engine->kernel_context); 667 if (IS_ERR(request)) 668 return request; 669 670 err = engine->emit_bb_start(request, 671 batch->node.start, 672 batch->node.size, 673 I915_DISPATCH_SECURE); 674 if (err) 675 goto out_request; 676 677 i915_request_get(request); 678 out_request: 679 i915_request_add(request); 680 return err ? ERR_PTR(err) : request; 681 } 682 683 static int live_empty_request(void *arg) 684 { 685 struct drm_i915_private *i915 = arg; 686 struct intel_engine_cs *engine; 687 struct igt_live_test t; 688 struct i915_vma *batch; 689 int err = 0; 690 691 /* 692 * Submit various sized batches of empty requests, to each engine 693 * (individually), and wait for the batch to complete. We can check 694 * the overhead of submitting requests to the hardware. 
695 */ 696 697 batch = empty_batch(i915); 698 if (IS_ERR(batch)) 699 return PTR_ERR(batch); 700 701 for_each_uabi_engine(engine, i915) { 702 IGT_TIMEOUT(end_time); 703 struct i915_request *request; 704 unsigned long n, prime; 705 ktime_t times[2] = {}; 706 707 err = igt_live_test_begin(&t, i915, __func__, engine->name); 708 if (err) 709 goto out_batch; 710 711 intel_engine_pm_get(engine); 712 713 /* Warmup / preload */ 714 request = empty_request(engine, batch); 715 if (IS_ERR(request)) { 716 err = PTR_ERR(request); 717 intel_engine_pm_put(engine); 718 goto out_batch; 719 } 720 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 721 722 for_each_prime_number_from(prime, 1, 8192) { 723 times[1] = ktime_get_raw(); 724 725 for (n = 0; n < prime; n++) { 726 i915_request_put(request); 727 request = empty_request(engine, batch); 728 if (IS_ERR(request)) { 729 err = PTR_ERR(request); 730 intel_engine_pm_put(engine); 731 goto out_batch; 732 } 733 } 734 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 735 736 times[1] = ktime_sub(ktime_get_raw(), times[1]); 737 if (prime == 1) 738 times[0] = times[1]; 739 740 if (__igt_timeout(end_time, NULL)) 741 break; 742 } 743 i915_request_put(request); 744 intel_engine_pm_put(engine); 745 746 err = igt_live_test_end(&t); 747 if (err) 748 goto out_batch; 749 750 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 751 engine->name, 752 ktime_to_ns(times[0]), 753 prime, div64_u64(ktime_to_ns(times[1]), prime)); 754 } 755 756 out_batch: 757 i915_vma_unpin(batch); 758 i915_vma_put(batch); 759 return err; 760 } 761 762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 763 { 764 struct drm_i915_gem_object *obj; 765 const int gen = INTEL_GEN(i915); 766 struct i915_vma *vma; 767 u32 *cmd; 768 int err; 769 770 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 771 if (IS_ERR(obj)) 772 return ERR_CAST(obj); 773 774 vma = i915_vma_instance(obj, i915->gt.vm, NULL); 775 if (IS_ERR(vma)) { 776 err = PTR_ERR(vma); 777 goto err; 778 } 779 780 err = i915_vma_pin(vma, 0, 0, PIN_USER); 781 if (err) 782 goto err; 783 784 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC); 785 if (IS_ERR(cmd)) { 786 err = PTR_ERR(cmd); 787 goto err; 788 } 789 790 if (gen >= 8) { 791 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 792 *cmd++ = lower_32_bits(vma->node.start); 793 *cmd++ = upper_32_bits(vma->node.start); 794 } else if (gen >= 6) { 795 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 796 *cmd++ = lower_32_bits(vma->node.start); 797 } else { 798 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 799 *cmd++ = lower_32_bits(vma->node.start); 800 } 801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 802 803 __i915_gem_object_flush_map(obj, 0, 64); 804 i915_gem_object_unpin_map(obj); 805 806 intel_gt_chipset_flush(&i915->gt); 807 808 return vma; 809 810 err: 811 i915_gem_object_put(obj); 812 return ERR_PTR(err); 813 } 814 815 static int recursive_batch_resolve(struct i915_vma *batch) 816 { 817 u32 *cmd; 818 819 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); 820 if (IS_ERR(cmd)) 821 return PTR_ERR(cmd); 822 823 *cmd = MI_BATCH_BUFFER_END; 824 825 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 826 i915_gem_object_unpin_map(batch->obj); 827 828 intel_gt_chipset_flush(batch->vm->gt); 829 830 return 0; 831 } 832 833 static int live_all_engines(void *arg) 834 { 835 struct drm_i915_private *i915 = arg; 836 const unsigned int nengines = num_uabi_engines(i915); 837 struct intel_engine_cs *engine; 838 struct i915_request **request; 839 
	struct igt_live_test t;
	struct i915_vma *batch;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines simultaneously. We
	 * send a recursive batch to each engine - checking that we don't
	 * block doing so, and that they don't complete too soon.
	 */

	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
	if (!request)
		return -ENOMEM;

	err = igt_live_test_begin(&t, i915, __func__, "");
	if (err)
		goto out_free;

	batch = recursive_batch(i915);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
		goto out_free;
	}

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		request[idx] = intel_engine_create_kernel_request(engine);
		if (IS_ERR(request[idx])) {
			err = PTR_ERR(request[idx]);
			pr_err("%s: Request allocation failed with err=%d\n",
			       __func__, err);
			goto out_request;
		}

		i915_vma_lock(batch);
		err = i915_request_await_object(request[idx], batch->obj, 0);
		if (err == 0)
			err = i915_vma_move_to_active(batch, request[idx], 0);
		i915_vma_unlock(batch);
		GEM_BUG_ON(err);

		err = engine->emit_bb_start(request[idx],
					    batch->node.start,
					    batch->node.size,
					    0);
		GEM_BUG_ON(err);
		request[idx]->batch = batch;

		i915_request_get(request[idx]);
		i915_request_add(request[idx]);
		idx++;
	}

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		if (i915_request_completed(request[idx])) {
			pr_err("%s(%s): request completed too early!\n",
			       __func__, engine->name);
			err = -EINVAL;
			goto out_request;
		}
		idx++;
	}

	err = recursive_batch_resolve(batch);
	if (err) {
		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
		goto out_request;
	}

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		long timeout;

		timeout = i915_request_wait(request[idx], 0,
					    MAX_SCHEDULE_TIMEOUT);
		if (timeout < 0) {
			err = timeout;
			pr_err("%s: error waiting for request on %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_request;
		}

		GEM_BUG_ON(!i915_request_completed(request[idx]));
		i915_request_put(request[idx]);
		request[idx] = NULL;
		idx++;
	}

	err = igt_live_test_end(&t);

out_request:
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		if (request[idx])
			i915_request_put(request[idx]);
		idx++;
	}
	i915_vma_unpin(batch);
	i915_vma_put(batch);
out_free:
	kfree(request);
	return err;
}

static int live_sequential_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	struct i915_request **request;
	struct i915_request *prev = NULL;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines sequentially, such
	 * that each successive request waits for the earlier ones. This
	 * tests that we don't execute requests out of order, even though
	 * they are running on independent engines.
962 */ 963 964 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 965 if (!request) 966 return -ENOMEM; 967 968 err = igt_live_test_begin(&t, i915, __func__, ""); 969 if (err) 970 goto out_free; 971 972 idx = 0; 973 for_each_uabi_engine(engine, i915) { 974 struct i915_vma *batch; 975 976 batch = recursive_batch(i915); 977 if (IS_ERR(batch)) { 978 err = PTR_ERR(batch); 979 pr_err("%s: Unable to create batch for %s, err=%d\n", 980 __func__, engine->name, err); 981 goto out_free; 982 } 983 984 request[idx] = intel_engine_create_kernel_request(engine); 985 if (IS_ERR(request[idx])) { 986 err = PTR_ERR(request[idx]); 987 pr_err("%s: Request allocation failed for %s with err=%d\n", 988 __func__, engine->name, err); 989 goto out_request; 990 } 991 992 if (prev) { 993 err = i915_request_await_dma_fence(request[idx], 994 &prev->fence); 995 if (err) { 996 i915_request_add(request[idx]); 997 pr_err("%s: Request await failed for %s with err=%d\n", 998 __func__, engine->name, err); 999 goto out_request; 1000 } 1001 } 1002 1003 i915_vma_lock(batch); 1004 err = i915_request_await_object(request[idx], 1005 batch->obj, false); 1006 if (err == 0) 1007 err = i915_vma_move_to_active(batch, request[idx], 0); 1008 i915_vma_unlock(batch); 1009 GEM_BUG_ON(err); 1010 1011 err = engine->emit_bb_start(request[idx], 1012 batch->node.start, 1013 batch->node.size, 1014 0); 1015 GEM_BUG_ON(err); 1016 request[idx]->batch = batch; 1017 1018 i915_request_get(request[idx]); 1019 i915_request_add(request[idx]); 1020 1021 prev = request[idx]; 1022 idx++; 1023 } 1024 1025 idx = 0; 1026 for_each_uabi_engine(engine, i915) { 1027 long timeout; 1028 1029 if (i915_request_completed(request[idx])) { 1030 pr_err("%s(%s): request completed too early!\n", 1031 __func__, engine->name); 1032 err = -EINVAL; 1033 goto out_request; 1034 } 1035 1036 err = recursive_batch_resolve(request[idx]->batch); 1037 if (err) { 1038 pr_err("%s: failed to resolve batch, err=%d\n", 1039 __func__, err); 1040 goto out_request; 1041 } 1042 1043 timeout = i915_request_wait(request[idx], 0, 1044 MAX_SCHEDULE_TIMEOUT); 1045 if (timeout < 0) { 1046 err = timeout; 1047 pr_err("%s: error waiting for request on %s, err=%d\n", 1048 __func__, engine->name, err); 1049 goto out_request; 1050 } 1051 1052 GEM_BUG_ON(!i915_request_completed(request[idx])); 1053 idx++; 1054 } 1055 1056 err = igt_live_test_end(&t); 1057 1058 out_request: 1059 idx = 0; 1060 for_each_uabi_engine(engine, i915) { 1061 u32 *cmd; 1062 1063 if (!request[idx]) 1064 break; 1065 1066 cmd = i915_gem_object_pin_map(request[idx]->batch->obj, 1067 I915_MAP_WC); 1068 if (!IS_ERR(cmd)) { 1069 *cmd = MI_BATCH_BUFFER_END; 1070 1071 __i915_gem_object_flush_map(request[idx]->batch->obj, 1072 0, sizeof(*cmd)); 1073 i915_gem_object_unpin_map(request[idx]->batch->obj); 1074 1075 intel_gt_chipset_flush(engine->gt); 1076 } 1077 1078 i915_vma_put(request[idx]->batch); 1079 i915_request_put(request[idx]); 1080 idx++; 1081 } 1082 out_free: 1083 kfree(request); 1084 return err; 1085 } 1086 1087 static int __live_parallel_engine1(void *arg) 1088 { 1089 struct intel_engine_cs *engine = arg; 1090 IGT_TIMEOUT(end_time); 1091 unsigned long count; 1092 int err = 0; 1093 1094 count = 0; 1095 intel_engine_pm_get(engine); 1096 do { 1097 struct i915_request *rq; 1098 1099 rq = i915_request_create(engine->kernel_context); 1100 if (IS_ERR(rq)) { 1101 err = PTR_ERR(rq); 1102 break; 1103 } 1104 1105 i915_request_get(rq); 1106 i915_request_add(rq); 1107 1108 err = 0; 1109 if (i915_request_wait(rq, 0, HZ / 5) < 0) 1110 err = 
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	intel_engine_pm_put(engine);

	pr_info("%s: %lu request + sync\n", engine->name, count);
	return err;
}

static int __live_parallel_engineN(void *arg)
{
	struct intel_engine_cs *engine = arg;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;

	count = 0;
	intel_engine_pm_get(engine);
	do {
		struct i915_request *rq;

		rq = i915_request_create(engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));
	intel_engine_pm_put(engine);

	pr_info("%s: %lu requests\n", engine->name, count);
	return err;
}

static bool wake_all(struct drm_i915_private *i915)
{
	if (atomic_dec_and_test(&i915->selftest.counter)) {
		wake_up_var(&i915->selftest.counter);
		return true;
	}

	return false;
}

static int wait_for_all(struct drm_i915_private *i915)
{
	if (wake_all(i915))
		return 0;

	if (wait_var_event_timeout(&i915->selftest.counter,
				   !atomic_read(&i915->selftest.counter),
				   i915_selftest.timeout_jiffies))
		return 0;

	return -ETIME;
}

static int __live_parallel_spin(void *arg)
{
	struct intel_engine_cs *engine = arg;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	/*
	 * Create a spinner running for eternity on each engine. If a second
	 * spinner is incorrectly placed on the same engine, it will not be
	 * able to start in time.
	 */

	if (igt_spinner_init(&spin, engine->gt)) {
		wake_all(engine->i915);
		return -ENOMEM;
	}

	intel_engine_pm_get(engine);
	rq = igt_spinner_create_request(&spin,
					engine->kernel_context,
					MI_NOOP); /* no preemption */
	intel_engine_pm_put(engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		if (err == -ENODEV)
			err = 0;
		wake_all(engine->i915);
		goto out_spin;
	}

	i915_request_get(rq);
	i915_request_add(rq);
	if (igt_wait_for_spinner(&spin, rq)) {
		/* Occupy this engine for the whole test */
		err = wait_for_all(engine->i915);
	} else {
		pr_err("Failed to start spinner on %s\n", engine->name);
		err = -EINVAL;
	}
	igt_spinner_end(&spin);

	if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
		err = -EIO;
	i915_request_put(rq);

out_spin:
	igt_spinner_fini(&spin);
	return err;
}

static int live_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		__live_parallel_engine1,
		__live_parallel_engineN,
		__live_parallel_spin,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct task_struct **tsk;
	int err = 0;

	/*
	 * Check we can submit requests to all engines concurrently. This
	 * tests that we load up the system maximally.
1242 */ 1243 1244 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL); 1245 if (!tsk) 1246 return -ENOMEM; 1247 1248 for (fn = func; !err && *fn; fn++) { 1249 char name[KSYM_NAME_LEN]; 1250 struct igt_live_test t; 1251 unsigned int idx; 1252 1253 snprintf(name, sizeof(name), "%ps", *fn); 1254 err = igt_live_test_begin(&t, i915, __func__, name); 1255 if (err) 1256 break; 1257 1258 atomic_set(&i915->selftest.counter, nengines); 1259 1260 idx = 0; 1261 for_each_uabi_engine(engine, i915) { 1262 tsk[idx] = kthread_run(*fn, engine, 1263 "igt/parallel:%s", 1264 engine->name); 1265 if (IS_ERR(tsk[idx])) { 1266 err = PTR_ERR(tsk[idx]); 1267 break; 1268 } 1269 get_task_struct(tsk[idx++]); 1270 } 1271 1272 yield(); /* start all threads before we kthread_stop() */ 1273 1274 idx = 0; 1275 for_each_uabi_engine(engine, i915) { 1276 int status; 1277 1278 if (IS_ERR(tsk[idx])) 1279 break; 1280 1281 status = kthread_stop(tsk[idx]); 1282 if (status && !err) 1283 err = status; 1284 1285 put_task_struct(tsk[idx++]); 1286 } 1287 1288 if (igt_live_test_end(&t)) 1289 err = -EIO; 1290 } 1291 1292 kfree(tsk); 1293 return err; 1294 } 1295 1296 static int 1297 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1298 { 1299 struct i915_request *rq; 1300 int ret; 1301 1302 /* 1303 * Before execlists, all contexts share the same ringbuffer. With 1304 * execlists, each context/engine has a separate ringbuffer and 1305 * for the purposes of this test, inexhaustible. 1306 * 1307 * For the global ringbuffer though, we have to be very careful 1308 * that we do not wrap while preventing the execution of requests 1309 * with a unsignaled fence. 1310 */ 1311 if (HAS_EXECLISTS(ctx->i915)) 1312 return INT_MAX; 1313 1314 rq = igt_request_alloc(ctx, engine); 1315 if (IS_ERR(rq)) { 1316 ret = PTR_ERR(rq); 1317 } else { 1318 int sz; 1319 1320 ret = rq->ring->size - rq->reserved_space; 1321 i915_request_add(rq); 1322 1323 sz = rq->ring->emit - rq->head; 1324 if (sz < 0) 1325 sz += rq->ring->size; 1326 ret /= sz; 1327 ret /= 2; /* leave half spare, in case of emergency! */ 1328 } 1329 1330 return ret; 1331 } 1332 1333 static int live_breadcrumbs_smoketest(void *arg) 1334 { 1335 struct drm_i915_private *i915 = arg; 1336 const unsigned int nengines = num_uabi_engines(i915); 1337 const unsigned int ncpus = num_online_cpus(); 1338 unsigned long num_waits, num_fences; 1339 struct intel_engine_cs *engine; 1340 struct task_struct **threads; 1341 struct igt_live_test live; 1342 intel_wakeref_t wakeref; 1343 struct smoketest *smoke; 1344 unsigned int n, idx; 1345 struct file *file; 1346 int ret = 0; 1347 1348 /* 1349 * Smoketest our breadcrumb/signal handling for requests across multiple 1350 * threads. A very simple test to only catch the most egregious of bugs. 1351 * See __igt_breadcrumbs_smoketest(); 1352 * 1353 * On real hardware this time. 
1354 */ 1355 1356 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1357 1358 file = mock_file(i915); 1359 if (IS_ERR(file)) { 1360 ret = PTR_ERR(file); 1361 goto out_rpm; 1362 } 1363 1364 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1365 if (!smoke) { 1366 ret = -ENOMEM; 1367 goto out_file; 1368 } 1369 1370 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1371 if (!threads) { 1372 ret = -ENOMEM; 1373 goto out_smoke; 1374 } 1375 1376 smoke[0].request_alloc = __live_request_alloc; 1377 smoke[0].ncontexts = 64; 1378 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1379 sizeof(*smoke[0].contexts), 1380 GFP_KERNEL); 1381 if (!smoke[0].contexts) { 1382 ret = -ENOMEM; 1383 goto out_threads; 1384 } 1385 1386 for (n = 0; n < smoke[0].ncontexts; n++) { 1387 smoke[0].contexts[n] = live_context(i915, file); 1388 if (!smoke[0].contexts[n]) { 1389 ret = -ENOMEM; 1390 goto out_contexts; 1391 } 1392 } 1393 1394 ret = igt_live_test_begin(&live, i915, __func__, ""); 1395 if (ret) 1396 goto out_contexts; 1397 1398 idx = 0; 1399 for_each_uabi_engine(engine, i915) { 1400 smoke[idx] = smoke[0]; 1401 smoke[idx].engine = engine; 1402 smoke[idx].max_batch = 1403 max_batches(smoke[0].contexts[0], engine); 1404 if (smoke[idx].max_batch < 0) { 1405 ret = smoke[idx].max_batch; 1406 goto out_flush; 1407 } 1408 /* One ring interleaved between requests from all cpus */ 1409 smoke[idx].max_batch /= num_online_cpus() + 1; 1410 pr_debug("Limiting batches to %d requests on %s\n", 1411 smoke[idx].max_batch, engine->name); 1412 1413 for (n = 0; n < ncpus; n++) { 1414 struct task_struct *tsk; 1415 1416 tsk = kthread_run(__igt_breadcrumbs_smoketest, 1417 &smoke[idx], "igt/%d.%d", idx, n); 1418 if (IS_ERR(tsk)) { 1419 ret = PTR_ERR(tsk); 1420 goto out_flush; 1421 } 1422 1423 get_task_struct(tsk); 1424 threads[idx * ncpus + n] = tsk; 1425 } 1426 1427 idx++; 1428 } 1429 1430 yield(); /* start all threads before we begin */ 1431 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1432 1433 out_flush: 1434 idx = 0; 1435 num_waits = 0; 1436 num_fences = 0; 1437 for_each_uabi_engine(engine, i915) { 1438 for (n = 0; n < ncpus; n++) { 1439 struct task_struct *tsk = threads[idx * ncpus + n]; 1440 int err; 1441 1442 if (!tsk) 1443 continue; 1444 1445 err = kthread_stop(tsk); 1446 if (err < 0 && !ret) 1447 ret = err; 1448 1449 put_task_struct(tsk); 1450 } 1451 1452 num_waits += atomic_long_read(&smoke[idx].num_waits); 1453 num_fences += atomic_long_read(&smoke[idx].num_fences); 1454 idx++; 1455 } 1456 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1457 num_waits, num_fences, idx, ncpus); 1458 1459 ret = igt_live_test_end(&live) ?: ret; 1460 out_contexts: 1461 kfree(smoke[0].contexts); 1462 out_threads: 1463 kfree(threads); 1464 out_smoke: 1465 kfree(smoke); 1466 out_file: 1467 fput(file); 1468 out_rpm: 1469 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1470 1471 return ret; 1472 } 1473 1474 int i915_request_live_selftests(struct drm_i915_private *i915) 1475 { 1476 static const struct i915_subtest tests[] = { 1477 SUBTEST(live_nop_request), 1478 SUBTEST(live_all_engines), 1479 SUBTEST(live_sequential_engines), 1480 SUBTEST(live_parallel_engines), 1481 SUBTEST(live_empty_request), 1482 SUBTEST(live_breadcrumbs_smoketest), 1483 }; 1484 1485 if (intel_gt_is_wedged(&i915->gt)) 1486 return 0; 1487 1488 return i915_subtests(tests, i915); 1489 } 1490 1491 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1492 { 1493 struct i915_request *rq; 1494 struct dma_fence 

	rq = intel_engine_create_kernel_request(ce->engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	fence = i915_active_fence_get(&ce->timeline->last_request);
	if (fence) {
		i915_request_await_dma_fence(rq, fence);
		dma_fence_put(fence);
	}

	rq = i915_request_get(rq);
	i915_request_add(rq);
	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
		err = -ETIME;
	i915_request_put(rq);

	while (!err && !intel_engine_is_idle(ce->engine))
		intel_engine_flush_submission(ce->engine);

	return err;
}

struct perf_stats {
	struct intel_engine_cs *engine;
	unsigned long count;
	ktime_t time;
	ktime_t busy;
	u64 runtime;
};

struct perf_series {
	struct drm_i915_private *i915;
	unsigned int nengines;
	struct intel_context *ce[];
};

static int cmp_u32(const void *A, const void *B)
{
	const u32 *a = A, *b = B;

	return *a - *b;
}

static u32 trifilter(u32 *a)
{
	u64 sum;

#define TF_COUNT 5
	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);

	sum = mul_u32_u32(a[2], 2);
	sum += a[1];
	sum += a[3];

	GEM_BUG_ON(sum > U32_MAX);
	return sum;
#define TF_BIAS 2
}

static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
{
	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);

	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
}

static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
{
	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
{
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = offset;
	*cs++ = 0;
	*cs++ = value;

	return cs;
}

static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
{
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		mode;
	*cs++ = value;
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
{
	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
}

static void semaphore_set(u32 *sema, u32 value)
{
	WRITE_ONCE(*sema, value);
	wmb(); /* flush the update to the cache, and beyond */
}

static u32 *hwsp_scratch(const struct intel_context *ce)
{
	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
}

static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
{
	return (i915_ggtt_offset(ce->engine->status_page.vma) +
		offset_in_page(dw));
}

static int measure_semaphore_response(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	struct i915_request *rq;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how many cycles it takes for the HW to detect the change
	 * in a semaphore value.
1630 * 1631 * A: read CS_TIMESTAMP from CPU 1632 * poke semaphore 1633 * B: read CS_TIMESTAMP on GPU 1634 * 1635 * Semaphore latency: B - A 1636 */ 1637 1638 semaphore_set(sema, -1); 1639 1640 rq = i915_request_create(ce); 1641 if (IS_ERR(rq)) 1642 return PTR_ERR(rq); 1643 1644 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 1645 if (IS_ERR(cs)) { 1646 i915_request_add(rq); 1647 err = PTR_ERR(cs); 1648 goto err; 1649 } 1650 1651 cs = emit_store_dw(cs, offset, 0); 1652 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1653 cs = emit_semaphore_poll_until(cs, offset, i); 1654 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1655 cs = emit_store_dw(cs, offset, 0); 1656 } 1657 1658 intel_ring_advance(rq, cs); 1659 i915_request_add(rq); 1660 1661 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1662 err = -EIO; 1663 goto err; 1664 } 1665 1666 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1667 preempt_disable(); 1668 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1669 semaphore_set(sema, i); 1670 preempt_enable(); 1671 1672 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1673 err = -EIO; 1674 goto err; 1675 } 1676 1677 elapsed[i - 1] = sema[i] - cycles; 1678 } 1679 1680 cycles = trifilter(elapsed); 1681 pr_info("%s: semaphore response %d cycles, %lluns\n", 1682 ce->engine->name, cycles >> TF_BIAS, 1683 cycles_to_ns(ce->engine, cycles)); 1684 1685 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1686 1687 err: 1688 intel_gt_set_wedged(ce->engine->gt); 1689 return err; 1690 } 1691 1692 static int measure_idle_dispatch(struct intel_context *ce) 1693 { 1694 u32 *sema = hwsp_scratch(ce); 1695 const u32 offset = hwsp_offset(ce, sema); 1696 u32 elapsed[TF_COUNT], cycles; 1697 u32 *cs; 1698 int err; 1699 int i; 1700 1701 /* 1702 * Measure how long it takes for us to submit a request while the 1703 * engine is idle, but is resting in our context. 
1704 * 1705 * A: read CS_TIMESTAMP from CPU 1706 * submit request 1707 * B: read CS_TIMESTAMP on GPU 1708 * 1709 * Submission latency: B - A 1710 */ 1711 1712 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 1713 struct i915_request *rq; 1714 1715 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1716 if (err) 1717 return err; 1718 1719 rq = i915_request_create(ce); 1720 if (IS_ERR(rq)) { 1721 err = PTR_ERR(rq); 1722 goto err; 1723 } 1724 1725 cs = intel_ring_begin(rq, 4); 1726 if (IS_ERR(cs)) { 1727 i915_request_add(rq); 1728 err = PTR_ERR(cs); 1729 goto err; 1730 } 1731 1732 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1733 1734 intel_ring_advance(rq, cs); 1735 1736 preempt_disable(); 1737 local_bh_disable(); 1738 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1739 i915_request_add(rq); 1740 local_bh_enable(); 1741 preempt_enable(); 1742 } 1743 1744 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1745 if (err) 1746 goto err; 1747 1748 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 1749 elapsed[i] = sema[i] - elapsed[i]; 1750 1751 cycles = trifilter(elapsed); 1752 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 1753 ce->engine->name, cycles >> TF_BIAS, 1754 cycles_to_ns(ce->engine, cycles)); 1755 1756 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1757 1758 err: 1759 intel_gt_set_wedged(ce->engine->gt); 1760 return err; 1761 } 1762 1763 static int measure_busy_dispatch(struct intel_context *ce) 1764 { 1765 u32 *sema = hwsp_scratch(ce); 1766 const u32 offset = hwsp_offset(ce, sema); 1767 u32 elapsed[TF_COUNT + 1], cycles; 1768 u32 *cs; 1769 int err; 1770 int i; 1771 1772 /* 1773 * Measure how long it takes for us to submit a request while the 1774 * engine is busy, polling on a semaphore in our context. With 1775 * direct submission, this will include the cost of a lite restore. 
1776 * 1777 * A: read CS_TIMESTAMP from CPU 1778 * submit request 1779 * B: read CS_TIMESTAMP on GPU 1780 * 1781 * Submission latency: B - A 1782 */ 1783 1784 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1785 struct i915_request *rq; 1786 1787 rq = i915_request_create(ce); 1788 if (IS_ERR(rq)) { 1789 err = PTR_ERR(rq); 1790 goto err; 1791 } 1792 1793 cs = intel_ring_begin(rq, 12); 1794 if (IS_ERR(cs)) { 1795 i915_request_add(rq); 1796 err = PTR_ERR(cs); 1797 goto err; 1798 } 1799 1800 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 1801 cs = emit_semaphore_poll_until(cs, offset, i); 1802 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1803 1804 intel_ring_advance(rq, cs); 1805 1806 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 1807 err = -EIO; 1808 goto err; 1809 } 1810 1811 preempt_disable(); 1812 local_bh_disable(); 1813 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1814 i915_request_add(rq); 1815 local_bh_enable(); 1816 semaphore_set(sema, i - 1); 1817 preempt_enable(); 1818 } 1819 1820 wait_for(READ_ONCE(sema[i - 1]), 500); 1821 semaphore_set(sema, i - 1); 1822 1823 for (i = 1; i <= TF_COUNT; i++) { 1824 GEM_BUG_ON(sema[i] == -1); 1825 elapsed[i - 1] = sema[i] - elapsed[i]; 1826 } 1827 1828 cycles = trifilter(elapsed); 1829 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 1830 ce->engine->name, cycles >> TF_BIAS, 1831 cycles_to_ns(ce->engine, cycles)); 1832 1833 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1834 1835 err: 1836 intel_gt_set_wedged(ce->engine->gt); 1837 return err; 1838 } 1839 1840 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 1841 { 1842 const u32 offset = 1843 i915_ggtt_offset(engine->status_page.vma) + 1844 offset_in_page(sema); 1845 struct i915_request *rq; 1846 u32 *cs; 1847 1848 rq = i915_request_create(engine->kernel_context); 1849 if (IS_ERR(rq)) 1850 return PTR_ERR(rq); 1851 1852 cs = intel_ring_begin(rq, 4); 1853 if (IS_ERR(cs)) { 1854 i915_request_add(rq); 1855 return PTR_ERR(cs); 1856 } 1857 1858 cs = emit_semaphore_poll(cs, mode, value, offset); 1859 1860 intel_ring_advance(rq, cs); 1861 i915_request_add(rq); 1862 1863 return 0; 1864 } 1865 1866 static int measure_inter_request(struct intel_context *ce) 1867 { 1868 u32 *sema = hwsp_scratch(ce); 1869 const u32 offset = hwsp_offset(ce, sema); 1870 u32 elapsed[TF_COUNT + 1], cycles; 1871 struct i915_sw_fence *submit; 1872 int i, err; 1873 1874 /* 1875 * Measure how long it takes to advance from one request into the 1876 * next. Between each request we flush the GPU caches to memory, 1877 * update the breadcrumbs, and then invalidate those caches. 1878 * We queue up all the requests to be submitted in one batch so 1879 * it should be one set of contiguous measurements. 
1880 * 1881 * A: read CS_TIMESTAMP on GPU 1882 * advance request 1883 * B: read CS_TIMESTAMP on GPU 1884 * 1885 * Request latency: B - A 1886 */ 1887 1888 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 1889 if (err) 1890 return err; 1891 1892 submit = heap_fence_create(GFP_KERNEL); 1893 if (!submit) { 1894 semaphore_set(sema, 1); 1895 return -ENOMEM; 1896 } 1897 1898 intel_engine_flush_submission(ce->engine); 1899 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1900 struct i915_request *rq; 1901 u32 *cs; 1902 1903 rq = i915_request_create(ce); 1904 if (IS_ERR(rq)) { 1905 err = PTR_ERR(rq); 1906 goto err_submit; 1907 } 1908 1909 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 1910 submit, 1911 GFP_KERNEL); 1912 if (err < 0) { 1913 i915_request_add(rq); 1914 goto err_submit; 1915 } 1916 1917 cs = intel_ring_begin(rq, 4); 1918 if (IS_ERR(cs)) { 1919 i915_request_add(rq); 1920 err = PTR_ERR(cs); 1921 goto err_submit; 1922 } 1923 1924 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1925 1926 intel_ring_advance(rq, cs); 1927 i915_request_add(rq); 1928 } 1929 local_bh_disable(); 1930 i915_sw_fence_commit(submit); 1931 local_bh_enable(); 1932 intel_engine_flush_submission(ce->engine); 1933 heap_fence_put(submit); 1934 1935 semaphore_set(sema, 1); 1936 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1937 if (err) 1938 goto err; 1939 1940 for (i = 1; i <= TF_COUNT; i++) 1941 elapsed[i - 1] = sema[i + 1] - sema[i]; 1942 1943 cycles = trifilter(elapsed); 1944 pr_info("%s: inter-request latency %d cycles, %lluns\n", 1945 ce->engine->name, cycles >> TF_BIAS, 1946 cycles_to_ns(ce->engine, cycles)); 1947 1948 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1949 1950 err_submit: 1951 i915_sw_fence_commit(submit); 1952 heap_fence_put(submit); 1953 semaphore_set(sema, 1); 1954 err: 1955 intel_gt_set_wedged(ce->engine->gt); 1956 return err; 1957 } 1958 1959 static int measure_context_switch(struct intel_context *ce) 1960 { 1961 u32 *sema = hwsp_scratch(ce); 1962 const u32 offset = hwsp_offset(ce, sema); 1963 struct i915_request *fence = NULL; 1964 u32 elapsed[TF_COUNT + 1], cycles; 1965 int i, j, err; 1966 u32 *cs; 1967 1968 /* 1969 * Measure how long it takes to advance from one request in one 1970 * context to a request in another context. This allows us to 1971 * measure how long the context save/restore take, along with all 1972 * the inter-context setup we require. 
1973 * 1974 * A: read CS_TIMESTAMP on GPU 1975 * switch context 1976 * B: read CS_TIMESTAMP on GPU 1977 * 1978 * Context switch latency: B - A 1979 */ 1980 1981 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 1982 if (err) 1983 return err; 1984 1985 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1986 struct intel_context *arr[] = { 1987 ce, ce->engine->kernel_context 1988 }; 1989 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32); 1990 1991 for (j = 0; j < ARRAY_SIZE(arr); j++) { 1992 struct i915_request *rq; 1993 1994 rq = i915_request_create(arr[j]); 1995 if (IS_ERR(rq)) { 1996 err = PTR_ERR(rq); 1997 goto err_fence; 1998 } 1999 2000 if (fence) { 2001 err = i915_request_await_dma_fence(rq, 2002 &fence->fence); 2003 if (err) { 2004 i915_request_add(rq); 2005 goto err_fence; 2006 } 2007 } 2008 2009 cs = intel_ring_begin(rq, 4); 2010 if (IS_ERR(cs)) { 2011 i915_request_add(rq); 2012 err = PTR_ERR(cs); 2013 goto err_fence; 2014 } 2015 2016 cs = emit_timestamp_store(cs, ce, addr); 2017 addr += sizeof(u32); 2018 2019 intel_ring_advance(rq, cs); 2020 2021 i915_request_put(fence); 2022 fence = i915_request_get(rq); 2023 2024 i915_request_add(rq); 2025 } 2026 } 2027 i915_request_put(fence); 2028 intel_engine_flush_submission(ce->engine); 2029 2030 semaphore_set(sema, 1); 2031 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2032 if (err) 2033 goto err; 2034 2035 for (i = 1; i <= TF_COUNT; i++) 2036 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1]; 2037 2038 cycles = trifilter(elapsed); 2039 pr_info("%s: context switch latency %d cycles, %lluns\n", 2040 ce->engine->name, cycles >> TF_BIAS, 2041 cycles_to_ns(ce->engine, cycles)); 2042 2043 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2044 2045 err_fence: 2046 i915_request_put(fence); 2047 semaphore_set(sema, 1); 2048 err: 2049 intel_gt_set_wedged(ce->engine->gt); 2050 return err; 2051 } 2052 2053 static int measure_preemption(struct intel_context *ce) 2054 { 2055 u32 *sema = hwsp_scratch(ce); 2056 const u32 offset = hwsp_offset(ce, sema); 2057 u32 elapsed[TF_COUNT], cycles; 2058 u32 *cs; 2059 int err; 2060 int i; 2061 2062 /* 2063 * We measure two latencies while triggering preemption. The first 2064 * latency is how long it takes for us to submit a preempting request. 2065 * The second latency is how it takes for us to return from the 2066 * preemption back to the original context. 
2067 * 2068 * A: read CS_TIMESTAMP from CPU 2069 * submit preemption 2070 * B: read CS_TIMESTAMP on GPU (in preempting context) 2071 * context switch 2072 * C: read CS_TIMESTAMP on GPU (in original context) 2073 * 2074 * Preemption dispatch latency: B - A 2075 * Preemption switch latency: C - B 2076 */ 2077 2078 if (!intel_engine_has_preemption(ce->engine)) 2079 return 0; 2080 2081 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2082 u32 addr = offset + 2 * i * sizeof(u32); 2083 struct i915_request *rq; 2084 2085 rq = i915_request_create(ce); 2086 if (IS_ERR(rq)) { 2087 err = PTR_ERR(rq); 2088 goto err; 2089 } 2090 2091 cs = intel_ring_begin(rq, 12); 2092 if (IS_ERR(cs)) { 2093 i915_request_add(rq); 2094 err = PTR_ERR(cs); 2095 goto err; 2096 } 2097 2098 cs = emit_store_dw(cs, addr, -1); 2099 cs = emit_semaphore_poll_until(cs, offset, i); 2100 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2101 2102 intel_ring_advance(rq, cs); 2103 i915_request_add(rq); 2104 2105 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2106 err = -EIO; 2107 goto err; 2108 } 2109 2110 rq = i915_request_create(ce->engine->kernel_context); 2111 if (IS_ERR(rq)) { 2112 err = PTR_ERR(rq); 2113 goto err; 2114 } 2115 2116 cs = intel_ring_begin(rq, 8); 2117 if (IS_ERR(cs)) { 2118 i915_request_add(rq); 2119 err = PTR_ERR(cs); 2120 goto err; 2121 } 2122 2123 cs = emit_timestamp_store(cs, ce, addr); 2124 cs = emit_store_dw(cs, offset, i); 2125 2126 intel_ring_advance(rq, cs); 2127 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2128 2129 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2130 i915_request_add(rq); 2131 } 2132 2133 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2134 err = -EIO; 2135 goto err; 2136 } 2137 2138 for (i = 1; i <= TF_COUNT; i++) 2139 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2140 2141 cycles = trifilter(elapsed); 2142 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2143 ce->engine->name, cycles >> TF_BIAS, 2144 cycles_to_ns(ce->engine, cycles)); 2145 2146 for (i = 1; i <= TF_COUNT; i++) 2147 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2148 2149 cycles = trifilter(elapsed); 2150 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2151 ce->engine->name, cycles >> TF_BIAS, 2152 cycles_to_ns(ce->engine, cycles)); 2153 2154 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2155 2156 err: 2157 intel_gt_set_wedged(ce->engine->gt); 2158 return err; 2159 } 2160 2161 struct signal_cb { 2162 struct dma_fence_cb base; 2163 bool seen; 2164 }; 2165 2166 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2167 { 2168 struct signal_cb *s = container_of(cb, typeof(*s), base); 2169 2170 smp_store_mb(s->seen, true); /* be safe, be strong */ 2171 } 2172 2173 static int measure_completion(struct intel_context *ce) 2174 { 2175 u32 *sema = hwsp_scratch(ce); 2176 const u32 offset = hwsp_offset(ce, sema); 2177 u32 elapsed[TF_COUNT], cycles; 2178 u32 *cs; 2179 int err; 2180 int i; 2181 2182 /* 2183 * Measure how long it takes for the signal (interrupt) to be 2184 * sent from the GPU to be processed by the CPU. 
2185 * 2186 * A: read CS_TIMESTAMP on GPU 2187 * signal 2188 * B: read CS_TIMESTAMP from CPU 2189 * 2190 * Completion latency: B - A 2191 */ 2192 2193 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2194 struct signal_cb cb = { .seen = false }; 2195 struct i915_request *rq; 2196 2197 rq = i915_request_create(ce); 2198 if (IS_ERR(rq)) { 2199 err = PTR_ERR(rq); 2200 goto err; 2201 } 2202 2203 cs = intel_ring_begin(rq, 12); 2204 if (IS_ERR(cs)) { 2205 i915_request_add(rq); 2206 err = PTR_ERR(cs); 2207 goto err; 2208 } 2209 2210 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2211 cs = emit_semaphore_poll_until(cs, offset, i); 2212 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2213 2214 intel_ring_advance(rq, cs); 2215 2216 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); 2217 2218 local_bh_disable(); 2219 i915_request_add(rq); 2220 local_bh_enable(); 2221 2222 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { 2223 err = -EIO; 2224 goto err; 2225 } 2226 2227 preempt_disable(); 2228 semaphore_set(sema, i); 2229 while (!READ_ONCE(cb.seen)) 2230 cpu_relax(); 2231 2232 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2233 preempt_enable(); 2234 } 2235 2236 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2237 if (err) 2238 goto err; 2239 2240 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2241 GEM_BUG_ON(sema[i + 1] == -1); 2242 elapsed[i] = elapsed[i] - sema[i + 1]; 2243 } 2244 2245 cycles = trifilter(elapsed); 2246 pr_info("%s: completion latency %d cycles, %lluns\n", 2247 ce->engine->name, cycles >> TF_BIAS, 2248 cycles_to_ns(ce->engine, cycles)); 2249 2250 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2251 2252 err: 2253 intel_gt_set_wedged(ce->engine->gt); 2254 return err; 2255 } 2256 2257 static void rps_pin(struct intel_gt *gt) 2258 { 2259 /* Pin the frequency to max */ 2260 atomic_inc(>->rps.num_waiters); 2261 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 2262 2263 mutex_lock(>->rps.lock); 2264 intel_rps_set(>->rps, gt->rps.max_freq); 2265 mutex_unlock(>->rps.lock); 2266 } 2267 2268 static void rps_unpin(struct intel_gt *gt) 2269 { 2270 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 2271 atomic_dec(>->rps.num_waiters); 2272 } 2273 2274 static int perf_request_latency(void *arg) 2275 { 2276 struct drm_i915_private *i915 = arg; 2277 struct intel_engine_cs *engine; 2278 struct pm_qos_request qos; 2279 int err = 0; 2280 2281 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */ 2282 return 0; 2283 2284 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2285 2286 for_each_uabi_engine(engine, i915) { 2287 struct intel_context *ce; 2288 2289 ce = intel_context_create(engine); 2290 if (IS_ERR(ce)) 2291 goto out; 2292 2293 err = intel_context_pin(ce); 2294 if (err) { 2295 intel_context_put(ce); 2296 goto out; 2297 } 2298 2299 st_engine_heartbeat_disable(engine); 2300 rps_pin(engine->gt); 2301 2302 if (err == 0) 2303 err = measure_semaphore_response(ce); 2304 if (err == 0) 2305 err = measure_idle_dispatch(ce); 2306 if (err == 0) 2307 err = measure_busy_dispatch(ce); 2308 if (err == 0) 2309 err = measure_inter_request(ce); 2310 if (err == 0) 2311 err = measure_context_switch(ce); 2312 if (err == 0) 2313 err = measure_preemption(ce); 2314 if (err == 0) 2315 err = measure_completion(ce); 2316 2317 rps_unpin(engine->gt); 2318 st_engine_heartbeat_enable(engine); 2319 2320 intel_context_unpin(ce); 2321 intel_context_put(ce); 2322 if (err) 2323 goto out; 2324 } 2325 2326 out: 2327 if (igt_flush_test(i915)) 2328 err = -EIO; 2329 
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);
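
	/*
	 * For each submission pattern, sample the engine busy-time and the
	 * context runtime immediately before and after the run. p->busy is
	 * biased by +1 at the start (and corrected by -1 afterwards) so that
	 * zero can mean "engine stats not supported"; p->runtime is seeded
	 * with the negated runtime so that the post-run accumulation leaves
	 * only the delta.
	 *
	 * Busyness is then reported as a fixed-point percentage: with
	 * illustrative numbers, busy = 1234ms over dt = 2000ms gives
	 * integer = 61 and decimal = 70, printed as "61.70%".
	 */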

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
										&now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

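/*
 * Per-engine workloads for perf_parallel_engines() below; each runs in its
 * own kthread against a single engine until the selftest timeout expires:
 *
 *   p_sync0 - wait for each request to complete before submitting the next.
 *   p_sync1 - keep exactly one request in flight by waiting on the request
 *             submitted in the previous iteration.
 *   p_many  - submit requests without waiting, counting how many can be
 *             submitted before the timeout.
 */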
static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

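/*
 * Run each of the workloads above on all uABI engines concurrently, one
 * kthread per engine. Engine runtime PM is held for the duration of each
 * run, and the per-engine request count, busyness, context runtime and
 * walltime are reported once all threads have been stopped.
 */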
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}
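
/*
 * Illustrative example of the lines these perf selftests print to dmesg
 * (the values are made up; engine names and numbers depend on the platform):
 *
 *   p_many  rcs0: { count:38000, busy:99.20%, runtime:1990ms, walltime:2004ms }
 *   p_sync0  rcs0: { count:9500, busy:61.70%, runtime:180ms, walltime:2004ms }
 *
 * A high busy% for p_many alongside a much lower busy% for p_sync0/p_sync1
 * on the same engine suggests the synchronous workloads are dominated by
 * per-request submission and wait overhead rather than GPU execution time.
 */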