/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
#include <linux/sort.h>

#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"

#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_requests.h"
#include "gt/selftest_engine_heartbeat.h"

#include "i915_random.h"
#include "i915_selftest.h"
#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"

#include "mock_drm.h"
#include "mock_gem_device.h"

static unsigned int num_uabi_engines(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	unsigned int count;

	count = 0;
	for_each_uabi_engine(engine, i915)
		count++;

	return count;
}

static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
{
	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
}

static int igt_add_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;

	/* Basic preliminary test to create a request and let it loose! */

	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
	if (!request)
		return -ENOMEM;

	i915_request_add(request);

	return 0;
}

static int igt_wait_request(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, then wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (!request)
		return -ENOMEM;

	i915_request_get(request);

	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) != -ETIME) {
		pr_err("request wait succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed before submit!!\n");
		goto out_request;
	}

	i915_request_add(request);

	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed immediately!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
		pr_err("request wait succeeded (expected timeout!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out!\n");
		goto out_request;
	}

	if (!i915_request_completed(request)) {
		pr_err("request not complete after waiting!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out when already complete!\n");
		goto out_request;
	}

	err = 0;
out_request:
	i915_request_put(request);
	mock_device_flush(i915);
	return err;
}

static int igt_fence_wait(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, treat it as a fence and wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (!request)
		return -ENOMEM;

	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
		pr_err("fence wait success before submit (expected timeout)!\n");
		goto out;
	}

	i915_request_add(request);

	if (dma_fence_is_signaled(&request->fence)) {
		pr_err("fence signaled immediately!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
		pr_err("fence wait success after submit (expected timeout)!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out (expected success)!\n");
		goto out;
	}

	if (!dma_fence_is_signaled(&request->fence)) {
		pr_err("fence unsignaled after waiting!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out when complete (expected success)!\n");
		goto out;
	}

	err = 0;
out:
	mock_device_flush(i915);
	return err;
}

static int igt_request_rewind(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request, *vip;
	struct i915_gem_context *ctx[2];
	struct intel_context *ce;
	int err = -EINVAL;

	ctx[0] = mock_context(i915, "A");
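
	/*
	 * Queue a long-delay request on context "A"; it is cancelled below and
	 * then resubmitted behind the vip request to emulate preemption.
	 */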

	ce = i915_gem_context_get_engine(ctx[0], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	request = mock_request(ce, 2 * HZ);
	intel_context_put(ce);
	if (!request) {
		err = -ENOMEM;
		goto err_context_0;
	}

	i915_request_get(request);
	i915_request_add(request);

	ctx[1] = mock_context(i915, "B");

	ce = i915_gem_context_get_engine(ctx[1], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	vip = mock_request(ce, 0);
	intel_context_put(ce);
	if (!vip) {
		err = -ENOMEM;
		goto err_context_1;
	}

	/* Simulate preemption by manual reordering */
	if (!mock_cancel_request(request)) {
		pr_err("failed to cancel request (already executed)!\n");
		i915_request_add(vip);
		goto err_context_1;
	}
	i915_request_get(vip);
	i915_request_add(vip);
	rcu_read_lock();
	request->engine->submit_request(request);
	rcu_read_unlock();

	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
		pr_err("timed out waiting for high priority request\n");
		goto err;
	}

	if (i915_request_completed(request)) {
		pr_err("low priority request already completed\n");
		goto err;
	}

	err = 0;
err:
	i915_request_put(vip);
err_context_1:
	mock_context_close(ctx[1]);
	i915_request_put(request);
err_context_0:
	mock_context_close(ctx[0]);
	mock_device_flush(i915);
	return err;
}

struct smoketest {
	struct intel_engine_cs *engine;
	struct i915_gem_context **contexts;
	atomic_long_t num_waits, num_fences;
	int ncontexts, max_batch;
	struct i915_request *(*request_alloc)(struct intel_context *ce);
};

static struct i915_request *
__mock_request_alloc(struct intel_context *ce)
{
	return mock_request(ce, 0);
}

static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	return intel_context_create_request(ce);
}

static int __igt_breadcrumbs_smoketest(void *arg)
{
	struct smoketest *t = arg;
	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
	const unsigned int total = 4 * t->ncontexts + 1;
	unsigned int num_waits = 0, num_fences = 0;
	struct i915_request **requests;
	I915_RND_STATE(prng);
	unsigned int *order;
	int err = 0;

	/*
	 * A very simple test to catch the most egregious of list handling bugs.
	 *
	 * At its heart, we simply create oodles of requests running across
	 * multiple kthreads and enable signaling on them, for the sole purpose
	 * of stressing our breadcrumb handling. The only inspection we do is
	 * that the fences were marked as signaled.
	 */

	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
	if (!requests)
		return -ENOMEM;

	order = i915_random_order(total, &prng);
	if (!order) {
		err = -ENOMEM;
		goto out_requests;
	}

	while (!kthread_should_stop()) {
		struct i915_sw_fence *submit, *wait;
		unsigned int n, count;

		submit = heap_fence_create(GFP_KERNEL);
		if (!submit) {
			err = -ENOMEM;
			break;
		}

		wait = heap_fence_create(GFP_KERNEL);
		if (!wait) {
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			err = -ENOMEM;
			break;
		}

		i915_random_reorder(order, total, &prng);
		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);

		for (n = 0; n < count; n++) {
			struct i915_gem_context *ctx =
				t->contexts[order[n] % t->ncontexts];
			struct i915_request *rq;
			struct intel_context *ce;

			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
			GEM_BUG_ON(IS_ERR(ce));
			rq = t->request_alloc(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				count = n;
				break;
			}

			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
							       submit,
							       GFP_KERNEL);

			requests[n] = i915_request_get(rq);
			i915_request_add(rq);

			if (err >= 0)
				err = i915_sw_fence_await_dma_fence(wait,
								    &rq->fence,
								    0,
								    GFP_KERNEL);

			if (err < 0) {
				i915_request_put(rq);
				count = n;
				break;
			}
		}

		i915_sw_fence_commit(submit);
		i915_sw_fence_commit(wait);

		if (!wait_event_timeout(wait->wait,
					i915_sw_fence_done(wait),
					5 * HZ)) {
			struct i915_request *rq = requests[count - 1];

			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
			       atomic_read(&wait->pending), count,
			       rq->fence.context, rq->fence.seqno,
			       t->engine->name);
			GEM_TRACE_DUMP();

			intel_gt_set_wedged(t->engine->gt);
			GEM_BUG_ON(!i915_request_completed(rq));
			i915_sw_fence_wait(wait);
			err = -EIO;
		}

		for (n = 0; n < count; n++) {
			struct i915_request *rq = requests[n];

			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				      &rq->fence.flags)) {
				pr_err("%llu:%llu was not signaled!\n",
				       rq->fence.context, rq->fence.seqno);
				err = -EINVAL;
			}

			i915_request_put(rq);
		}

		heap_fence_put(wait);
		heap_fence_put(submit);

		if (err < 0)
			break;

		num_fences += count;
		num_waits++;

		cond_resched();
	}

	atomic_long_add(num_fences, &t->num_fences);
	atomic_long_add(num_waits, &t->num_waits);

	kfree(order);
out_requests:
	kfree(requests);
	return err;
}

static int mock_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct smoketest t = {
		.engine = rcs0(i915),
		.ncontexts = 1024,
		.max_batch = 1024,
		.request_alloc = __mock_request_alloc
	};
	unsigned int ncpus = num_online_cpus();
	struct task_struct **threads;
	unsigned int n;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 */

	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
	if (!t.contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < t.ncontexts; n++) {
		t.contexts[n] = mock_context(t.engine->i915, "mock");
		if (!t.contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	for (n = 0; n < ncpus; n++) {
		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
					 &t, "igt/%d", n);
		if (IS_ERR(threads[n])) {
			ret = PTR_ERR(threads[n]);
			ncpus = n;
			break;
		}

		get_task_struct(threads[n]);
	}

	yield(); /* start all threads before we begin */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

	for (n = 0; n < ncpus; n++) {
		int err;

		err = kthread_stop(threads[n]);
		if (err < 0 && !ret)
			ret = err;

		put_task_struct(threads[n]);
	}
	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
		atomic_long_read(&t.num_waits),
		atomic_long_read(&t.num_fences),
		ncpus);

out_contexts:
	for (n = 0; n < t.ncontexts; n++) {
		if (!t.contexts[n])
			break;
		mock_context_close(t.contexts[n]);
	}
	kfree(t.contexts);
out_threads:
	kfree(threads);
	return ret;
}

int i915_request_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_add_request),
		SUBTEST(igt_wait_request),
		SUBTEST(igt_fence_wait),
		SUBTEST(igt_request_rewind),
		SUBTEST(mock_breadcrumbs_smoketest),
	};
	struct drm_i915_private *i915;
	intel_wakeref_t wakeref;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		err = i915_subtests(tests, i915);

	mock_destroy_device(i915);

	return err;
}

static int live_nop_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	int err = -ENODEV;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	for_each_uabi_engine(engine, i915) {
		unsigned long n, prime;
		IGT_TIMEOUT(end_time);
		ktime_t times[2] = {};

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			return err;

		intel_engine_pm_get(engine);
		for_each_prime_number_from(prime, 1, 8192) {
			struct i915_request *request = NULL;

			times[1] = ktime_get_raw();

			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = i915_request_create(engine->kernel_context);
				if (IS_ERR(request))
					return PTR_ERR(request);

				/*
				 * This space is left intentionally blank.
				 *
				 * We do not actually want to perform any
				 * action with this request, we just want
				 * to measure the latency in allocation
				 * and submission of our breadcrumbs -
				 * ensuring that the bare request is sufficient
				 * for the system to work (i.e. proper HEAD
				 * tracking of the rings, interrupt handling,
				 * etc). It also gives us the lowest bounds
				 * for latency.
				 */

				i915_request_get(request);
				i915_request_add(request);
			}
			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
			i915_request_put(request);

			times[1] = ktime_sub(ktime_get_raw(), times[1]);
			if (prime == 1)
				times[0] = times[1];

			if (__igt_timeout(end_time, NULL))
				break;
		}
		intel_engine_pm_put(engine);

		err = igt_live_test_end(&t);
		if (err)
			return err;

		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
			engine->name,
			ktime_to_ns(times[0]),
			prime, div64_u64(ktime_to_ns(times[1]), prime));
	}

	return err;
}

static struct i915_vma *empty_batch(struct drm_i915_private *i915)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 *cmd;
	int err;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto err;
	}

	*cmd = MI_BATCH_BUFFER_END;

	__i915_gem_object_flush_map(obj, 0, 64);
	i915_gem_object_unpin_map(obj);

	intel_gt_chipset_flush(&i915->gt);

	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
	if (err)
		goto err;

	/* Force the wait now to avoid including it in the benchmark */
	err = i915_vma_sync(vma);
	if (err)
		goto err_pin;

	return vma;

err_pin:
	i915_vma_unpin(vma);
err:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}

static struct i915_request *
empty_request(struct intel_engine_cs *engine,
	      struct i915_vma *batch)
{
	struct i915_request *request;
	int err;

	request = i915_request_create(engine->kernel_context);
	if (IS_ERR(request))
		return request;

	err = engine->emit_bb_start(request,
				    batch->node.start,
				    batch->node.size,
				    I915_DISPATCH_SECURE);
	if (err)
		goto out_request;

	i915_request_get(request);
out_request:
	i915_request_add(request);
	return err ? ERR_PTR(err) : request;
}

static int live_empty_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	struct i915_vma *batch;
	int err = 0;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */
695 */ 696 697 batch = empty_batch(i915); 698 if (IS_ERR(batch)) 699 return PTR_ERR(batch); 700 701 for_each_uabi_engine(engine, i915) { 702 IGT_TIMEOUT(end_time); 703 struct i915_request *request; 704 unsigned long n, prime; 705 ktime_t times[2] = {}; 706 707 err = igt_live_test_begin(&t, i915, __func__, engine->name); 708 if (err) 709 goto out_batch; 710 711 intel_engine_pm_get(engine); 712 713 /* Warmup / preload */ 714 request = empty_request(engine, batch); 715 if (IS_ERR(request)) { 716 err = PTR_ERR(request); 717 intel_engine_pm_put(engine); 718 goto out_batch; 719 } 720 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 721 722 for_each_prime_number_from(prime, 1, 8192) { 723 times[1] = ktime_get_raw(); 724 725 for (n = 0; n < prime; n++) { 726 i915_request_put(request); 727 request = empty_request(engine, batch); 728 if (IS_ERR(request)) { 729 err = PTR_ERR(request); 730 intel_engine_pm_put(engine); 731 goto out_batch; 732 } 733 } 734 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 735 736 times[1] = ktime_sub(ktime_get_raw(), times[1]); 737 if (prime == 1) 738 times[0] = times[1]; 739 740 if (__igt_timeout(end_time, NULL)) 741 break; 742 } 743 i915_request_put(request); 744 intel_engine_pm_put(engine); 745 746 err = igt_live_test_end(&t); 747 if (err) 748 goto out_batch; 749 750 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 751 engine->name, 752 ktime_to_ns(times[0]), 753 prime, div64_u64(ktime_to_ns(times[1]), prime)); 754 } 755 756 out_batch: 757 i915_vma_unpin(batch); 758 i915_vma_put(batch); 759 return err; 760 } 761 762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 763 { 764 struct drm_i915_gem_object *obj; 765 const int gen = INTEL_GEN(i915); 766 struct i915_vma *vma; 767 u32 *cmd; 768 int err; 769 770 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 771 if (IS_ERR(obj)) 772 return ERR_CAST(obj); 773 774 vma = i915_vma_instance(obj, i915->gt.vm, NULL); 775 if (IS_ERR(vma)) { 776 err = PTR_ERR(vma); 777 goto err; 778 } 779 780 err = i915_vma_pin(vma, 0, 0, PIN_USER); 781 if (err) 782 goto err; 783 784 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC); 785 if (IS_ERR(cmd)) { 786 err = PTR_ERR(cmd); 787 goto err; 788 } 789 790 if (gen >= 8) { 791 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 792 *cmd++ = lower_32_bits(vma->node.start); 793 *cmd++ = upper_32_bits(vma->node.start); 794 } else if (gen >= 6) { 795 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 796 *cmd++ = lower_32_bits(vma->node.start); 797 } else { 798 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 799 *cmd++ = lower_32_bits(vma->node.start); 800 } 801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 802 803 __i915_gem_object_flush_map(obj, 0, 64); 804 i915_gem_object_unpin_map(obj); 805 806 intel_gt_chipset_flush(&i915->gt); 807 808 return vma; 809 810 err: 811 i915_gem_object_put(obj); 812 return ERR_PTR(err); 813 } 814 815 static int recursive_batch_resolve(struct i915_vma *batch) 816 { 817 u32 *cmd; 818 819 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); 820 if (IS_ERR(cmd)) 821 return PTR_ERR(cmd); 822 823 *cmd = MI_BATCH_BUFFER_END; 824 825 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 826 i915_gem_object_unpin_map(batch->obj); 827 828 intel_gt_chipset_flush(batch->vm->gt); 829 830 return 0; 831 } 832 833 static int live_all_engines(void *arg) 834 { 835 struct drm_i915_private *i915 = arg; 836 const unsigned int nengines = num_uabi_engines(i915); 837 struct intel_engine_cs *engine; 838 struct i915_request **request; 839 
struct igt_live_test t; 840 struct i915_vma *batch; 841 unsigned int idx; 842 int err; 843 844 /* 845 * Check we can submit requests to all engines simultaneously. We 846 * send a recursive batch to each engine - checking that we don't 847 * block doing so, and that they don't complete too soon. 848 */ 849 850 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 851 if (!request) 852 return -ENOMEM; 853 854 err = igt_live_test_begin(&t, i915, __func__, ""); 855 if (err) 856 goto out_free; 857 858 batch = recursive_batch(i915); 859 if (IS_ERR(batch)) { 860 err = PTR_ERR(batch); 861 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 862 goto out_free; 863 } 864 865 i915_vma_lock(batch); 866 867 idx = 0; 868 for_each_uabi_engine(engine, i915) { 869 request[idx] = intel_engine_create_kernel_request(engine); 870 if (IS_ERR(request[idx])) { 871 err = PTR_ERR(request[idx]); 872 pr_err("%s: Request allocation failed with err=%d\n", 873 __func__, err); 874 goto out_request; 875 } 876 877 err = i915_request_await_object(request[idx], batch->obj, 0); 878 if (err == 0) 879 err = i915_vma_move_to_active(batch, request[idx], 0); 880 GEM_BUG_ON(err); 881 882 err = engine->emit_bb_start(request[idx], 883 batch->node.start, 884 batch->node.size, 885 0); 886 GEM_BUG_ON(err); 887 request[idx]->batch = batch; 888 889 i915_request_get(request[idx]); 890 i915_request_add(request[idx]); 891 idx++; 892 } 893 894 i915_vma_unlock(batch); 895 896 idx = 0; 897 for_each_uabi_engine(engine, i915) { 898 if (i915_request_completed(request[idx])) { 899 pr_err("%s(%s): request completed too early!\n", 900 __func__, engine->name); 901 err = -EINVAL; 902 goto out_request; 903 } 904 idx++; 905 } 906 907 err = recursive_batch_resolve(batch); 908 if (err) { 909 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 910 goto out_request; 911 } 912 913 idx = 0; 914 for_each_uabi_engine(engine, i915) { 915 long timeout; 916 917 timeout = i915_request_wait(request[idx], 0, 918 MAX_SCHEDULE_TIMEOUT); 919 if (timeout < 0) { 920 err = timeout; 921 pr_err("%s: error waiting for request on %s, err=%d\n", 922 __func__, engine->name, err); 923 goto out_request; 924 } 925 926 GEM_BUG_ON(!i915_request_completed(request[idx])); 927 i915_request_put(request[idx]); 928 request[idx] = NULL; 929 idx++; 930 } 931 932 err = igt_live_test_end(&t); 933 934 out_request: 935 idx = 0; 936 for_each_uabi_engine(engine, i915) { 937 if (request[idx]) 938 i915_request_put(request[idx]); 939 idx++; 940 } 941 i915_vma_unpin(batch); 942 i915_vma_put(batch); 943 out_free: 944 kfree(request); 945 return err; 946 } 947 948 static int live_sequential_engines(void *arg) 949 { 950 struct drm_i915_private *i915 = arg; 951 const unsigned int nengines = num_uabi_engines(i915); 952 struct i915_request **request; 953 struct i915_request *prev = NULL; 954 struct intel_engine_cs *engine; 955 struct igt_live_test t; 956 unsigned int idx; 957 int err; 958 959 /* 960 * Check we can submit requests to all engines sequentially, such 961 * that each successive request waits for the earlier ones. This 962 * tests that we don't execute requests out of order, even though 963 * they are running on independent engines. 
964 */ 965 966 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 967 if (!request) 968 return -ENOMEM; 969 970 err = igt_live_test_begin(&t, i915, __func__, ""); 971 if (err) 972 goto out_free; 973 974 idx = 0; 975 for_each_uabi_engine(engine, i915) { 976 struct i915_vma *batch; 977 978 batch = recursive_batch(i915); 979 if (IS_ERR(batch)) { 980 err = PTR_ERR(batch); 981 pr_err("%s: Unable to create batch for %s, err=%d\n", 982 __func__, engine->name, err); 983 goto out_free; 984 } 985 986 i915_vma_lock(batch); 987 request[idx] = intel_engine_create_kernel_request(engine); 988 if (IS_ERR(request[idx])) { 989 err = PTR_ERR(request[idx]); 990 pr_err("%s: Request allocation failed for %s with err=%d\n", 991 __func__, engine->name, err); 992 goto out_unlock; 993 } 994 995 if (prev) { 996 err = i915_request_await_dma_fence(request[idx], 997 &prev->fence); 998 if (err) { 999 i915_request_add(request[idx]); 1000 pr_err("%s: Request await failed for %s with err=%d\n", 1001 __func__, engine->name, err); 1002 goto out_unlock; 1003 } 1004 } 1005 1006 err = i915_request_await_object(request[idx], 1007 batch->obj, false); 1008 if (err == 0) 1009 err = i915_vma_move_to_active(batch, request[idx], 0); 1010 GEM_BUG_ON(err); 1011 1012 err = engine->emit_bb_start(request[idx], 1013 batch->node.start, 1014 batch->node.size, 1015 0); 1016 GEM_BUG_ON(err); 1017 request[idx]->batch = batch; 1018 1019 i915_request_get(request[idx]); 1020 i915_request_add(request[idx]); 1021 1022 prev = request[idx]; 1023 idx++; 1024 1025 out_unlock: 1026 i915_vma_unlock(batch); 1027 if (err) 1028 goto out_request; 1029 } 1030 1031 idx = 0; 1032 for_each_uabi_engine(engine, i915) { 1033 long timeout; 1034 1035 if (i915_request_completed(request[idx])) { 1036 pr_err("%s(%s): request completed too early!\n", 1037 __func__, engine->name); 1038 err = -EINVAL; 1039 goto out_request; 1040 } 1041 1042 err = recursive_batch_resolve(request[idx]->batch); 1043 if (err) { 1044 pr_err("%s: failed to resolve batch, err=%d\n", 1045 __func__, err); 1046 goto out_request; 1047 } 1048 1049 timeout = i915_request_wait(request[idx], 0, 1050 MAX_SCHEDULE_TIMEOUT); 1051 if (timeout < 0) { 1052 err = timeout; 1053 pr_err("%s: error waiting for request on %s, err=%d\n", 1054 __func__, engine->name, err); 1055 goto out_request; 1056 } 1057 1058 GEM_BUG_ON(!i915_request_completed(request[idx])); 1059 idx++; 1060 } 1061 1062 err = igt_live_test_end(&t); 1063 1064 out_request: 1065 idx = 0; 1066 for_each_uabi_engine(engine, i915) { 1067 u32 *cmd; 1068 1069 if (!request[idx]) 1070 break; 1071 1072 cmd = i915_gem_object_pin_map(request[idx]->batch->obj, 1073 I915_MAP_WC); 1074 if (!IS_ERR(cmd)) { 1075 *cmd = MI_BATCH_BUFFER_END; 1076 1077 __i915_gem_object_flush_map(request[idx]->batch->obj, 1078 0, sizeof(*cmd)); 1079 i915_gem_object_unpin_map(request[idx]->batch->obj); 1080 1081 intel_gt_chipset_flush(engine->gt); 1082 } 1083 1084 i915_vma_put(request[idx]->batch); 1085 i915_request_put(request[idx]); 1086 idx++; 1087 } 1088 out_free: 1089 kfree(request); 1090 return err; 1091 } 1092 1093 static int __live_parallel_engine1(void *arg) 1094 { 1095 struct intel_engine_cs *engine = arg; 1096 IGT_TIMEOUT(end_time); 1097 unsigned long count; 1098 int err = 0; 1099 1100 count = 0; 1101 intel_engine_pm_get(engine); 1102 do { 1103 struct i915_request *rq; 1104 1105 rq = i915_request_create(engine->kernel_context); 1106 if (IS_ERR(rq)) { 1107 err = PTR_ERR(rq); 1108 break; 1109 } 1110 1111 i915_request_get(rq); 1112 i915_request_add(rq); 1113 1114 err = 0; 
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	intel_engine_pm_put(engine);

	pr_info("%s: %lu request + sync\n", engine->name, count);
	return err;
}

static int __live_parallel_engineN(void *arg)
{
	struct intel_engine_cs *engine = arg;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;

	count = 0;
	intel_engine_pm_get(engine);
	do {
		struct i915_request *rq;

		rq = i915_request_create(engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));
	intel_engine_pm_put(engine);

	pr_info("%s: %lu requests\n", engine->name, count);
	return err;
}

static bool wake_all(struct drm_i915_private *i915)
{
	if (atomic_dec_and_test(&i915->selftest.counter)) {
		wake_up_var(&i915->selftest.counter);
		return true;
	}

	return false;
}

static int wait_for_all(struct drm_i915_private *i915)
{
	if (wake_all(i915))
		return 0;

	if (wait_var_event_timeout(&i915->selftest.counter,
				   !atomic_read(&i915->selftest.counter),
				   i915_selftest.timeout_jiffies))
		return 0;

	return -ETIME;
}

static int __live_parallel_spin(void *arg)
{
	struct intel_engine_cs *engine = arg;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	/*
	 * Create a spinner running for eternity on each engine. If a second
	 * spinner is incorrectly placed on the same engine, it will not be
	 * able to start in time.
	 */

	if (igt_spinner_init(&spin, engine->gt)) {
		wake_all(engine->i915);
		return -ENOMEM;
	}

	intel_engine_pm_get(engine);
	rq = igt_spinner_create_request(&spin,
					engine->kernel_context,
					MI_NOOP); /* no preemption */
	intel_engine_pm_put(engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		if (err == -ENODEV)
			err = 0;
		wake_all(engine->i915);
		goto out_spin;
	}

	i915_request_get(rq);
	i915_request_add(rq);
	if (igt_wait_for_spinner(&spin, rq)) {
		/* Occupy this engine for the whole test */
		err = wait_for_all(engine->i915);
	} else {
		pr_err("Failed to start spinner on %s\n", engine->name);
		err = -EINVAL;
	}
	igt_spinner_end(&spin);

	if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
		err = -EIO;
	i915_request_put(rq);

out_spin:
	igt_spinner_fini(&spin);
	return err;
}

static int live_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		__live_parallel_engine1,
		__live_parallel_engineN,
		__live_parallel_spin,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct task_struct **tsk;
	int err = 0;

	/*
	 * Check we can submit requests to all engines concurrently. This
	 * tests that we load up the system maximally.
	 */

	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
	if (!tsk)
		return -ENOMEM;

	for (fn = func; !err && *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			tsk[idx] = kthread_run(*fn, engine,
					       "igt/parallel:%s",
					       engine->name);
			if (IS_ERR(tsk[idx])) {
				err = PTR_ERR(tsk[idx]);
				break;
			}
			get_task_struct(tsk[idx++]);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(tsk[idx]))
				break;

			status = kthread_stop(tsk[idx]);
			if (status && !err)
				err = status;

			put_task_struct(tsk[idx++]);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
	}

	kfree(tsk);
	return err;
}

static int
max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int ret;

	/*
	 * Before execlists, all contexts share the same ringbuffer. With
	 * execlists, each context/engine has a separate ringbuffer and
	 * for the purposes of this test, inexhaustible.
	 *
	 * For the global ringbuffer though, we have to be very careful
	 * that we do not wrap while preventing the execution of requests
	 * with an unsignaled fence.
	 */
	if (HAS_EXECLISTS(ctx->i915))
		return INT_MAX;

	rq = igt_request_alloc(ctx, engine);
	if (IS_ERR(rq)) {
		ret = PTR_ERR(rq);
	} else {
		int sz;

		ret = rq->ring->size - rq->reserved_space;
		i915_request_add(rq);

		sz = rq->ring->emit - rq->head;
		if (sz < 0)
			sz += rq->ring->size;
		ret /= sz;
		ret /= 2; /* leave half spare, in case of emergency! */
	}

	return ret;
}

static int live_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	const unsigned int ncpus = num_online_cpus();
	unsigned long num_waits, num_fences;
	struct intel_engine_cs *engine;
	struct task_struct **threads;
	struct igt_live_test live;
	intel_wakeref_t wakeref;
	struct smoketest *smoke;
	unsigned int n, idx;
	struct file *file;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 *
	 * On real hardware this time.
	 */
1360 */ 1361 1362 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1363 1364 file = mock_file(i915); 1365 if (IS_ERR(file)) { 1366 ret = PTR_ERR(file); 1367 goto out_rpm; 1368 } 1369 1370 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1371 if (!smoke) { 1372 ret = -ENOMEM; 1373 goto out_file; 1374 } 1375 1376 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1377 if (!threads) { 1378 ret = -ENOMEM; 1379 goto out_smoke; 1380 } 1381 1382 smoke[0].request_alloc = __live_request_alloc; 1383 smoke[0].ncontexts = 64; 1384 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1385 sizeof(*smoke[0].contexts), 1386 GFP_KERNEL); 1387 if (!smoke[0].contexts) { 1388 ret = -ENOMEM; 1389 goto out_threads; 1390 } 1391 1392 for (n = 0; n < smoke[0].ncontexts; n++) { 1393 smoke[0].contexts[n] = live_context(i915, file); 1394 if (!smoke[0].contexts[n]) { 1395 ret = -ENOMEM; 1396 goto out_contexts; 1397 } 1398 } 1399 1400 ret = igt_live_test_begin(&live, i915, __func__, ""); 1401 if (ret) 1402 goto out_contexts; 1403 1404 idx = 0; 1405 for_each_uabi_engine(engine, i915) { 1406 smoke[idx] = smoke[0]; 1407 smoke[idx].engine = engine; 1408 smoke[idx].max_batch = 1409 max_batches(smoke[0].contexts[0], engine); 1410 if (smoke[idx].max_batch < 0) { 1411 ret = smoke[idx].max_batch; 1412 goto out_flush; 1413 } 1414 /* One ring interleaved between requests from all cpus */ 1415 smoke[idx].max_batch /= num_online_cpus() + 1; 1416 pr_debug("Limiting batches to %d requests on %s\n", 1417 smoke[idx].max_batch, engine->name); 1418 1419 for (n = 0; n < ncpus; n++) { 1420 struct task_struct *tsk; 1421 1422 tsk = kthread_run(__igt_breadcrumbs_smoketest, 1423 &smoke[idx], "igt/%d.%d", idx, n); 1424 if (IS_ERR(tsk)) { 1425 ret = PTR_ERR(tsk); 1426 goto out_flush; 1427 } 1428 1429 get_task_struct(tsk); 1430 threads[idx * ncpus + n] = tsk; 1431 } 1432 1433 idx++; 1434 } 1435 1436 yield(); /* start all threads before we begin */ 1437 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1438 1439 out_flush: 1440 idx = 0; 1441 num_waits = 0; 1442 num_fences = 0; 1443 for_each_uabi_engine(engine, i915) { 1444 for (n = 0; n < ncpus; n++) { 1445 struct task_struct *tsk = threads[idx * ncpus + n]; 1446 int err; 1447 1448 if (!tsk) 1449 continue; 1450 1451 err = kthread_stop(tsk); 1452 if (err < 0 && !ret) 1453 ret = err; 1454 1455 put_task_struct(tsk); 1456 } 1457 1458 num_waits += atomic_long_read(&smoke[idx].num_waits); 1459 num_fences += atomic_long_read(&smoke[idx].num_fences); 1460 idx++; 1461 } 1462 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1463 num_waits, num_fences, idx, ncpus); 1464 1465 ret = igt_live_test_end(&live) ?: ret; 1466 out_contexts: 1467 kfree(smoke[0].contexts); 1468 out_threads: 1469 kfree(threads); 1470 out_smoke: 1471 kfree(smoke); 1472 out_file: 1473 fput(file); 1474 out_rpm: 1475 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1476 1477 return ret; 1478 } 1479 1480 int i915_request_live_selftests(struct drm_i915_private *i915) 1481 { 1482 static const struct i915_subtest tests[] = { 1483 SUBTEST(live_nop_request), 1484 SUBTEST(live_all_engines), 1485 SUBTEST(live_sequential_engines), 1486 SUBTEST(live_parallel_engines), 1487 SUBTEST(live_empty_request), 1488 SUBTEST(live_breadcrumbs_smoketest), 1489 }; 1490 1491 if (intel_gt_is_wedged(&i915->gt)) 1492 return 0; 1493 1494 return i915_subtests(tests, i915); 1495 } 1496 1497 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1498 { 1499 struct i915_request *rq; 1500 struct dma_fence 

	rq = intel_engine_create_kernel_request(ce->engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	fence = i915_active_fence_get(&ce->timeline->last_request);
	if (fence) {
		i915_request_await_dma_fence(rq, fence);
		dma_fence_put(fence);
	}

	rq = i915_request_get(rq);
	i915_request_add(rq);
	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
		err = -ETIME;
	i915_request_put(rq);

	while (!err && !intel_engine_is_idle(ce->engine))
		intel_engine_flush_submission(ce->engine);

	return err;
}

struct perf_stats {
	struct intel_engine_cs *engine;
	unsigned long count;
	ktime_t time;
	ktime_t busy;
	u64 runtime;
};

struct perf_series {
	struct drm_i915_private *i915;
	unsigned int nengines;
	struct intel_context *ce[];
};

static int cmp_u32(const void *A, const void *B)
{
	const u32 *a = A, *b = B;

	return *a - *b;
}

static u32 trifilter(u32 *a)
{
	u64 sum;

#define TF_COUNT 5
	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);

	sum = mul_u32_u32(a[2], 2);
	sum += a[1];
	sum += a[3];

	GEM_BUG_ON(sum > U32_MAX);
	return sum;
#define TF_BIAS 2
}

static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
{
	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);

	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
}

static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
{
	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
{
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = offset;
	*cs++ = 0;
	*cs++ = value;

	return cs;
}

static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
{
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		mode;
	*cs++ = value;
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
{
	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
}

static void semaphore_set(u32 *sema, u32 value)
{
	WRITE_ONCE(*sema, value);
	wmb(); /* flush the update to the cache, and beyond */
}

static u32 *hwsp_scratch(const struct intel_context *ce)
{
	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
}

static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
{
	return (i915_ggtt_offset(ce->engine->status_page.vma) +
		offset_in_page(dw));
}

static int measure_semaphore_response(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	struct i915_request *rq;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how many cycles it takes for the HW to detect the change
	 * in a semaphore value.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *       poke semaphore
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Semaphore latency: B - A
	 */

	semaphore_set(sema, -1);

	rq = i915_request_create(ce);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		err = PTR_ERR(cs);
		goto err;
	}

	cs = emit_store_dw(cs, offset, 0);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
		cs = emit_store_dw(cs, offset, 0);
	}

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		preempt_disable();
		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		semaphore_set(sema, i);
		preempt_enable();

		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
			err = -EIO;
			goto err;
		}

		elapsed[i - 1] = sema[i] - cycles;
	}

	cycles = trifilter(elapsed);
	pr_info("%s: semaphore response %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_idle_dispatch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for us to submit a request while the
	 * engine is idle, but is resting in our context.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *       submit request
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Submission latency: B - A
	 */

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;

		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
		if (err)
			return err;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		preempt_disable();
		local_bh_disable();
		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
		local_bh_enable();
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
		elapsed[i] = sema[i] - elapsed[i];

	cycles = trifilter(elapsed);
	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_busy_dispatch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for us to submit a request while the
	 * engine is busy, polling on a semaphore in our context. With
	 * direct submission, this will include the cost of a lite restore.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *       submit request
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Submission latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		local_bh_disable();
		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
		local_bh_enable();
		semaphore_set(sema, i - 1);
		preempt_enable();
	}

	wait_for(READ_ONCE(sema[i - 1]), 500);
	semaphore_set(sema, i - 1);

	for (i = 1; i <= TF_COUNT; i++) {
		GEM_BUG_ON(sema[i] == -1);
		elapsed[i - 1] = sema[i] - elapsed[i];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
{
	const u32 offset =
		i915_ggtt_offset(engine->status_page.vma) +
		offset_in_page(sema);
	struct i915_request *rq;
	u32 *cs;

	rq = i915_request_create(engine->kernel_context);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		return PTR_ERR(cs);
	}

	cs = emit_semaphore_poll(cs, mode, value, offset);

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	return 0;
}

static int measure_inter_request(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	struct i915_sw_fence *submit;
	int i, err;

	/*
	 * Measure how long it takes to advance from one request into the
	 * next. Between each request we flush the GPU caches to memory,
	 * update the breadcrumbs, and then invalidate those caches.
	 * We queue up all the requests to be submitted in one batch so
	 * it should be one set of contiguous measurements.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *       advance request
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Request latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	submit = heap_fence_create(GFP_KERNEL);
	if (!submit) {
		semaphore_set(sema, 1);
		return -ENOMEM;
	}

	intel_engine_flush_submission(ce->engine);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;
		u32 *cs;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_submit;
		}

		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
						       submit,
						       GFP_KERNEL);
		if (err < 0) {
			i915_request_add(rq);
			goto err_submit;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err_submit;
		}

		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);
	}
	local_bh_disable();
	i915_sw_fence_commit(submit);
	local_bh_enable();
	intel_engine_flush_submission(ce->engine);
	heap_fence_put(submit);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[i + 1] - sema[i];

	cycles = trifilter(elapsed);
	pr_info("%s: inter-request latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_submit:
	i915_sw_fence_commit(submit);
	heap_fence_put(submit);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *       switch context
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *       submit preemption
	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
	 *       context switch
	 *    C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

struct signal_cb {
	struct dma_fence_cb base;
	bool seen;
};

static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct signal_cb *s = container_of(cb, typeof(*s), base);

	smp_store_mb(s->seen, true); /* be safe, be strong */
}

static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) to be
	 * sent from the GPU to be processed by the CPU.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *       signal
	 *    B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);

		local_bh_disable();
		i915_request_add(rq);
		local_bh_enable();

		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;


static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}
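
/*
 * The p_* workers below implement the same three submission patterns as
 * the s_* workers above, but each instance drives a single engine from its
 * own kthread (see perf_parallel_engines()), so all engines are loaded
 * concurrently instead of round-robin from one thread.
 */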

static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}
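
/*
 * perf_parallel_engines() spawns one kthread per uabi engine, runs each of
 * the p_* workers on every engine simultaneously, and reports the per-engine
 * request count, busyness, context runtime and walltime once the threads
 * have been stopped.
 */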

static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}
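
/*
 * Entry point for the request perf selftests. Skipped when the GT is
 * already wedged, since no new requests can be executed in that state.
 */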

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}