/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ctx = live_context(gt->i915, file);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	err = igt_flush_test(gt->i915);
out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ctx = live_context(gt->i915, file);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	err = igt_flush_test(gt->i915);
out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		ctx[count] = live_context(engine->i915, file);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct i915_address_space *vm;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ctx = live_context(gt->i915, file);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	vm = i915_gem_context_get_vm_rcu(ctx);
	if (!i915_is_ggtt(vm)) {
		/* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, vm,
					    evict_vma, EXEC_OBJECT_WRITE);
	}
	i915_vm_put(vm);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}