/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	mutex_lock(&gt->i915->drm.struct_mutex);
	h->ctx = kernel_context(gt->i915);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

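/*
 * Replace the spinner batch with a fresh object and build a request that
 * writes its seqno into the per-context slot of the HWS page and then
 * loops back on itself until the first dword of the batch is rewritten
 * to MI_BATCH_BUFFER_END (or the request is shot down by a reset). The
 * MI_ARB_CHECK instructions keep the loop preemptible.
 */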
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

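/*
 * Sample the seqno slot in the HWS page for this request's fence context;
 * wait_until_running() below uses it to detect that the spinner has
 * actually started executing on the GPU.
 */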
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	err = igt_flush_test(gt->i915);
out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

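/*
 * As igt_reset_nop, but using the per-engine reset path: flood each engine
 * with trivial requests, reset the engine underneath them, and verify the
 * reset counters in i915_gpu_error afterwards.
 */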
static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	err = igt_flush_test(gt->i915);
out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

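/*
 * Reset a single engine in a tight loop, either while idle or while it is
 * chewing on our hanging batch, and check that only the engine-reset count
 * advances (a full GPU reset here would be a bug).
 */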
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

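/*
 * Wait for a background request to complete, wedging the GT if it fails to
 * do so within a generous timeout, then drop our reference.
 */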
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

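/*
 * Reset the chosen engine in a loop while kthreads running active_engine()
 * keep the other engines (and optionally the victim itself) busy, then
 * verify that no innocent engine was reset and no global reset crept in.
 */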
static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

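/* Walk __igt_reset_engines() through each combination of TEST_* flags */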
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

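/*
 * Kthread helpers that try to evict a vma (or rebind it with a new fence)
 * while it is still busy on the GPU; the test expects them to block until
 * the hang is resolved by a reset.
 */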
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

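/*
 * Submit a hanging batch that keeps a vma busy, start a kthread that tries
 * to evict it, confirm the kthread has to wait on the hang, and then check
 * that a reset unblocks it.
 */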
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

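/* Flavours of the eviction test: global GTT, full-ppGTT and fenced GGTT */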
static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

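/*
 * Build a chain of hanging requests on each engine, faking a hangcheck
 * reset between links: the guilty request must be marked -EIO while the
 * queued innocent must be replayed with its error status untouched.
 */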
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

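/*
 * Drive the intel_gt_handle_error() entry point directly (with error
 * capture suppressed) and check that the guilty request ends up -EIO.
 */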
static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

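/*
 * Repeat the engine resets from within the atomic contexts provided by
 * igt_atomic_phases, both on an idle engine and on one running a hanging
 * batch.
 */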
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}