1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2016 Intel Corporation 4 */ 5 6 #include <linux/kthread.h> 7 8 #include "gem/i915_gem_context.h" 9 10 #include "i915_gem_evict.h" 11 #include "intel_gt.h" 12 #include "intel_engine_heartbeat.h" 13 #include "intel_engine_pm.h" 14 #include "selftest_engine_heartbeat.h" 15 16 #include "i915_selftest.h" 17 #include "selftests/i915_random.h" 18 #include "selftests/igt_flush_test.h" 19 #include "selftests/igt_reset.h" 20 #include "selftests/igt_atomic.h" 21 #include "selftests/igt_spinner.h" 22 #include "selftests/intel_scheduler_helpers.h" 23 24 #include "selftests/mock_drm.h" 25 26 #include "gem/selftests/mock_context.h" 27 #include "gem/selftests/igt_gem_utils.h" 28 29 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 30 31 struct hang { 32 struct intel_gt *gt; 33 struct drm_i915_gem_object *hws; 34 struct drm_i915_gem_object *obj; 35 struct i915_gem_context *ctx; 36 u32 *seqno; 37 u32 *batch; 38 }; 39 40 static int hang_init(struct hang *h, struct intel_gt *gt) 41 { 42 void *vaddr; 43 int err; 44 45 memset(h, 0, sizeof(*h)); 46 h->gt = gt; 47 48 h->ctx = kernel_context(gt->i915, NULL); 49 if (IS_ERR(h->ctx)) 50 return PTR_ERR(h->ctx); 51 52 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 53 54 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 55 if (IS_ERR(h->hws)) { 56 err = PTR_ERR(h->hws); 57 goto err_ctx; 58 } 59 60 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 61 if (IS_ERR(h->obj)) { 62 err = PTR_ERR(h->obj); 63 goto err_hws; 64 } 65 66 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 67 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB); 68 if (IS_ERR(vaddr)) { 69 err = PTR_ERR(vaddr); 70 goto err_obj; 71 } 72 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 73 74 vaddr = i915_gem_object_pin_map_unlocked(h->obj, 75 i915_coherent_map_type(gt->i915, h->obj, false)); 76 if (IS_ERR(vaddr)) { 77 err = PTR_ERR(vaddr); 78 goto err_unpin_hws; 79 } 80 h->batch = vaddr; 81 82 return 0; 83 84 err_unpin_hws: 85 i915_gem_object_unpin_map(h->hws); 86 err_obj: 87 i915_gem_object_put(h->obj); 88 err_hws: 89 i915_gem_object_put(h->hws); 90 err_ctx: 91 kernel_context_close(h->ctx); 92 return err; 93 } 94 95 static u64 hws_address(const struct i915_vma *hws, 96 const struct i915_request *rq) 97 { 98 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 99 } 100 101 static int move_to_active(struct i915_vma *vma, 102 struct i915_request *rq, 103 unsigned int flags) 104 { 105 int err; 106 107 i915_vma_lock(vma); 108 err = i915_request_await_object(rq, vma->obj, 109 flags & EXEC_OBJECT_WRITE); 110 if (err == 0) 111 err = i915_vma_move_to_active(vma, rq, flags); 112 i915_vma_unlock(vma); 113 114 return err; 115 } 116 117 static struct i915_request * 118 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 119 { 120 struct intel_gt *gt = h->gt; 121 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx); 122 struct drm_i915_gem_object *obj; 123 struct i915_request *rq = NULL; 124 struct i915_vma *hws, *vma; 125 unsigned int flags; 126 void *vaddr; 127 u32 *batch; 128 int err; 129 130 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 131 if (IS_ERR(obj)) { 132 i915_vm_put(vm); 133 return ERR_CAST(obj); 134 } 135 136 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false)); 137 if (IS_ERR(vaddr)) { 138 i915_gem_object_put(obj); 139 i915_vm_put(vm); 140 return ERR_CAST(vaddr); 141 } 142 143 i915_gem_object_unpin_map(h->obj); 144 i915_gem_object_put(h->obj); 145 146 h->obj = obj; 147 h->batch = vaddr; 148 149 vma = i915_vma_instance(h->obj, vm, NULL); 150 if (IS_ERR(vma)) { 151 i915_vm_put(vm); 152 return ERR_CAST(vma); 153 } 154 155 hws = i915_vma_instance(h->hws, vm, NULL); 156 if (IS_ERR(hws)) { 157 i915_vm_put(vm); 158 return ERR_CAST(hws); 159 } 160 161 err = i915_vma_pin(vma, 0, 0, PIN_USER); 162 if (err) { 163 i915_vm_put(vm); 164 return ERR_PTR(err); 165 } 166 167 err = i915_vma_pin(hws, 0, 0, PIN_USER); 168 if (err) 169 goto unpin_vma; 170 171 rq = igt_request_alloc(h->ctx, engine); 172 if (IS_ERR(rq)) { 173 err = PTR_ERR(rq); 174 goto unpin_hws; 175 } 176 177 err = move_to_active(vma, rq, 0); 178 if (err) 179 goto cancel_rq; 180 181 err = move_to_active(hws, rq, 0); 182 if (err) 183 goto cancel_rq; 184 185 batch = h->batch; 186 if (GRAPHICS_VER(gt->i915) >= 8) { 187 *batch++ = MI_STORE_DWORD_IMM_GEN4; 188 *batch++ = lower_32_bits(hws_address(hws, rq)); 189 *batch++ = upper_32_bits(hws_address(hws, rq)); 190 *batch++ = rq->fence.seqno; 191 *batch++ = MI_NOOP; 192 193 memset(batch, 0, 1024); 194 batch += 1024 / sizeof(*batch); 195 196 *batch++ = MI_NOOP; 197 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 198 *batch++ = lower_32_bits(vma->node.start); 199 *batch++ = upper_32_bits(vma->node.start); 200 } else if (GRAPHICS_VER(gt->i915) >= 6) { 201 *batch++ = MI_STORE_DWORD_IMM_GEN4; 202 *batch++ = 0; 203 *batch++ = lower_32_bits(hws_address(hws, rq)); 204 *batch++ = rq->fence.seqno; 205 *batch++ = MI_NOOP; 206 207 memset(batch, 0, 1024); 208 batch += 1024 / sizeof(*batch); 209 210 *batch++ = MI_NOOP; 211 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 212 *batch++ = lower_32_bits(vma->node.start); 213 } else if (GRAPHICS_VER(gt->i915) >= 4) { 214 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 215 *batch++ = 0; 216 *batch++ = lower_32_bits(hws_address(hws, rq)); 217 *batch++ = rq->fence.seqno; 218 *batch++ = MI_NOOP; 219 220 memset(batch, 0, 1024); 221 batch += 1024 / sizeof(*batch); 222 223 *batch++ = MI_NOOP; 224 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 225 *batch++ = lower_32_bits(vma->node.start); 226 } else { 227 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 228 *batch++ = lower_32_bits(hws_address(hws, rq)); 229 *batch++ = rq->fence.seqno; 230 *batch++ = MI_NOOP; 231 232 memset(batch, 0, 1024); 233 batch += 1024 / sizeof(*batch); 234 235 *batch++ = MI_NOOP; 236 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 237 *batch++ = lower_32_bits(vma->node.start); 238 } 239 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 240 intel_gt_chipset_flush(engine->gt); 241 242 if (rq->engine->emit_init_breadcrumb) { 243 err = rq->engine->emit_init_breadcrumb(rq); 244 if (err) 245 goto cancel_rq; 246 } 247 248 flags = 0; 249 if (GRAPHICS_VER(gt->i915) <= 5) 250 flags |= I915_DISPATCH_SECURE; 251 252 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 253 254 cancel_rq: 255 if (err) { 256 i915_request_set_error_once(rq, err); 257 i915_request_add(rq); 258 } 259 unpin_hws: 260 i915_vma_unpin(hws); 261 unpin_vma: 262 i915_vma_unpin(vma); 263 i915_vm_put(vm); 264 return err ? ERR_PTR(err) : rq; 265 } 266 267 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 268 { 269 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 270 } 271 272 static void hang_fini(struct hang *h) 273 { 274 *h->batch = MI_BATCH_BUFFER_END; 275 intel_gt_chipset_flush(h->gt); 276 277 i915_gem_object_unpin_map(h->obj); 278 i915_gem_object_put(h->obj); 279 280 i915_gem_object_unpin_map(h->hws); 281 i915_gem_object_put(h->hws); 282 283 kernel_context_close(h->ctx); 284 285 igt_flush_test(h->gt->i915); 286 } 287 288 static bool wait_until_running(struct hang *h, struct i915_request *rq) 289 { 290 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 291 rq->fence.seqno), 292 10) && 293 wait_for(i915_seqno_passed(hws_seqno(h, rq), 294 rq->fence.seqno), 295 1000)); 296 } 297 298 static int igt_hang_sanitycheck(void *arg) 299 { 300 struct intel_gt *gt = arg; 301 struct i915_request *rq; 302 struct intel_engine_cs *engine; 303 enum intel_engine_id id; 304 struct hang h; 305 int err; 306 307 /* Basic check that we can execute our hanging batch */ 308 309 err = hang_init(&h, gt); 310 if (err) 311 return err; 312 313 for_each_engine(engine, gt, id) { 314 struct intel_wedge_me w; 315 long timeout; 316 317 if (!intel_engine_can_store_dword(engine)) 318 continue; 319 320 rq = hang_create_request(&h, engine); 321 if (IS_ERR(rq)) { 322 err = PTR_ERR(rq); 323 pr_err("Failed to create request for %s, err=%d\n", 324 engine->name, err); 325 goto fini; 326 } 327 328 i915_request_get(rq); 329 330 *h.batch = MI_BATCH_BUFFER_END; 331 intel_gt_chipset_flush(engine->gt); 332 333 i915_request_add(rq); 334 335 timeout = 0; 336 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 337 timeout = i915_request_wait(rq, 0, 338 MAX_SCHEDULE_TIMEOUT); 339 if (intel_gt_is_wedged(gt)) 340 timeout = -EIO; 341 342 i915_request_put(rq); 343 344 if (timeout < 0) { 345 err = timeout; 346 pr_err("Wait for request failed on %s, err=%d\n", 347 engine->name, err); 348 goto fini; 349 } 350 } 351 352 fini: 353 hang_fini(&h); 354 return err; 355 } 356 357 static bool wait_for_idle(struct intel_engine_cs *engine) 358 { 359 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 360 } 361 362 static int igt_reset_nop(void *arg) 363 { 364 struct intel_gt *gt = arg; 365 struct i915_gpu_error *global = >->i915->gpu_error; 366 struct intel_engine_cs *engine; 367 unsigned int reset_count, count; 368 enum intel_engine_id id; 369 IGT_TIMEOUT(end_time); 370 int err = 0; 371 372 /* Check that we can reset during non-user portions of requests */ 373 374 reset_count = i915_reset_count(global); 375 count = 0; 376 do { 377 for_each_engine(engine, gt, id) { 378 struct intel_context *ce; 379 int i; 380 381 ce = intel_context_create(engine); 382 if (IS_ERR(ce)) { 383 err = PTR_ERR(ce); 384 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 385 break; 386 } 387 388 for (i = 0; i < 16; i++) { 389 struct i915_request *rq; 390 391 rq = intel_context_create_request(ce); 392 if (IS_ERR(rq)) { 393 err = PTR_ERR(rq); 394 pr_err("[%s] Create request failed: %d!\n", 395 engine->name, err); 396 break; 397 } 398 399 i915_request_add(rq); 400 } 401 402 intel_context_put(ce); 403 } 404 405 igt_global_reset_lock(gt); 406 intel_gt_reset(gt, ALL_ENGINES, NULL); 407 igt_global_reset_unlock(gt); 408 409 if (intel_gt_is_wedged(gt)) { 410 pr_err("[%s] GT is wedged!\n", engine->name); 411 err = -EIO; 412 break; 413 } 414 415 if (i915_reset_count(global) != reset_count + ++count) { 416 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n", 417 engine->name, i915_reset_count(global), reset_count, count); 418 err = -EINVAL; 419 break; 420 } 421 422 err = igt_flush_test(gt->i915); 423 if (err) { 424 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 425 break; 426 } 427 } while (time_before(jiffies, end_time)); 428 pr_info("%s: %d resets\n", __func__, count); 429 430 if (igt_flush_test(gt->i915)) { 431 pr_err("Post flush failed: %d!\n", err); 432 err = -EIO; 433 } 434 435 return err; 436 } 437 438 static int igt_reset_nop_engine(void *arg) 439 { 440 struct intel_gt *gt = arg; 441 struct i915_gpu_error *global = >->i915->gpu_error; 442 struct intel_engine_cs *engine; 443 enum intel_engine_id id; 444 445 /* Check that we can engine-reset during non-user portions */ 446 447 if (!intel_has_reset_engine(gt)) 448 return 0; 449 450 for_each_engine(engine, gt, id) { 451 unsigned int reset_count, reset_engine_count, count; 452 struct intel_context *ce; 453 IGT_TIMEOUT(end_time); 454 int err; 455 456 if (intel_engine_uses_guc(engine)) { 457 /* Engine level resets are triggered by GuC when a hang 458 * is detected. They can't be triggered by the KMD any 459 * more. Thus a nop batch cannot be used as a reset test 460 */ 461 continue; 462 } 463 464 ce = intel_context_create(engine); 465 if (IS_ERR(ce)) { 466 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 467 return PTR_ERR(ce); 468 } 469 470 reset_count = i915_reset_count(global); 471 reset_engine_count = i915_reset_engine_count(global, engine); 472 count = 0; 473 474 st_engine_heartbeat_disable(engine); 475 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 476 >->reset.flags)); 477 do { 478 int i; 479 480 if (!wait_for_idle(engine)) { 481 pr_err("%s failed to idle before reset\n", 482 engine->name); 483 err = -EIO; 484 break; 485 } 486 487 for (i = 0; i < 16; i++) { 488 struct i915_request *rq; 489 490 rq = intel_context_create_request(ce); 491 if (IS_ERR(rq)) { 492 struct drm_printer p = 493 drm_info_printer(gt->i915->drm.dev); 494 intel_engine_dump(engine, &p, 495 "%s(%s): failed to submit request\n", 496 __func__, 497 engine->name); 498 499 GEM_TRACE("%s(%s): failed to submit request\n", 500 __func__, 501 engine->name); 502 GEM_TRACE_DUMP(); 503 504 intel_gt_set_wedged(gt); 505 506 err = PTR_ERR(rq); 507 break; 508 } 509 510 i915_request_add(rq); 511 } 512 err = intel_engine_reset(engine, NULL); 513 if (err) { 514 pr_err("intel_engine_reset(%s) failed, err:%d\n", 515 engine->name, err); 516 break; 517 } 518 519 if (i915_reset_count(global) != reset_count) { 520 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 521 err = -EINVAL; 522 break; 523 } 524 525 if (i915_reset_engine_count(global, engine) != 526 reset_engine_count + ++count) { 527 pr_err("%s engine reset not recorded!\n", 528 engine->name); 529 err = -EINVAL; 530 break; 531 } 532 } while (time_before(jiffies, end_time)); 533 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 534 st_engine_heartbeat_enable(engine); 535 536 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 537 538 intel_context_put(ce); 539 if (igt_flush_test(gt->i915)) 540 err = -EIO; 541 if (err) 542 return err; 543 } 544 545 return 0; 546 } 547 548 static void force_reset_timeout(struct intel_engine_cs *engine) 549 { 550 engine->reset_timeout.probability = 999; 551 atomic_set(&engine->reset_timeout.times, -1); 552 } 553 554 static void cancel_reset_timeout(struct intel_engine_cs *engine) 555 { 556 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout)); 557 } 558 559 static int igt_reset_fail_engine(void *arg) 560 { 561 struct intel_gt *gt = arg; 562 struct intel_engine_cs *engine; 563 enum intel_engine_id id; 564 565 /* Check that we can recover from engine-reset failues */ 566 567 if (!intel_has_reset_engine(gt)) 568 return 0; 569 570 for_each_engine(engine, gt, id) { 571 unsigned int count; 572 struct intel_context *ce; 573 IGT_TIMEOUT(end_time); 574 int err; 575 576 /* Can't manually break the reset if i915 doesn't perform it */ 577 if (intel_engine_uses_guc(engine)) 578 continue; 579 580 ce = intel_context_create(engine); 581 if (IS_ERR(ce)) { 582 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 583 return PTR_ERR(ce); 584 } 585 586 st_engine_heartbeat_disable(engine); 587 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 588 >->reset.flags)); 589 590 force_reset_timeout(engine); 591 err = intel_engine_reset(engine, NULL); 592 cancel_reset_timeout(engine); 593 if (err == 0) /* timeouts only generated on gen8+ */ 594 goto skip; 595 596 count = 0; 597 do { 598 struct i915_request *last = NULL; 599 int i; 600 601 if (!wait_for_idle(engine)) { 602 pr_err("%s failed to idle before reset\n", 603 engine->name); 604 err = -EIO; 605 break; 606 } 607 608 for (i = 0; i < count % 15; i++) { 609 struct i915_request *rq; 610 611 rq = intel_context_create_request(ce); 612 if (IS_ERR(rq)) { 613 struct drm_printer p = 614 drm_info_printer(gt->i915->drm.dev); 615 intel_engine_dump(engine, &p, 616 "%s(%s): failed to submit request\n", 617 __func__, 618 engine->name); 619 620 GEM_TRACE("%s(%s): failed to submit request\n", 621 __func__, 622 engine->name); 623 GEM_TRACE_DUMP(); 624 625 intel_gt_set_wedged(gt); 626 if (last) 627 i915_request_put(last); 628 629 err = PTR_ERR(rq); 630 goto out; 631 } 632 633 if (last) 634 i915_request_put(last); 635 last = i915_request_get(rq); 636 i915_request_add(rq); 637 } 638 639 if (count & 1) { 640 err = intel_engine_reset(engine, NULL); 641 if (err) { 642 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n", 643 engine->name, err); 644 GEM_TRACE_DUMP(); 645 i915_request_put(last); 646 break; 647 } 648 } else { 649 force_reset_timeout(engine); 650 err = intel_engine_reset(engine, NULL); 651 cancel_reset_timeout(engine); 652 if (err != -ETIMEDOUT) { 653 pr_err("intel_engine_reset(%s) did not fail, err:%d\n", 654 engine->name, err); 655 i915_request_put(last); 656 break; 657 } 658 } 659 660 err = 0; 661 if (last) { 662 if (i915_request_wait(last, 0, HZ / 2) < 0) { 663 struct drm_printer p = 664 drm_info_printer(gt->i915->drm.dev); 665 666 intel_engine_dump(engine, &p, 667 "%s(%s): failed to complete request\n", 668 __func__, 669 engine->name); 670 671 GEM_TRACE("%s(%s): failed to complete request\n", 672 __func__, 673 engine->name); 674 GEM_TRACE_DUMP(); 675 676 err = -EIO; 677 } 678 i915_request_put(last); 679 } 680 count++; 681 } while (err == 0 && time_before(jiffies, end_time)); 682 out: 683 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 684 skip: 685 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 686 st_engine_heartbeat_enable(engine); 687 intel_context_put(ce); 688 689 if (igt_flush_test(gt->i915)) 690 err = -EIO; 691 if (err) 692 return err; 693 } 694 695 return 0; 696 } 697 698 static int __igt_reset_engine(struct intel_gt *gt, bool active) 699 { 700 struct i915_gpu_error *global = >->i915->gpu_error; 701 struct intel_engine_cs *engine; 702 enum intel_engine_id id; 703 struct hang h; 704 int err = 0; 705 706 /* Check that we can issue an engine reset on an idle engine (no-op) */ 707 708 if (!intel_has_reset_engine(gt)) 709 return 0; 710 711 if (active) { 712 err = hang_init(&h, gt); 713 if (err) 714 return err; 715 } 716 717 for_each_engine(engine, gt, id) { 718 unsigned int reset_count, reset_engine_count; 719 unsigned long count; 720 bool using_guc = intel_engine_uses_guc(engine); 721 IGT_TIMEOUT(end_time); 722 723 if (using_guc && !active) 724 continue; 725 726 if (active && !intel_engine_can_store_dword(engine)) 727 continue; 728 729 if (!wait_for_idle(engine)) { 730 pr_err("%s failed to idle before reset\n", 731 engine->name); 732 err = -EIO; 733 break; 734 } 735 736 reset_count = i915_reset_count(global); 737 reset_engine_count = i915_reset_engine_count(global, engine); 738 739 st_engine_heartbeat_disable(engine); 740 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 741 >->reset.flags)); 742 count = 0; 743 do { 744 struct i915_request *rq = NULL; 745 struct intel_selftest_saved_policy saved; 746 int err2; 747 748 err = intel_selftest_modify_policy(engine, &saved, 749 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 750 if (err) { 751 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 752 break; 753 } 754 755 if (active) { 756 rq = hang_create_request(&h, engine); 757 if (IS_ERR(rq)) { 758 err = PTR_ERR(rq); 759 pr_err("[%s] Create hang request failed: %d!\n", 760 engine->name, err); 761 goto restore; 762 } 763 764 i915_request_get(rq); 765 i915_request_add(rq); 766 767 if (!wait_until_running(&h, rq)) { 768 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 769 770 pr_err("%s: Failed to start request %llx, at %x\n", 771 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 772 intel_engine_dump(engine, &p, 773 "%s\n", engine->name); 774 775 i915_request_put(rq); 776 err = -EIO; 777 goto restore; 778 } 779 } 780 781 if (!using_guc) { 782 err = intel_engine_reset(engine, NULL); 783 if (err) { 784 pr_err("intel_engine_reset(%s) failed, err:%d\n", 785 engine->name, err); 786 goto skip; 787 } 788 } 789 790 if (rq) { 791 /* Ensure the reset happens and kills the engine */ 792 err = intel_selftest_wait_for_rq(rq); 793 if (err) 794 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 795 engine->name, rq->fence.context, 796 rq->fence.seqno, rq->context->guc_id.id, err); 797 } 798 799 skip: 800 if (rq) 801 i915_request_put(rq); 802 803 if (i915_reset_count(global) != reset_count) { 804 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 805 err = -EINVAL; 806 goto restore; 807 } 808 809 /* GuC based resets are not logged per engine */ 810 if (!using_guc) { 811 if (i915_reset_engine_count(global, engine) != 812 ++reset_engine_count) { 813 pr_err("%s engine reset not recorded!\n", 814 engine->name); 815 err = -EINVAL; 816 goto restore; 817 } 818 } 819 820 count++; 821 822 restore: 823 err2 = intel_selftest_restore_policy(engine, &saved); 824 if (err2) 825 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err); 826 if (err == 0) 827 err = err2; 828 if (err) 829 break; 830 } while (time_before(jiffies, end_time)); 831 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 832 st_engine_heartbeat_enable(engine); 833 pr_info("%s: Completed %lu %s resets\n", 834 engine->name, count, active ? "active" : "idle"); 835 836 if (err) 837 break; 838 839 err = igt_flush_test(gt->i915); 840 if (err) { 841 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 842 break; 843 } 844 } 845 846 if (intel_gt_is_wedged(gt)) { 847 pr_err("GT is wedged!\n"); 848 err = -EIO; 849 } 850 851 if (active) 852 hang_fini(&h); 853 854 return err; 855 } 856 857 static int igt_reset_idle_engine(void *arg) 858 { 859 return __igt_reset_engine(arg, false); 860 } 861 862 static int igt_reset_active_engine(void *arg) 863 { 864 return __igt_reset_engine(arg, true); 865 } 866 867 struct active_engine { 868 struct task_struct *task; 869 struct intel_engine_cs *engine; 870 unsigned long resets; 871 unsigned int flags; 872 }; 873 874 #define TEST_ACTIVE BIT(0) 875 #define TEST_OTHERS BIT(1) 876 #define TEST_SELF BIT(2) 877 #define TEST_PRIORITY BIT(3) 878 879 static int active_request_put(struct i915_request *rq) 880 { 881 int err = 0; 882 883 if (!rq) 884 return 0; 885 886 if (i915_request_wait(rq, 0, 10 * HZ) < 0) { 887 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 888 rq->engine->name, 889 rq->fence.context, 890 rq->fence.seqno); 891 GEM_TRACE_DUMP(); 892 893 intel_gt_set_wedged(rq->engine->gt); 894 err = -EIO; 895 } 896 897 i915_request_put(rq); 898 899 return err; 900 } 901 902 static int active_engine(void *data) 903 { 904 I915_RND_STATE(prng); 905 struct active_engine *arg = data; 906 struct intel_engine_cs *engine = arg->engine; 907 struct i915_request *rq[8] = {}; 908 struct intel_context *ce[ARRAY_SIZE(rq)]; 909 unsigned long count; 910 int err = 0; 911 912 for (count = 0; count < ARRAY_SIZE(ce); count++) { 913 ce[count] = intel_context_create(engine); 914 if (IS_ERR(ce[count])) { 915 err = PTR_ERR(ce[count]); 916 pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err); 917 while (--count) 918 intel_context_put(ce[count]); 919 return err; 920 } 921 } 922 923 count = 0; 924 while (!kthread_should_stop()) { 925 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 926 struct i915_request *old = rq[idx]; 927 struct i915_request *new; 928 929 new = intel_context_create_request(ce[idx]); 930 if (IS_ERR(new)) { 931 err = PTR_ERR(new); 932 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err); 933 break; 934 } 935 936 rq[idx] = i915_request_get(new); 937 i915_request_add(new); 938 939 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) { 940 struct i915_sched_attr attr = { 941 .priority = 942 i915_prandom_u32_max_state(512, &prng), 943 }; 944 engine->sched_engine->schedule(rq[idx], &attr); 945 } 946 947 err = active_request_put(old); 948 if (err) { 949 pr_err("[%s] Request put failed: %d!\n", engine->name, err); 950 break; 951 } 952 953 cond_resched(); 954 } 955 956 for (count = 0; count < ARRAY_SIZE(rq); count++) { 957 int err__ = active_request_put(rq[count]); 958 959 if (err) 960 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err); 961 962 /* Keep the first error */ 963 if (!err) 964 err = err__; 965 966 intel_context_put(ce[count]); 967 } 968 969 return err; 970 } 971 972 static int __igt_reset_engines(struct intel_gt *gt, 973 const char *test_name, 974 unsigned int flags) 975 { 976 struct i915_gpu_error *global = >->i915->gpu_error; 977 struct intel_engine_cs *engine, *other; 978 enum intel_engine_id id, tmp; 979 struct hang h; 980 int err = 0; 981 982 /* Check that issuing a reset on one engine does not interfere 983 * with any other engine. 984 */ 985 986 if (!intel_has_reset_engine(gt)) 987 return 0; 988 989 if (flags & TEST_ACTIVE) { 990 err = hang_init(&h, gt); 991 if (err) 992 return err; 993 994 if (flags & TEST_PRIORITY) 995 h.ctx->sched.priority = 1024; 996 } 997 998 for_each_engine(engine, gt, id) { 999 struct active_engine threads[I915_NUM_ENGINES] = {}; 1000 unsigned long device = i915_reset_count(global); 1001 unsigned long count = 0, reported; 1002 bool using_guc = intel_engine_uses_guc(engine); 1003 IGT_TIMEOUT(end_time); 1004 1005 if (flags & TEST_ACTIVE) { 1006 if (!intel_engine_can_store_dword(engine)) 1007 continue; 1008 } else if (using_guc) 1009 continue; 1010 1011 if (!wait_for_idle(engine)) { 1012 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 1013 engine->name, test_name); 1014 err = -EIO; 1015 break; 1016 } 1017 1018 memset(threads, 0, sizeof(threads)); 1019 for_each_engine(other, gt, tmp) { 1020 struct task_struct *tsk; 1021 1022 threads[tmp].resets = 1023 i915_reset_engine_count(global, other); 1024 1025 if (other == engine && !(flags & TEST_SELF)) 1026 continue; 1027 1028 if (other != engine && !(flags & TEST_OTHERS)) 1029 continue; 1030 1031 threads[tmp].engine = other; 1032 threads[tmp].flags = flags; 1033 1034 tsk = kthread_run(active_engine, &threads[tmp], 1035 "igt/%s", other->name); 1036 if (IS_ERR(tsk)) { 1037 err = PTR_ERR(tsk); 1038 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1039 goto unwind; 1040 } 1041 1042 threads[tmp].task = tsk; 1043 get_task_struct(tsk); 1044 } 1045 1046 yield(); /* start all threads before we begin */ 1047 1048 st_engine_heartbeat_disable_no_pm(engine); 1049 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 1050 >->reset.flags)); 1051 do { 1052 struct i915_request *rq = NULL; 1053 struct intel_selftest_saved_policy saved; 1054 int err2; 1055 1056 err = intel_selftest_modify_policy(engine, &saved, 1057 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 1058 if (err) { 1059 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1060 break; 1061 } 1062 1063 if (flags & TEST_ACTIVE) { 1064 rq = hang_create_request(&h, engine); 1065 if (IS_ERR(rq)) { 1066 err = PTR_ERR(rq); 1067 pr_err("[%s] Create hang request failed: %d!\n", 1068 engine->name, err); 1069 goto restore; 1070 } 1071 1072 i915_request_get(rq); 1073 i915_request_add(rq); 1074 1075 if (!wait_until_running(&h, rq)) { 1076 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1077 1078 pr_err("%s: Failed to start request %llx, at %x\n", 1079 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1080 intel_engine_dump(engine, &p, 1081 "%s\n", engine->name); 1082 1083 i915_request_put(rq); 1084 err = -EIO; 1085 goto restore; 1086 } 1087 } else { 1088 intel_engine_pm_get(engine); 1089 } 1090 1091 if (!using_guc) { 1092 err = intel_engine_reset(engine, NULL); 1093 if (err) { 1094 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 1095 engine->name, test_name, err); 1096 goto restore; 1097 } 1098 } 1099 1100 if (rq) { 1101 /* Ensure the reset happens and kills the engine */ 1102 err = intel_selftest_wait_for_rq(rq); 1103 if (err) 1104 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 1105 engine->name, rq->fence.context, 1106 rq->fence.seqno, rq->context->guc_id.id, err); 1107 } 1108 1109 count++; 1110 1111 if (rq) { 1112 if (rq->fence.error != -EIO) { 1113 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n", 1114 engine->name, test_name, 1115 rq->fence.context, 1116 rq->fence.seqno, rq->context->guc_id.id); 1117 i915_request_put(rq); 1118 1119 GEM_TRACE_DUMP(); 1120 intel_gt_set_wedged(gt); 1121 err = -EIO; 1122 goto restore; 1123 } 1124 1125 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1126 struct drm_printer p = 1127 drm_info_printer(gt->i915->drm.dev); 1128 1129 pr_err("i915_reset_engine(%s:%s):" 1130 " failed to complete request %llx:%lld after reset\n", 1131 engine->name, test_name, 1132 rq->fence.context, 1133 rq->fence.seqno); 1134 intel_engine_dump(engine, &p, 1135 "%s\n", engine->name); 1136 i915_request_put(rq); 1137 1138 GEM_TRACE_DUMP(); 1139 intel_gt_set_wedged(gt); 1140 err = -EIO; 1141 goto restore; 1142 } 1143 1144 i915_request_put(rq); 1145 } 1146 1147 if (!(flags & TEST_ACTIVE)) 1148 intel_engine_pm_put(engine); 1149 1150 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 1151 struct drm_printer p = 1152 drm_info_printer(gt->i915->drm.dev); 1153 1154 pr_err("i915_reset_engine(%s:%s):" 1155 " failed to idle after reset\n", 1156 engine->name, test_name); 1157 intel_engine_dump(engine, &p, 1158 "%s\n", engine->name); 1159 1160 err = -EIO; 1161 goto restore; 1162 } 1163 1164 restore: 1165 err2 = intel_selftest_restore_policy(engine, &saved); 1166 if (err2) 1167 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2); 1168 if (err == 0) 1169 err = err2; 1170 if (err) 1171 break; 1172 } while (time_before(jiffies, end_time)); 1173 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 1174 st_engine_heartbeat_enable_no_pm(engine); 1175 1176 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 1177 engine->name, test_name, count); 1178 1179 /* GuC based resets are not logged per engine */ 1180 if (!using_guc) { 1181 reported = i915_reset_engine_count(global, engine); 1182 reported -= threads[engine->id].resets; 1183 if (reported != count) { 1184 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 1185 engine->name, test_name, count, reported); 1186 if (!err) 1187 err = -EINVAL; 1188 } 1189 } 1190 1191 unwind: 1192 for_each_engine(other, gt, tmp) { 1193 int ret; 1194 1195 if (!threads[tmp].task) 1196 continue; 1197 1198 ret = kthread_stop(threads[tmp].task); 1199 if (ret) { 1200 pr_err("kthread for other engine %s failed, err=%d\n", 1201 other->name, ret); 1202 if (!err) 1203 err = ret; 1204 } 1205 put_task_struct(threads[tmp].task); 1206 1207 /* GuC based resets are not logged per engine */ 1208 if (!using_guc) { 1209 if (other->uabi_class != engine->uabi_class && 1210 threads[tmp].resets != 1211 i915_reset_engine_count(global, other)) { 1212 pr_err("Innocent engine %s was reset (count=%ld)\n", 1213 other->name, 1214 i915_reset_engine_count(global, other) - 1215 threads[tmp].resets); 1216 if (!err) 1217 err = -EINVAL; 1218 } 1219 } 1220 } 1221 1222 if (device != i915_reset_count(global)) { 1223 pr_err("Global reset (count=%ld)!\n", 1224 i915_reset_count(global) - device); 1225 if (!err) 1226 err = -EINVAL; 1227 } 1228 1229 if (err) 1230 break; 1231 1232 err = igt_flush_test(gt->i915); 1233 if (err) { 1234 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1235 break; 1236 } 1237 } 1238 1239 if (intel_gt_is_wedged(gt)) 1240 err = -EIO; 1241 1242 if (flags & TEST_ACTIVE) 1243 hang_fini(&h); 1244 1245 return err; 1246 } 1247 1248 static int igt_reset_engines(void *arg) 1249 { 1250 static const struct { 1251 const char *name; 1252 unsigned int flags; 1253 } phases[] = { 1254 { "idle", 0 }, 1255 { "active", TEST_ACTIVE }, 1256 { "others-idle", TEST_OTHERS }, 1257 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1258 { 1259 "others-priority", 1260 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1261 }, 1262 { 1263 "self-priority", 1264 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1265 }, 1266 { } 1267 }; 1268 struct intel_gt *gt = arg; 1269 typeof(*phases) *p; 1270 int err; 1271 1272 for (p = phases; p->name; p++) { 1273 if (p->flags & TEST_PRIORITY) { 1274 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1275 continue; 1276 } 1277 1278 err = __igt_reset_engines(arg, p->name, p->flags); 1279 if (err) 1280 return err; 1281 } 1282 1283 return 0; 1284 } 1285 1286 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1287 { 1288 u32 count = i915_reset_count(>->i915->gpu_error); 1289 1290 intel_gt_reset(gt, mask, NULL); 1291 1292 return count; 1293 } 1294 1295 static int igt_reset_wait(void *arg) 1296 { 1297 struct intel_gt *gt = arg; 1298 struct i915_gpu_error *global = >->i915->gpu_error; 1299 struct intel_engine_cs *engine = gt->engine[RCS0]; 1300 struct i915_request *rq; 1301 unsigned int reset_count; 1302 struct hang h; 1303 long timeout; 1304 int err; 1305 1306 if (!engine || !intel_engine_can_store_dword(engine)) 1307 return 0; 1308 1309 /* Check that we detect a stuck waiter and issue a reset */ 1310 1311 igt_global_reset_lock(gt); 1312 1313 err = hang_init(&h, gt); 1314 if (err) { 1315 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1316 goto unlock; 1317 } 1318 1319 rq = hang_create_request(&h, engine); 1320 if (IS_ERR(rq)) { 1321 err = PTR_ERR(rq); 1322 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1323 goto fini; 1324 } 1325 1326 i915_request_get(rq); 1327 i915_request_add(rq); 1328 1329 if (!wait_until_running(&h, rq)) { 1330 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1331 1332 pr_err("%s: Failed to start request %llx, at %x\n", 1333 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1334 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1335 1336 intel_gt_set_wedged(gt); 1337 1338 err = -EIO; 1339 goto out_rq; 1340 } 1341 1342 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1343 1344 timeout = i915_request_wait(rq, 0, 10); 1345 if (timeout < 0) { 1346 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1347 timeout); 1348 err = timeout; 1349 goto out_rq; 1350 } 1351 1352 if (i915_reset_count(global) == reset_count) { 1353 pr_err("No GPU reset recorded!\n"); 1354 err = -EINVAL; 1355 goto out_rq; 1356 } 1357 1358 out_rq: 1359 i915_request_put(rq); 1360 fini: 1361 hang_fini(&h); 1362 unlock: 1363 igt_global_reset_unlock(gt); 1364 1365 if (intel_gt_is_wedged(gt)) 1366 return -EIO; 1367 1368 return err; 1369 } 1370 1371 struct evict_vma { 1372 struct completion completion; 1373 struct i915_vma *vma; 1374 }; 1375 1376 static int evict_vma(void *data) 1377 { 1378 struct evict_vma *arg = data; 1379 struct i915_address_space *vm = arg->vma->vm; 1380 struct drm_mm_node evict = arg->vma->node; 1381 int err; 1382 1383 complete(&arg->completion); 1384 1385 mutex_lock(&vm->mutex); 1386 err = i915_gem_evict_for_node(vm, &evict, 0); 1387 mutex_unlock(&vm->mutex); 1388 1389 return err; 1390 } 1391 1392 static int evict_fence(void *data) 1393 { 1394 struct evict_vma *arg = data; 1395 int err; 1396 1397 complete(&arg->completion); 1398 1399 /* Mark the fence register as dirty to force the mmio update. */ 1400 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1401 if (err) { 1402 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1403 return err; 1404 } 1405 1406 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1407 if (err) { 1408 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1409 return err; 1410 } 1411 1412 err = i915_vma_pin_fence(arg->vma); 1413 i915_vma_unpin(arg->vma); 1414 if (err) { 1415 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1416 return err; 1417 } 1418 1419 i915_vma_unpin_fence(arg->vma); 1420 1421 return 0; 1422 } 1423 1424 static int __igt_reset_evict_vma(struct intel_gt *gt, 1425 struct i915_address_space *vm, 1426 int (*fn)(void *), 1427 unsigned int flags) 1428 { 1429 struct intel_engine_cs *engine = gt->engine[RCS0]; 1430 struct drm_i915_gem_object *obj; 1431 struct task_struct *tsk = NULL; 1432 struct i915_request *rq; 1433 struct evict_vma arg; 1434 struct hang h; 1435 unsigned int pin_flags; 1436 int err; 1437 1438 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1439 return 0; 1440 1441 if (!engine || !intel_engine_can_store_dword(engine)) 1442 return 0; 1443 1444 /* Check that we can recover an unbind stuck on a hanging request */ 1445 1446 err = hang_init(&h, gt); 1447 if (err) { 1448 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1449 return err; 1450 } 1451 1452 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1453 if (IS_ERR(obj)) { 1454 err = PTR_ERR(obj); 1455 pr_err("[%s] Create object failed: %d!\n", engine->name, err); 1456 goto fini; 1457 } 1458 1459 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1460 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1461 if (err) { 1462 pr_err("Invalid X-tiling settings; err:%d\n", err); 1463 goto out_obj; 1464 } 1465 } 1466 1467 arg.vma = i915_vma_instance(obj, vm, NULL); 1468 if (IS_ERR(arg.vma)) { 1469 err = PTR_ERR(arg.vma); 1470 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err); 1471 goto out_obj; 1472 } 1473 1474 rq = hang_create_request(&h, engine); 1475 if (IS_ERR(rq)) { 1476 err = PTR_ERR(rq); 1477 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1478 goto out_obj; 1479 } 1480 1481 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1482 1483 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1484 pin_flags |= PIN_MAPPABLE; 1485 1486 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1487 if (err) { 1488 i915_request_add(rq); 1489 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err); 1490 goto out_obj; 1491 } 1492 1493 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1494 err = i915_vma_pin_fence(arg.vma); 1495 if (err) { 1496 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1497 i915_vma_unpin(arg.vma); 1498 i915_request_add(rq); 1499 goto out_obj; 1500 } 1501 } 1502 1503 i915_vma_lock(arg.vma); 1504 err = i915_request_await_object(rq, arg.vma->obj, 1505 flags & EXEC_OBJECT_WRITE); 1506 if (err == 0) { 1507 err = i915_vma_move_to_active(arg.vma, rq, flags); 1508 if (err) 1509 pr_err("[%s] Move to active failed: %d!\n", engine->name, err); 1510 } else { 1511 pr_err("[%s] Request await failed: %d!\n", engine->name, err); 1512 } 1513 1514 i915_vma_unlock(arg.vma); 1515 1516 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1517 i915_vma_unpin_fence(arg.vma); 1518 i915_vma_unpin(arg.vma); 1519 1520 i915_request_get(rq); 1521 i915_request_add(rq); 1522 if (err) 1523 goto out_rq; 1524 1525 if (!wait_until_running(&h, rq)) { 1526 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1527 1528 pr_err("%s: Failed to start request %llx, at %x\n", 1529 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1530 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1531 1532 intel_gt_set_wedged(gt); 1533 goto out_reset; 1534 } 1535 1536 init_completion(&arg.completion); 1537 1538 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1539 if (IS_ERR(tsk)) { 1540 err = PTR_ERR(tsk); 1541 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1542 tsk = NULL; 1543 goto out_reset; 1544 } 1545 get_task_struct(tsk); 1546 1547 wait_for_completion(&arg.completion); 1548 1549 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1550 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1551 1552 pr_err("igt/evict_vma kthread did not wait\n"); 1553 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1554 1555 intel_gt_set_wedged(gt); 1556 goto out_reset; 1557 } 1558 1559 out_reset: 1560 igt_global_reset_lock(gt); 1561 fake_hangcheck(gt, rq->engine->mask); 1562 igt_global_reset_unlock(gt); 1563 1564 if (tsk) { 1565 struct intel_wedge_me w; 1566 1567 /* The reset, even indirectly, should take less than 10ms. */ 1568 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1569 err = kthread_stop(tsk); 1570 1571 put_task_struct(tsk); 1572 } 1573 1574 out_rq: 1575 i915_request_put(rq); 1576 out_obj: 1577 i915_gem_object_put(obj); 1578 fini: 1579 hang_fini(&h); 1580 if (intel_gt_is_wedged(gt)) 1581 return -EIO; 1582 1583 return err; 1584 } 1585 1586 static int igt_reset_evict_ggtt(void *arg) 1587 { 1588 struct intel_gt *gt = arg; 1589 1590 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1591 evict_vma, EXEC_OBJECT_WRITE); 1592 } 1593 1594 static int igt_reset_evict_ppgtt(void *arg) 1595 { 1596 struct intel_gt *gt = arg; 1597 struct i915_ppgtt *ppgtt; 1598 int err; 1599 1600 /* aliasing == global gtt locking, covered above */ 1601 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1602 return 0; 1603 1604 ppgtt = i915_ppgtt_create(gt, 0); 1605 if (IS_ERR(ppgtt)) 1606 return PTR_ERR(ppgtt); 1607 1608 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1609 evict_vma, EXEC_OBJECT_WRITE); 1610 i915_vm_put(&ppgtt->vm); 1611 1612 return err; 1613 } 1614 1615 static int igt_reset_evict_fence(void *arg) 1616 { 1617 struct intel_gt *gt = arg; 1618 1619 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1620 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1621 } 1622 1623 static int wait_for_others(struct intel_gt *gt, 1624 struct intel_engine_cs *exclude) 1625 { 1626 struct intel_engine_cs *engine; 1627 enum intel_engine_id id; 1628 1629 for_each_engine(engine, gt, id) { 1630 if (engine == exclude) 1631 continue; 1632 1633 if (!wait_for_idle(engine)) 1634 return -EIO; 1635 } 1636 1637 return 0; 1638 } 1639 1640 static int igt_reset_queue(void *arg) 1641 { 1642 struct intel_gt *gt = arg; 1643 struct i915_gpu_error *global = >->i915->gpu_error; 1644 struct intel_engine_cs *engine; 1645 enum intel_engine_id id; 1646 struct hang h; 1647 int err; 1648 1649 /* Check that we replay pending requests following a hang */ 1650 1651 igt_global_reset_lock(gt); 1652 1653 err = hang_init(&h, gt); 1654 if (err) 1655 goto unlock; 1656 1657 for_each_engine(engine, gt, id) { 1658 struct intel_selftest_saved_policy saved; 1659 struct i915_request *prev; 1660 IGT_TIMEOUT(end_time); 1661 unsigned int count; 1662 bool using_guc = intel_engine_uses_guc(engine); 1663 1664 if (!intel_engine_can_store_dword(engine)) 1665 continue; 1666 1667 if (using_guc) { 1668 err = intel_selftest_modify_policy(engine, &saved, 1669 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK); 1670 if (err) { 1671 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1672 goto fini; 1673 } 1674 } 1675 1676 prev = hang_create_request(&h, engine); 1677 if (IS_ERR(prev)) { 1678 err = PTR_ERR(prev); 1679 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err); 1680 goto restore; 1681 } 1682 1683 i915_request_get(prev); 1684 i915_request_add(prev); 1685 1686 count = 0; 1687 do { 1688 struct i915_request *rq; 1689 unsigned int reset_count; 1690 1691 rq = hang_create_request(&h, engine); 1692 if (IS_ERR(rq)) { 1693 err = PTR_ERR(rq); 1694 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1695 goto restore; 1696 } 1697 1698 i915_request_get(rq); 1699 i915_request_add(rq); 1700 1701 /* 1702 * XXX We don't handle resetting the kernel context 1703 * very well. If we trigger a device reset twice in 1704 * quick succession while the kernel context is 1705 * executing, we may end up skipping the breadcrumb. 1706 * This is really only a problem for the selftest as 1707 * normally there is a large interlude between resets 1708 * (hangcheck), or we focus on resetting just one 1709 * engine and so avoid repeatedly resetting innocents. 1710 */ 1711 err = wait_for_others(gt, engine); 1712 if (err) { 1713 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1714 __func__, engine->name); 1715 i915_request_put(rq); 1716 i915_request_put(prev); 1717 1718 GEM_TRACE_DUMP(); 1719 intel_gt_set_wedged(gt); 1720 goto restore; 1721 } 1722 1723 if (!wait_until_running(&h, prev)) { 1724 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1725 1726 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1727 __func__, engine->name, 1728 prev->fence.seqno, hws_seqno(&h, prev)); 1729 intel_engine_dump(engine, &p, 1730 "%s\n", engine->name); 1731 1732 i915_request_put(rq); 1733 i915_request_put(prev); 1734 1735 intel_gt_set_wedged(gt); 1736 1737 err = -EIO; 1738 goto restore; 1739 } 1740 1741 reset_count = fake_hangcheck(gt, BIT(id)); 1742 1743 if (prev->fence.error != -EIO) { 1744 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1745 prev->fence.error); 1746 i915_request_put(rq); 1747 i915_request_put(prev); 1748 err = -EINVAL; 1749 goto restore; 1750 } 1751 1752 if (rq->fence.error) { 1753 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1754 rq->fence.error); 1755 i915_request_put(rq); 1756 i915_request_put(prev); 1757 err = -EINVAL; 1758 goto restore; 1759 } 1760 1761 if (i915_reset_count(global) == reset_count) { 1762 pr_err("No GPU reset recorded!\n"); 1763 i915_request_put(rq); 1764 i915_request_put(prev); 1765 err = -EINVAL; 1766 goto restore; 1767 } 1768 1769 i915_request_put(prev); 1770 prev = rq; 1771 count++; 1772 } while (time_before(jiffies, end_time)); 1773 pr_info("%s: Completed %d queued resets\n", 1774 engine->name, count); 1775 1776 *h.batch = MI_BATCH_BUFFER_END; 1777 intel_gt_chipset_flush(engine->gt); 1778 1779 i915_request_put(prev); 1780 1781 restore: 1782 if (using_guc) { 1783 int err2 = intel_selftest_restore_policy(engine, &saved); 1784 1785 if (err2) 1786 pr_err("%s:%d> [%s] Restore policy failed: %d!\n", 1787 __func__, __LINE__, engine->name, err2); 1788 if (err == 0) 1789 err = err2; 1790 } 1791 if (err) 1792 goto fini; 1793 1794 err = igt_flush_test(gt->i915); 1795 if (err) { 1796 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1797 break; 1798 } 1799 } 1800 1801 fini: 1802 hang_fini(&h); 1803 unlock: 1804 igt_global_reset_unlock(gt); 1805 1806 if (intel_gt_is_wedged(gt)) 1807 return -EIO; 1808 1809 return err; 1810 } 1811 1812 static int igt_handle_error(void *arg) 1813 { 1814 struct intel_gt *gt = arg; 1815 struct i915_gpu_error *global = >->i915->gpu_error; 1816 struct intel_engine_cs *engine = gt->engine[RCS0]; 1817 struct hang h; 1818 struct i915_request *rq; 1819 struct i915_gpu_coredump *error; 1820 int err; 1821 1822 /* Check that we can issue a global GPU and engine reset */ 1823 1824 if (!intel_has_reset_engine(gt)) 1825 return 0; 1826 1827 if (!engine || !intel_engine_can_store_dword(engine)) 1828 return 0; 1829 1830 err = hang_init(&h, gt); 1831 if (err) { 1832 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1833 return err; 1834 } 1835 1836 rq = hang_create_request(&h, engine); 1837 if (IS_ERR(rq)) { 1838 err = PTR_ERR(rq); 1839 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1840 goto err_fini; 1841 } 1842 1843 i915_request_get(rq); 1844 i915_request_add(rq); 1845 1846 if (!wait_until_running(&h, rq)) { 1847 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1848 1849 pr_err("%s: Failed to start request %llx, at %x\n", 1850 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1851 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1852 1853 intel_gt_set_wedged(gt); 1854 1855 err = -EIO; 1856 goto err_request; 1857 } 1858 1859 /* Temporarily disable error capture */ 1860 error = xchg(&global->first_error, (void *)-1); 1861 1862 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1863 1864 xchg(&global->first_error, error); 1865 1866 if (rq->fence.error != -EIO) { 1867 pr_err("Guilty request not identified!\n"); 1868 err = -EINVAL; 1869 goto err_request; 1870 } 1871 1872 err_request: 1873 i915_request_put(rq); 1874 err_fini: 1875 hang_fini(&h); 1876 return err; 1877 } 1878 1879 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1880 const struct igt_atomic_section *p, 1881 const char *mode) 1882 { 1883 struct tasklet_struct * const t = &engine->sched_engine->tasklet; 1884 int err; 1885 1886 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1887 engine->name, mode, p->name); 1888 1889 if (t->func) 1890 tasklet_disable(t); 1891 if (strcmp(p->name, "softirq")) 1892 local_bh_disable(); 1893 p->critical_section_begin(); 1894 1895 err = __intel_engine_reset_bh(engine, NULL); 1896 1897 p->critical_section_end(); 1898 if (strcmp(p->name, "softirq")) 1899 local_bh_enable(); 1900 if (t->func) { 1901 tasklet_enable(t); 1902 tasklet_hi_schedule(t); 1903 } 1904 1905 if (err) 1906 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1907 engine->name, mode, p->name); 1908 1909 return err; 1910 } 1911 1912 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1913 const struct igt_atomic_section *p) 1914 { 1915 struct i915_request *rq; 1916 struct hang h; 1917 int err; 1918 1919 err = __igt_atomic_reset_engine(engine, p, "idle"); 1920 if (err) 1921 return err; 1922 1923 err = hang_init(&h, engine->gt); 1924 if (err) { 1925 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1926 return err; 1927 } 1928 1929 rq = hang_create_request(&h, engine); 1930 if (IS_ERR(rq)) { 1931 err = PTR_ERR(rq); 1932 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1933 goto out; 1934 } 1935 1936 i915_request_get(rq); 1937 i915_request_add(rq); 1938 1939 if (wait_until_running(&h, rq)) { 1940 err = __igt_atomic_reset_engine(engine, p, "active"); 1941 } else { 1942 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1943 __func__, engine->name, 1944 rq->fence.seqno, hws_seqno(&h, rq)); 1945 intel_gt_set_wedged(engine->gt); 1946 err = -EIO; 1947 } 1948 1949 if (err == 0) { 1950 struct intel_wedge_me w; 1951 1952 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1953 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1954 if (intel_gt_is_wedged(engine->gt)) 1955 err = -EIO; 1956 } 1957 1958 i915_request_put(rq); 1959 out: 1960 hang_fini(&h); 1961 return err; 1962 } 1963 1964 static int igt_reset_engines_atomic(void *arg) 1965 { 1966 struct intel_gt *gt = arg; 1967 const typeof(*igt_atomic_phases) *p; 1968 int err = 0; 1969 1970 /* Check that the engines resets are usable from atomic context */ 1971 1972 if (!intel_has_reset_engine(gt)) 1973 return 0; 1974 1975 if (intel_uc_uses_guc_submission(>->uc)) 1976 return 0; 1977 1978 igt_global_reset_lock(gt); 1979 1980 /* Flush any requests before we get started and check basics */ 1981 if (!igt_force_reset(gt)) 1982 goto unlock; 1983 1984 for (p = igt_atomic_phases; p->name; p++) { 1985 struct intel_engine_cs *engine; 1986 enum intel_engine_id id; 1987 1988 for_each_engine(engine, gt, id) { 1989 err = igt_atomic_reset_engine(engine, p); 1990 if (err) 1991 goto out; 1992 } 1993 } 1994 1995 out: 1996 /* As we poke around the guts, do a full reset before continuing. */ 1997 igt_force_reset(gt); 1998 unlock: 1999 igt_global_reset_unlock(gt); 2000 2001 return err; 2002 } 2003 2004 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 2005 { 2006 static const struct i915_subtest tests[] = { 2007 SUBTEST(igt_hang_sanitycheck), 2008 SUBTEST(igt_reset_nop), 2009 SUBTEST(igt_reset_nop_engine), 2010 SUBTEST(igt_reset_idle_engine), 2011 SUBTEST(igt_reset_active_engine), 2012 SUBTEST(igt_reset_fail_engine), 2013 SUBTEST(igt_reset_engines), 2014 SUBTEST(igt_reset_engines_atomic), 2015 SUBTEST(igt_reset_queue), 2016 SUBTEST(igt_reset_wait), 2017 SUBTEST(igt_reset_evict_ggtt), 2018 SUBTEST(igt_reset_evict_ppgtt), 2019 SUBTEST(igt_reset_evict_fence), 2020 SUBTEST(igt_handle_error), 2021 }; 2022 struct intel_gt *gt = to_gt(i915); 2023 intel_wakeref_t wakeref; 2024 int err; 2025 2026 if (!intel_has_gpu_reset(gt)) 2027 return 0; 2028 2029 if (intel_gt_is_wedged(gt)) 2030 return -EIO; /* we're long past hope of a successful reset */ 2031 2032 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 2033 2034 err = intel_gt_live_subtests(tests, gt); 2035 2036 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 2037 2038 return err; 2039 } 2040