1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2016 Intel Corporation 4 */ 5 6 #include <linux/kthread.h> 7 8 #include "gem/i915_gem_context.h" 9 #include "gem/i915_gem_internal.h" 10 11 #include "i915_gem_evict.h" 12 #include "intel_gt.h" 13 #include "intel_engine_heartbeat.h" 14 #include "intel_engine_pm.h" 15 #include "selftest_engine_heartbeat.h" 16 17 #include "i915_selftest.h" 18 #include "selftests/i915_random.h" 19 #include "selftests/igt_flush_test.h" 20 #include "selftests/igt_reset.h" 21 #include "selftests/igt_atomic.h" 22 #include "selftests/igt_spinner.h" 23 #include "selftests/intel_scheduler_helpers.h" 24 25 #include "selftests/mock_drm.h" 26 27 #include "gem/selftests/mock_context.h" 28 #include "gem/selftests/igt_gem_utils.h" 29 30 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 31 32 struct hang { 33 struct intel_gt *gt; 34 struct drm_i915_gem_object *hws; 35 struct drm_i915_gem_object *obj; 36 struct i915_gem_context *ctx; 37 u32 *seqno; 38 u32 *batch; 39 }; 40 41 static int hang_init(struct hang *h, struct intel_gt *gt) 42 { 43 void *vaddr; 44 int err; 45 46 memset(h, 0, sizeof(*h)); 47 h->gt = gt; 48 49 h->ctx = kernel_context(gt->i915, NULL); 50 if (IS_ERR(h->ctx)) 51 return PTR_ERR(h->ctx); 52 53 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 54 55 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 56 if (IS_ERR(h->hws)) { 57 err = PTR_ERR(h->hws); 58 goto err_ctx; 59 } 60 61 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 62 if (IS_ERR(h->obj)) { 63 err = PTR_ERR(h->obj); 64 goto err_hws; 65 } 66 67 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 68 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB); 69 if (IS_ERR(vaddr)) { 70 err = PTR_ERR(vaddr); 71 goto err_obj; 72 } 73 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 74 75 vaddr = i915_gem_object_pin_map_unlocked(h->obj, 76 i915_coherent_map_type(gt->i915, h->obj, false)); 77 if (IS_ERR(vaddr)) { 78 err = PTR_ERR(vaddr); 79 goto err_unpin_hws; 80 } 81 h->batch = vaddr; 82 83 return 0; 84 85 err_unpin_hws: 86 i915_gem_object_unpin_map(h->hws); 87 err_obj: 88 i915_gem_object_put(h->obj); 89 err_hws: 90 i915_gem_object_put(h->hws); 91 err_ctx: 92 kernel_context_close(h->ctx); 93 return err; 94 } 95 96 static u64 hws_address(const struct i915_vma *hws, 97 const struct i915_request *rq) 98 { 99 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 100 } 101 102 static int move_to_active(struct i915_vma *vma, 103 struct i915_request *rq, 104 unsigned int flags) 105 { 106 int err; 107 108 i915_vma_lock(vma); 109 err = i915_vma_move_to_active(vma, rq, flags); 110 i915_vma_unlock(vma); 111 112 return err; 113 } 114 115 static struct i915_request * 116 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 117 { 118 struct intel_gt *gt = h->gt; 119 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx); 120 struct drm_i915_gem_object *obj; 121 struct i915_request *rq = NULL; 122 struct i915_vma *hws, *vma; 123 unsigned int flags; 124 void *vaddr; 125 u32 *batch; 126 int err; 127 128 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 129 if (IS_ERR(obj)) { 130 i915_vm_put(vm); 131 return ERR_CAST(obj); 132 } 133 134 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false)); 135 if (IS_ERR(vaddr)) { 136 i915_gem_object_put(obj); 137 i915_vm_put(vm); 138 return ERR_CAST(vaddr); 139 } 140 141 i915_gem_object_unpin_map(h->obj); 142 i915_gem_object_put(h->obj); 143 144 h->obj = obj; 145 h->batch = vaddr; 146 147 vma = i915_vma_instance(h->obj, vm, NULL); 148 if (IS_ERR(vma)) { 149 i915_vm_put(vm); 150 return ERR_CAST(vma); 151 } 152 153 hws = i915_vma_instance(h->hws, vm, NULL); 154 if (IS_ERR(hws)) { 155 i915_vm_put(vm); 156 return ERR_CAST(hws); 157 } 158 159 err = i915_vma_pin(vma, 0, 0, PIN_USER); 160 if (err) { 161 i915_vm_put(vm); 162 return ERR_PTR(err); 163 } 164 165 err = i915_vma_pin(hws, 0, 0, PIN_USER); 166 if (err) 167 goto unpin_vma; 168 169 rq = igt_request_alloc(h->ctx, engine); 170 if (IS_ERR(rq)) { 171 err = PTR_ERR(rq); 172 goto unpin_hws; 173 } 174 175 err = move_to_active(vma, rq, 0); 176 if (err) 177 goto cancel_rq; 178 179 err = move_to_active(hws, rq, 0); 180 if (err) 181 goto cancel_rq; 182 183 batch = h->batch; 184 if (GRAPHICS_VER(gt->i915) >= 8) { 185 *batch++ = MI_STORE_DWORD_IMM_GEN4; 186 *batch++ = lower_32_bits(hws_address(hws, rq)); 187 *batch++ = upper_32_bits(hws_address(hws, rq)); 188 *batch++ = rq->fence.seqno; 189 *batch++ = MI_NOOP; 190 191 memset(batch, 0, 1024); 192 batch += 1024 / sizeof(*batch); 193 194 *batch++ = MI_NOOP; 195 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 196 *batch++ = lower_32_bits(vma->node.start); 197 *batch++ = upper_32_bits(vma->node.start); 198 } else if (GRAPHICS_VER(gt->i915) >= 6) { 199 *batch++ = MI_STORE_DWORD_IMM_GEN4; 200 *batch++ = 0; 201 *batch++ = lower_32_bits(hws_address(hws, rq)); 202 *batch++ = rq->fence.seqno; 203 *batch++ = MI_NOOP; 204 205 memset(batch, 0, 1024); 206 batch += 1024 / sizeof(*batch); 207 208 *batch++ = MI_NOOP; 209 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 210 *batch++ = lower_32_bits(vma->node.start); 211 } else if (GRAPHICS_VER(gt->i915) >= 4) { 212 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 213 *batch++ = 0; 214 *batch++ = lower_32_bits(hws_address(hws, rq)); 215 *batch++ = rq->fence.seqno; 216 *batch++ = MI_NOOP; 217 218 memset(batch, 0, 1024); 219 batch += 1024 / sizeof(*batch); 220 221 *batch++ = MI_NOOP; 222 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 223 *batch++ = lower_32_bits(vma->node.start); 224 } else { 225 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 226 *batch++ = lower_32_bits(hws_address(hws, rq)); 227 *batch++ = rq->fence.seqno; 228 *batch++ = MI_NOOP; 229 230 memset(batch, 0, 1024); 231 batch += 1024 / sizeof(*batch); 232 233 *batch++ = MI_NOOP; 234 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 235 *batch++ = lower_32_bits(vma->node.start); 236 } 237 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 238 intel_gt_chipset_flush(engine->gt); 239 240 if (rq->engine->emit_init_breadcrumb) { 241 err = rq->engine->emit_init_breadcrumb(rq); 242 if (err) 243 goto cancel_rq; 244 } 245 246 flags = 0; 247 if (GRAPHICS_VER(gt->i915) <= 5) 248 flags |= I915_DISPATCH_SECURE; 249 250 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 251 252 cancel_rq: 253 if (err) { 254 i915_request_set_error_once(rq, err); 255 i915_request_add(rq); 256 } 257 unpin_hws: 258 i915_vma_unpin(hws); 259 unpin_vma: 260 i915_vma_unpin(vma); 261 i915_vm_put(vm); 262 return err ? ERR_PTR(err) : rq; 263 } 264 265 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 266 { 267 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 268 } 269 270 static void hang_fini(struct hang *h) 271 { 272 *h->batch = MI_BATCH_BUFFER_END; 273 intel_gt_chipset_flush(h->gt); 274 275 i915_gem_object_unpin_map(h->obj); 276 i915_gem_object_put(h->obj); 277 278 i915_gem_object_unpin_map(h->hws); 279 i915_gem_object_put(h->hws); 280 281 kernel_context_close(h->ctx); 282 283 igt_flush_test(h->gt->i915); 284 } 285 286 static bool wait_until_running(struct hang *h, struct i915_request *rq) 287 { 288 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 289 rq->fence.seqno), 290 10) && 291 wait_for(i915_seqno_passed(hws_seqno(h, rq), 292 rq->fence.seqno), 293 1000)); 294 } 295 296 static int igt_hang_sanitycheck(void *arg) 297 { 298 struct intel_gt *gt = arg; 299 struct i915_request *rq; 300 struct intel_engine_cs *engine; 301 enum intel_engine_id id; 302 struct hang h; 303 int err; 304 305 /* Basic check that we can execute our hanging batch */ 306 307 err = hang_init(&h, gt); 308 if (err) 309 return err; 310 311 for_each_engine(engine, gt, id) { 312 struct intel_wedge_me w; 313 long timeout; 314 315 if (!intel_engine_can_store_dword(engine)) 316 continue; 317 318 rq = hang_create_request(&h, engine); 319 if (IS_ERR(rq)) { 320 err = PTR_ERR(rq); 321 pr_err("Failed to create request for %s, err=%d\n", 322 engine->name, err); 323 goto fini; 324 } 325 326 i915_request_get(rq); 327 328 *h.batch = MI_BATCH_BUFFER_END; 329 intel_gt_chipset_flush(engine->gt); 330 331 i915_request_add(rq); 332 333 timeout = 0; 334 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 335 timeout = i915_request_wait(rq, 0, 336 MAX_SCHEDULE_TIMEOUT); 337 if (intel_gt_is_wedged(gt)) 338 timeout = -EIO; 339 340 i915_request_put(rq); 341 342 if (timeout < 0) { 343 err = timeout; 344 pr_err("Wait for request failed on %s, err=%d\n", 345 engine->name, err); 346 goto fini; 347 } 348 } 349 350 fini: 351 hang_fini(&h); 352 return err; 353 } 354 355 static bool wait_for_idle(struct intel_engine_cs *engine) 356 { 357 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 358 } 359 360 static int igt_reset_nop(void *arg) 361 { 362 struct intel_gt *gt = arg; 363 struct i915_gpu_error *global = >->i915->gpu_error; 364 struct intel_engine_cs *engine; 365 unsigned int reset_count, count; 366 enum intel_engine_id id; 367 IGT_TIMEOUT(end_time); 368 int err = 0; 369 370 /* Check that we can reset during non-user portions of requests */ 371 372 reset_count = i915_reset_count(global); 373 count = 0; 374 do { 375 for_each_engine(engine, gt, id) { 376 struct intel_context *ce; 377 int i; 378 379 ce = intel_context_create(engine); 380 if (IS_ERR(ce)) { 381 err = PTR_ERR(ce); 382 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 383 break; 384 } 385 386 for (i = 0; i < 16; i++) { 387 struct i915_request *rq; 388 389 rq = intel_context_create_request(ce); 390 if (IS_ERR(rq)) { 391 err = PTR_ERR(rq); 392 pr_err("[%s] Create request failed: %d!\n", 393 engine->name, err); 394 break; 395 } 396 397 i915_request_add(rq); 398 } 399 400 intel_context_put(ce); 401 } 402 403 igt_global_reset_lock(gt); 404 intel_gt_reset(gt, ALL_ENGINES, NULL); 405 igt_global_reset_unlock(gt); 406 407 if (intel_gt_is_wedged(gt)) { 408 pr_err("[%s] GT is wedged!\n", engine->name); 409 err = -EIO; 410 break; 411 } 412 413 if (i915_reset_count(global) != reset_count + ++count) { 414 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n", 415 engine->name, i915_reset_count(global), reset_count, count); 416 err = -EINVAL; 417 break; 418 } 419 420 err = igt_flush_test(gt->i915); 421 if (err) { 422 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 423 break; 424 } 425 } while (time_before(jiffies, end_time)); 426 pr_info("%s: %d resets\n", __func__, count); 427 428 if (igt_flush_test(gt->i915)) { 429 pr_err("Post flush failed: %d!\n", err); 430 err = -EIO; 431 } 432 433 return err; 434 } 435 436 static int igt_reset_nop_engine(void *arg) 437 { 438 struct intel_gt *gt = arg; 439 struct i915_gpu_error *global = >->i915->gpu_error; 440 struct intel_engine_cs *engine; 441 enum intel_engine_id id; 442 443 /* Check that we can engine-reset during non-user portions */ 444 445 if (!intel_has_reset_engine(gt)) 446 return 0; 447 448 for_each_engine(engine, gt, id) { 449 unsigned int reset_count, reset_engine_count, count; 450 struct intel_context *ce; 451 IGT_TIMEOUT(end_time); 452 int err; 453 454 if (intel_engine_uses_guc(engine)) { 455 /* Engine level resets are triggered by GuC when a hang 456 * is detected. They can't be triggered by the KMD any 457 * more. Thus a nop batch cannot be used as a reset test 458 */ 459 continue; 460 } 461 462 ce = intel_context_create(engine); 463 if (IS_ERR(ce)) { 464 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 465 return PTR_ERR(ce); 466 } 467 468 reset_count = i915_reset_count(global); 469 reset_engine_count = i915_reset_engine_count(global, engine); 470 count = 0; 471 472 st_engine_heartbeat_disable(engine); 473 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 474 >->reset.flags)); 475 do { 476 int i; 477 478 if (!wait_for_idle(engine)) { 479 pr_err("%s failed to idle before reset\n", 480 engine->name); 481 err = -EIO; 482 break; 483 } 484 485 for (i = 0; i < 16; i++) { 486 struct i915_request *rq; 487 488 rq = intel_context_create_request(ce); 489 if (IS_ERR(rq)) { 490 struct drm_printer p = 491 drm_info_printer(gt->i915->drm.dev); 492 intel_engine_dump(engine, &p, 493 "%s(%s): failed to submit request\n", 494 __func__, 495 engine->name); 496 497 GEM_TRACE("%s(%s): failed to submit request\n", 498 __func__, 499 engine->name); 500 GEM_TRACE_DUMP(); 501 502 intel_gt_set_wedged(gt); 503 504 err = PTR_ERR(rq); 505 break; 506 } 507 508 i915_request_add(rq); 509 } 510 err = intel_engine_reset(engine, NULL); 511 if (err) { 512 pr_err("intel_engine_reset(%s) failed, err:%d\n", 513 engine->name, err); 514 break; 515 } 516 517 if (i915_reset_count(global) != reset_count) { 518 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 519 err = -EINVAL; 520 break; 521 } 522 523 if (i915_reset_engine_count(global, engine) != 524 reset_engine_count + ++count) { 525 pr_err("%s engine reset not recorded!\n", 526 engine->name); 527 err = -EINVAL; 528 break; 529 } 530 } while (time_before(jiffies, end_time)); 531 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 532 st_engine_heartbeat_enable(engine); 533 534 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 535 536 intel_context_put(ce); 537 if (igt_flush_test(gt->i915)) 538 err = -EIO; 539 if (err) 540 return err; 541 } 542 543 return 0; 544 } 545 546 static void force_reset_timeout(struct intel_engine_cs *engine) 547 { 548 engine->reset_timeout.probability = 999; 549 atomic_set(&engine->reset_timeout.times, -1); 550 } 551 552 static void cancel_reset_timeout(struct intel_engine_cs *engine) 553 { 554 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout)); 555 } 556 557 static int igt_reset_fail_engine(void *arg) 558 { 559 struct intel_gt *gt = arg; 560 struct intel_engine_cs *engine; 561 enum intel_engine_id id; 562 563 /* Check that we can recover from engine-reset failues */ 564 565 if (!intel_has_reset_engine(gt)) 566 return 0; 567 568 for_each_engine(engine, gt, id) { 569 unsigned int count; 570 struct intel_context *ce; 571 IGT_TIMEOUT(end_time); 572 int err; 573 574 /* Can't manually break the reset if i915 doesn't perform it */ 575 if (intel_engine_uses_guc(engine)) 576 continue; 577 578 ce = intel_context_create(engine); 579 if (IS_ERR(ce)) { 580 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 581 return PTR_ERR(ce); 582 } 583 584 st_engine_heartbeat_disable(engine); 585 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 586 >->reset.flags)); 587 588 force_reset_timeout(engine); 589 err = intel_engine_reset(engine, NULL); 590 cancel_reset_timeout(engine); 591 if (err == 0) /* timeouts only generated on gen8+ */ 592 goto skip; 593 594 count = 0; 595 do { 596 struct i915_request *last = NULL; 597 int i; 598 599 if (!wait_for_idle(engine)) { 600 pr_err("%s failed to idle before reset\n", 601 engine->name); 602 err = -EIO; 603 break; 604 } 605 606 for (i = 0; i < count % 15; i++) { 607 struct i915_request *rq; 608 609 rq = intel_context_create_request(ce); 610 if (IS_ERR(rq)) { 611 struct drm_printer p = 612 drm_info_printer(gt->i915->drm.dev); 613 intel_engine_dump(engine, &p, 614 "%s(%s): failed to submit request\n", 615 __func__, 616 engine->name); 617 618 GEM_TRACE("%s(%s): failed to submit request\n", 619 __func__, 620 engine->name); 621 GEM_TRACE_DUMP(); 622 623 intel_gt_set_wedged(gt); 624 if (last) 625 i915_request_put(last); 626 627 err = PTR_ERR(rq); 628 goto out; 629 } 630 631 if (last) 632 i915_request_put(last); 633 last = i915_request_get(rq); 634 i915_request_add(rq); 635 } 636 637 if (count & 1) { 638 err = intel_engine_reset(engine, NULL); 639 if (err) { 640 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n", 641 engine->name, err); 642 GEM_TRACE_DUMP(); 643 i915_request_put(last); 644 break; 645 } 646 } else { 647 force_reset_timeout(engine); 648 err = intel_engine_reset(engine, NULL); 649 cancel_reset_timeout(engine); 650 if (err != -ETIMEDOUT) { 651 pr_err("intel_engine_reset(%s) did not fail, err:%d\n", 652 engine->name, err); 653 i915_request_put(last); 654 break; 655 } 656 } 657 658 err = 0; 659 if (last) { 660 if (i915_request_wait(last, 0, HZ / 2) < 0) { 661 struct drm_printer p = 662 drm_info_printer(gt->i915->drm.dev); 663 664 intel_engine_dump(engine, &p, 665 "%s(%s): failed to complete request\n", 666 __func__, 667 engine->name); 668 669 GEM_TRACE("%s(%s): failed to complete request\n", 670 __func__, 671 engine->name); 672 GEM_TRACE_DUMP(); 673 674 err = -EIO; 675 } 676 i915_request_put(last); 677 } 678 count++; 679 } while (err == 0 && time_before(jiffies, end_time)); 680 out: 681 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 682 skip: 683 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 684 st_engine_heartbeat_enable(engine); 685 intel_context_put(ce); 686 687 if (igt_flush_test(gt->i915)) 688 err = -EIO; 689 if (err) 690 return err; 691 } 692 693 return 0; 694 } 695 696 static int __igt_reset_engine(struct intel_gt *gt, bool active) 697 { 698 struct i915_gpu_error *global = >->i915->gpu_error; 699 struct intel_engine_cs *engine; 700 enum intel_engine_id id; 701 struct hang h; 702 int err = 0; 703 704 /* Check that we can issue an engine reset on an idle engine (no-op) */ 705 706 if (!intel_has_reset_engine(gt)) 707 return 0; 708 709 if (active) { 710 err = hang_init(&h, gt); 711 if (err) 712 return err; 713 } 714 715 for_each_engine(engine, gt, id) { 716 unsigned int reset_count, reset_engine_count; 717 unsigned long count; 718 bool using_guc = intel_engine_uses_guc(engine); 719 IGT_TIMEOUT(end_time); 720 721 if (using_guc && !active) 722 continue; 723 724 if (active && !intel_engine_can_store_dword(engine)) 725 continue; 726 727 if (!wait_for_idle(engine)) { 728 pr_err("%s failed to idle before reset\n", 729 engine->name); 730 err = -EIO; 731 break; 732 } 733 734 reset_count = i915_reset_count(global); 735 reset_engine_count = i915_reset_engine_count(global, engine); 736 737 st_engine_heartbeat_disable(engine); 738 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 739 >->reset.flags)); 740 count = 0; 741 do { 742 struct i915_request *rq = NULL; 743 struct intel_selftest_saved_policy saved; 744 int err2; 745 746 err = intel_selftest_modify_policy(engine, &saved, 747 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 748 if (err) { 749 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 750 break; 751 } 752 753 if (active) { 754 rq = hang_create_request(&h, engine); 755 if (IS_ERR(rq)) { 756 err = PTR_ERR(rq); 757 pr_err("[%s] Create hang request failed: %d!\n", 758 engine->name, err); 759 goto restore; 760 } 761 762 i915_request_get(rq); 763 i915_request_add(rq); 764 765 if (!wait_until_running(&h, rq)) { 766 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 767 768 pr_err("%s: Failed to start request %llx, at %x\n", 769 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 770 intel_engine_dump(engine, &p, 771 "%s\n", engine->name); 772 773 i915_request_put(rq); 774 err = -EIO; 775 goto restore; 776 } 777 } 778 779 if (!using_guc) { 780 err = intel_engine_reset(engine, NULL); 781 if (err) { 782 pr_err("intel_engine_reset(%s) failed, err:%d\n", 783 engine->name, err); 784 goto skip; 785 } 786 } 787 788 if (rq) { 789 /* Ensure the reset happens and kills the engine */ 790 err = intel_selftest_wait_for_rq(rq); 791 if (err) 792 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 793 engine->name, rq->fence.context, 794 rq->fence.seqno, rq->context->guc_id.id, err); 795 } 796 797 skip: 798 if (rq) 799 i915_request_put(rq); 800 801 if (i915_reset_count(global) != reset_count) { 802 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 803 err = -EINVAL; 804 goto restore; 805 } 806 807 /* GuC based resets are not logged per engine */ 808 if (!using_guc) { 809 if (i915_reset_engine_count(global, engine) != 810 ++reset_engine_count) { 811 pr_err("%s engine reset not recorded!\n", 812 engine->name); 813 err = -EINVAL; 814 goto restore; 815 } 816 } 817 818 count++; 819 820 restore: 821 err2 = intel_selftest_restore_policy(engine, &saved); 822 if (err2) 823 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err); 824 if (err == 0) 825 err = err2; 826 if (err) 827 break; 828 } while (time_before(jiffies, end_time)); 829 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 830 st_engine_heartbeat_enable(engine); 831 pr_info("%s: Completed %lu %s resets\n", 832 engine->name, count, active ? "active" : "idle"); 833 834 if (err) 835 break; 836 837 err = igt_flush_test(gt->i915); 838 if (err) { 839 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 840 break; 841 } 842 } 843 844 if (intel_gt_is_wedged(gt)) { 845 pr_err("GT is wedged!\n"); 846 err = -EIO; 847 } 848 849 if (active) 850 hang_fini(&h); 851 852 return err; 853 } 854 855 static int igt_reset_idle_engine(void *arg) 856 { 857 return __igt_reset_engine(arg, false); 858 } 859 860 static int igt_reset_active_engine(void *arg) 861 { 862 return __igt_reset_engine(arg, true); 863 } 864 865 struct active_engine { 866 struct kthread_worker *worker; 867 struct kthread_work work; 868 struct intel_engine_cs *engine; 869 unsigned long resets; 870 unsigned int flags; 871 bool stop; 872 int result; 873 }; 874 875 #define TEST_ACTIVE BIT(0) 876 #define TEST_OTHERS BIT(1) 877 #define TEST_SELF BIT(2) 878 #define TEST_PRIORITY BIT(3) 879 880 static int active_request_put(struct i915_request *rq) 881 { 882 int err = 0; 883 884 if (!rq) 885 return 0; 886 887 if (i915_request_wait(rq, 0, 10 * HZ) < 0) { 888 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 889 rq->engine->name, 890 rq->fence.context, 891 rq->fence.seqno); 892 GEM_TRACE_DUMP(); 893 894 intel_gt_set_wedged(rq->engine->gt); 895 err = -EIO; 896 } 897 898 i915_request_put(rq); 899 900 return err; 901 } 902 903 static void active_engine(struct kthread_work *work) 904 { 905 I915_RND_STATE(prng); 906 struct active_engine *arg = container_of(work, typeof(*arg), work); 907 struct intel_engine_cs *engine = arg->engine; 908 struct i915_request *rq[8] = {}; 909 struct intel_context *ce[ARRAY_SIZE(rq)]; 910 unsigned long count; 911 int err = 0; 912 913 for (count = 0; count < ARRAY_SIZE(ce); count++) { 914 ce[count] = intel_context_create(engine); 915 if (IS_ERR(ce[count])) { 916 arg->result = PTR_ERR(ce[count]); 917 pr_err("[%s] Create context #%ld failed: %d!\n", 918 engine->name, count, arg->result); 919 while (--count) 920 intel_context_put(ce[count]); 921 return; 922 } 923 } 924 925 count = 0; 926 while (!READ_ONCE(arg->stop)) { 927 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 928 struct i915_request *old = rq[idx]; 929 struct i915_request *new; 930 931 new = intel_context_create_request(ce[idx]); 932 if (IS_ERR(new)) { 933 err = PTR_ERR(new); 934 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err); 935 break; 936 } 937 938 rq[idx] = i915_request_get(new); 939 i915_request_add(new); 940 941 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) { 942 struct i915_sched_attr attr = { 943 .priority = 944 i915_prandom_u32_max_state(512, &prng), 945 }; 946 engine->sched_engine->schedule(rq[idx], &attr); 947 } 948 949 err = active_request_put(old); 950 if (err) { 951 pr_err("[%s] Request put failed: %d!\n", engine->name, err); 952 break; 953 } 954 955 cond_resched(); 956 } 957 958 for (count = 0; count < ARRAY_SIZE(rq); count++) { 959 int err__ = active_request_put(rq[count]); 960 961 if (err) 962 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err); 963 964 /* Keep the first error */ 965 if (!err) 966 err = err__; 967 968 intel_context_put(ce[count]); 969 } 970 971 arg->result = err; 972 } 973 974 static int __igt_reset_engines(struct intel_gt *gt, 975 const char *test_name, 976 unsigned int flags) 977 { 978 struct i915_gpu_error *global = >->i915->gpu_error; 979 struct intel_engine_cs *engine, *other; 980 struct active_engine *threads; 981 enum intel_engine_id id, tmp; 982 struct hang h; 983 int err = 0; 984 985 /* Check that issuing a reset on one engine does not interfere 986 * with any other engine. 987 */ 988 989 if (!intel_has_reset_engine(gt)) 990 return 0; 991 992 if (flags & TEST_ACTIVE) { 993 err = hang_init(&h, gt); 994 if (err) 995 return err; 996 997 if (flags & TEST_PRIORITY) 998 h.ctx->sched.priority = 1024; 999 } 1000 1001 threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL); 1002 if (!threads) 1003 return -ENOMEM; 1004 1005 for_each_engine(engine, gt, id) { 1006 unsigned long device = i915_reset_count(global); 1007 unsigned long count = 0, reported; 1008 bool using_guc = intel_engine_uses_guc(engine); 1009 IGT_TIMEOUT(end_time); 1010 1011 if (flags & TEST_ACTIVE) { 1012 if (!intel_engine_can_store_dword(engine)) 1013 continue; 1014 } else if (using_guc) 1015 continue; 1016 1017 if (!wait_for_idle(engine)) { 1018 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 1019 engine->name, test_name); 1020 err = -EIO; 1021 break; 1022 } 1023 1024 memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES); 1025 for_each_engine(other, gt, tmp) { 1026 struct kthread_worker *worker; 1027 1028 threads[tmp].resets = 1029 i915_reset_engine_count(global, other); 1030 1031 if (other == engine && !(flags & TEST_SELF)) 1032 continue; 1033 1034 if (other != engine && !(flags & TEST_OTHERS)) 1035 continue; 1036 1037 threads[tmp].engine = other; 1038 threads[tmp].flags = flags; 1039 1040 worker = kthread_create_worker(0, "igt/%s", 1041 other->name); 1042 if (IS_ERR(worker)) { 1043 err = PTR_ERR(worker); 1044 pr_err("[%s] Worker create failed: %d!\n", 1045 engine->name, err); 1046 goto unwind; 1047 } 1048 1049 threads[tmp].worker = worker; 1050 1051 kthread_init_work(&threads[tmp].work, active_engine); 1052 kthread_queue_work(threads[tmp].worker, 1053 &threads[tmp].work); 1054 } 1055 1056 st_engine_heartbeat_disable_no_pm(engine); 1057 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 1058 >->reset.flags)); 1059 do { 1060 struct i915_request *rq = NULL; 1061 struct intel_selftest_saved_policy saved; 1062 int err2; 1063 1064 err = intel_selftest_modify_policy(engine, &saved, 1065 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 1066 if (err) { 1067 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1068 break; 1069 } 1070 1071 if (flags & TEST_ACTIVE) { 1072 rq = hang_create_request(&h, engine); 1073 if (IS_ERR(rq)) { 1074 err = PTR_ERR(rq); 1075 pr_err("[%s] Create hang request failed: %d!\n", 1076 engine->name, err); 1077 goto restore; 1078 } 1079 1080 i915_request_get(rq); 1081 i915_request_add(rq); 1082 1083 if (!wait_until_running(&h, rq)) { 1084 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1085 1086 pr_err("%s: Failed to start request %llx, at %x\n", 1087 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1088 intel_engine_dump(engine, &p, 1089 "%s\n", engine->name); 1090 1091 i915_request_put(rq); 1092 err = -EIO; 1093 goto restore; 1094 } 1095 } else { 1096 intel_engine_pm_get(engine); 1097 } 1098 1099 if (!using_guc) { 1100 err = intel_engine_reset(engine, NULL); 1101 if (err) { 1102 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 1103 engine->name, test_name, err); 1104 goto restore; 1105 } 1106 } 1107 1108 if (rq) { 1109 /* Ensure the reset happens and kills the engine */ 1110 err = intel_selftest_wait_for_rq(rq); 1111 if (err) 1112 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 1113 engine->name, rq->fence.context, 1114 rq->fence.seqno, rq->context->guc_id.id, err); 1115 } 1116 1117 count++; 1118 1119 if (rq) { 1120 if (rq->fence.error != -EIO) { 1121 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n", 1122 engine->name, test_name, 1123 rq->fence.context, 1124 rq->fence.seqno, rq->context->guc_id.id); 1125 i915_request_put(rq); 1126 1127 GEM_TRACE_DUMP(); 1128 intel_gt_set_wedged(gt); 1129 err = -EIO; 1130 goto restore; 1131 } 1132 1133 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1134 struct drm_printer p = 1135 drm_info_printer(gt->i915->drm.dev); 1136 1137 pr_err("i915_reset_engine(%s:%s):" 1138 " failed to complete request %llx:%lld after reset\n", 1139 engine->name, test_name, 1140 rq->fence.context, 1141 rq->fence.seqno); 1142 intel_engine_dump(engine, &p, 1143 "%s\n", engine->name); 1144 i915_request_put(rq); 1145 1146 GEM_TRACE_DUMP(); 1147 intel_gt_set_wedged(gt); 1148 err = -EIO; 1149 goto restore; 1150 } 1151 1152 i915_request_put(rq); 1153 } 1154 1155 if (!(flags & TEST_ACTIVE)) 1156 intel_engine_pm_put(engine); 1157 1158 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 1159 struct drm_printer p = 1160 drm_info_printer(gt->i915->drm.dev); 1161 1162 pr_err("i915_reset_engine(%s:%s):" 1163 " failed to idle after reset\n", 1164 engine->name, test_name); 1165 intel_engine_dump(engine, &p, 1166 "%s\n", engine->name); 1167 1168 err = -EIO; 1169 goto restore; 1170 } 1171 1172 restore: 1173 err2 = intel_selftest_restore_policy(engine, &saved); 1174 if (err2) 1175 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2); 1176 if (err == 0) 1177 err = err2; 1178 if (err) 1179 break; 1180 } while (time_before(jiffies, end_time)); 1181 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 1182 st_engine_heartbeat_enable_no_pm(engine); 1183 1184 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 1185 engine->name, test_name, count); 1186 1187 /* GuC based resets are not logged per engine */ 1188 if (!using_guc) { 1189 reported = i915_reset_engine_count(global, engine); 1190 reported -= threads[engine->id].resets; 1191 if (reported != count) { 1192 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 1193 engine->name, test_name, count, reported); 1194 if (!err) 1195 err = -EINVAL; 1196 } 1197 } 1198 1199 unwind: 1200 for_each_engine(other, gt, tmp) { 1201 int ret; 1202 1203 if (!threads[tmp].worker) 1204 continue; 1205 1206 WRITE_ONCE(threads[tmp].stop, true); 1207 kthread_flush_work(&threads[tmp].work); 1208 ret = READ_ONCE(threads[tmp].result); 1209 if (ret) { 1210 pr_err("kthread for other engine %s failed, err=%d\n", 1211 other->name, ret); 1212 if (!err) 1213 err = ret; 1214 } 1215 1216 kthread_destroy_worker(threads[tmp].worker); 1217 1218 /* GuC based resets are not logged per engine */ 1219 if (!using_guc) { 1220 if (other->uabi_class != engine->uabi_class && 1221 threads[tmp].resets != 1222 i915_reset_engine_count(global, other)) { 1223 pr_err("Innocent engine %s was reset (count=%ld)\n", 1224 other->name, 1225 i915_reset_engine_count(global, other) - 1226 threads[tmp].resets); 1227 if (!err) 1228 err = -EINVAL; 1229 } 1230 } 1231 } 1232 1233 if (device != i915_reset_count(global)) { 1234 pr_err("Global reset (count=%ld)!\n", 1235 i915_reset_count(global) - device); 1236 if (!err) 1237 err = -EINVAL; 1238 } 1239 1240 if (err) 1241 break; 1242 1243 err = igt_flush_test(gt->i915); 1244 if (err) { 1245 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1246 break; 1247 } 1248 } 1249 kfree(threads); 1250 1251 if (intel_gt_is_wedged(gt)) 1252 err = -EIO; 1253 1254 if (flags & TEST_ACTIVE) 1255 hang_fini(&h); 1256 1257 return err; 1258 } 1259 1260 static int igt_reset_engines(void *arg) 1261 { 1262 static const struct { 1263 const char *name; 1264 unsigned int flags; 1265 } phases[] = { 1266 { "idle", 0 }, 1267 { "active", TEST_ACTIVE }, 1268 { "others-idle", TEST_OTHERS }, 1269 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1270 { 1271 "others-priority", 1272 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1273 }, 1274 { 1275 "self-priority", 1276 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1277 }, 1278 { } 1279 }; 1280 struct intel_gt *gt = arg; 1281 typeof(*phases) *p; 1282 int err; 1283 1284 for (p = phases; p->name; p++) { 1285 if (p->flags & TEST_PRIORITY) { 1286 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1287 continue; 1288 } 1289 1290 err = __igt_reset_engines(arg, p->name, p->flags); 1291 if (err) 1292 return err; 1293 } 1294 1295 return 0; 1296 } 1297 1298 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1299 { 1300 u32 count = i915_reset_count(>->i915->gpu_error); 1301 1302 intel_gt_reset(gt, mask, NULL); 1303 1304 return count; 1305 } 1306 1307 static int igt_reset_wait(void *arg) 1308 { 1309 struct intel_gt *gt = arg; 1310 struct i915_gpu_error *global = >->i915->gpu_error; 1311 struct intel_engine_cs *engine; 1312 struct i915_request *rq; 1313 unsigned int reset_count; 1314 struct hang h; 1315 long timeout; 1316 int err; 1317 1318 engine = intel_selftest_find_any_engine(gt); 1319 1320 if (!engine || !intel_engine_can_store_dword(engine)) 1321 return 0; 1322 1323 /* Check that we detect a stuck waiter and issue a reset */ 1324 1325 igt_global_reset_lock(gt); 1326 1327 err = hang_init(&h, gt); 1328 if (err) { 1329 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1330 goto unlock; 1331 } 1332 1333 rq = hang_create_request(&h, engine); 1334 if (IS_ERR(rq)) { 1335 err = PTR_ERR(rq); 1336 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1337 goto fini; 1338 } 1339 1340 i915_request_get(rq); 1341 i915_request_add(rq); 1342 1343 if (!wait_until_running(&h, rq)) { 1344 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1345 1346 pr_err("%s: Failed to start request %llx, at %x\n", 1347 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1348 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1349 1350 intel_gt_set_wedged(gt); 1351 1352 err = -EIO; 1353 goto out_rq; 1354 } 1355 1356 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1357 1358 timeout = i915_request_wait(rq, 0, 10); 1359 if (timeout < 0) { 1360 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1361 timeout); 1362 err = timeout; 1363 goto out_rq; 1364 } 1365 1366 if (i915_reset_count(global) == reset_count) { 1367 pr_err("No GPU reset recorded!\n"); 1368 err = -EINVAL; 1369 goto out_rq; 1370 } 1371 1372 out_rq: 1373 i915_request_put(rq); 1374 fini: 1375 hang_fini(&h); 1376 unlock: 1377 igt_global_reset_unlock(gt); 1378 1379 if (intel_gt_is_wedged(gt)) 1380 return -EIO; 1381 1382 return err; 1383 } 1384 1385 struct evict_vma { 1386 struct completion completion; 1387 struct i915_vma *vma; 1388 }; 1389 1390 static int evict_vma(void *data) 1391 { 1392 struct evict_vma *arg = data; 1393 struct i915_address_space *vm = arg->vma->vm; 1394 struct drm_mm_node evict = arg->vma->node; 1395 int err; 1396 1397 complete(&arg->completion); 1398 1399 mutex_lock(&vm->mutex); 1400 err = i915_gem_evict_for_node(vm, NULL, &evict, 0); 1401 mutex_unlock(&vm->mutex); 1402 1403 return err; 1404 } 1405 1406 static int evict_fence(void *data) 1407 { 1408 struct evict_vma *arg = data; 1409 int err; 1410 1411 complete(&arg->completion); 1412 1413 /* Mark the fence register as dirty to force the mmio update. */ 1414 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1415 if (err) { 1416 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1417 return err; 1418 } 1419 1420 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1421 if (err) { 1422 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1423 return err; 1424 } 1425 1426 err = i915_vma_pin_fence(arg->vma); 1427 i915_vma_unpin(arg->vma); 1428 if (err) { 1429 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1430 return err; 1431 } 1432 1433 i915_vma_unpin_fence(arg->vma); 1434 1435 return 0; 1436 } 1437 1438 static int __igt_reset_evict_vma(struct intel_gt *gt, 1439 struct i915_address_space *vm, 1440 int (*fn)(void *), 1441 unsigned int flags) 1442 { 1443 struct intel_engine_cs *engine; 1444 struct drm_i915_gem_object *obj; 1445 struct task_struct *tsk = NULL; 1446 struct i915_request *rq; 1447 struct evict_vma arg; 1448 struct hang h; 1449 unsigned int pin_flags; 1450 int err; 1451 1452 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1453 return 0; 1454 1455 engine = intel_selftest_find_any_engine(gt); 1456 1457 if (!engine || !intel_engine_can_store_dword(engine)) 1458 return 0; 1459 1460 /* Check that we can recover an unbind stuck on a hanging request */ 1461 1462 err = hang_init(&h, gt); 1463 if (err) { 1464 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1465 return err; 1466 } 1467 1468 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1469 if (IS_ERR(obj)) { 1470 err = PTR_ERR(obj); 1471 pr_err("[%s] Create object failed: %d!\n", engine->name, err); 1472 goto fini; 1473 } 1474 1475 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1476 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1477 if (err) { 1478 pr_err("Invalid X-tiling settings; err:%d\n", err); 1479 goto out_obj; 1480 } 1481 } 1482 1483 arg.vma = i915_vma_instance(obj, vm, NULL); 1484 if (IS_ERR(arg.vma)) { 1485 err = PTR_ERR(arg.vma); 1486 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err); 1487 goto out_obj; 1488 } 1489 1490 rq = hang_create_request(&h, engine); 1491 if (IS_ERR(rq)) { 1492 err = PTR_ERR(rq); 1493 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1494 goto out_obj; 1495 } 1496 1497 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1498 1499 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1500 pin_flags |= PIN_MAPPABLE; 1501 1502 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1503 if (err) { 1504 i915_request_add(rq); 1505 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err); 1506 goto out_obj; 1507 } 1508 1509 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1510 err = i915_vma_pin_fence(arg.vma); 1511 if (err) { 1512 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1513 i915_vma_unpin(arg.vma); 1514 i915_request_add(rq); 1515 goto out_obj; 1516 } 1517 } 1518 1519 i915_vma_lock(arg.vma); 1520 err = i915_vma_move_to_active(arg.vma, rq, flags); 1521 if (err) 1522 pr_err("[%s] Move to active failed: %d!\n", engine->name, err); 1523 1524 i915_vma_unlock(arg.vma); 1525 1526 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1527 i915_vma_unpin_fence(arg.vma); 1528 i915_vma_unpin(arg.vma); 1529 1530 i915_request_get(rq); 1531 i915_request_add(rq); 1532 if (err) 1533 goto out_rq; 1534 1535 if (!wait_until_running(&h, rq)) { 1536 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1537 1538 pr_err("%s: Failed to start request %llx, at %x\n", 1539 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1540 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1541 1542 intel_gt_set_wedged(gt); 1543 goto out_reset; 1544 } 1545 1546 init_completion(&arg.completion); 1547 1548 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1549 if (IS_ERR(tsk)) { 1550 err = PTR_ERR(tsk); 1551 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1552 tsk = NULL; 1553 goto out_reset; 1554 } 1555 get_task_struct(tsk); 1556 1557 wait_for_completion(&arg.completion); 1558 1559 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1560 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1561 1562 pr_err("igt/evict_vma kthread did not wait\n"); 1563 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1564 1565 intel_gt_set_wedged(gt); 1566 goto out_reset; 1567 } 1568 1569 out_reset: 1570 igt_global_reset_lock(gt); 1571 fake_hangcheck(gt, rq->engine->mask); 1572 igt_global_reset_unlock(gt); 1573 1574 if (tsk) { 1575 struct intel_wedge_me w; 1576 1577 /* The reset, even indirectly, should take less than 10ms. */ 1578 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1579 err = kthread_stop(tsk); 1580 1581 put_task_struct(tsk); 1582 } 1583 1584 out_rq: 1585 i915_request_put(rq); 1586 out_obj: 1587 i915_gem_object_put(obj); 1588 fini: 1589 hang_fini(&h); 1590 if (intel_gt_is_wedged(gt)) 1591 return -EIO; 1592 1593 return err; 1594 } 1595 1596 static int igt_reset_evict_ggtt(void *arg) 1597 { 1598 struct intel_gt *gt = arg; 1599 1600 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1601 evict_vma, EXEC_OBJECT_WRITE); 1602 } 1603 1604 static int igt_reset_evict_ppgtt(void *arg) 1605 { 1606 struct intel_gt *gt = arg; 1607 struct i915_ppgtt *ppgtt; 1608 int err; 1609 1610 /* aliasing == global gtt locking, covered above */ 1611 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1612 return 0; 1613 1614 ppgtt = i915_ppgtt_create(gt, 0); 1615 if (IS_ERR(ppgtt)) 1616 return PTR_ERR(ppgtt); 1617 1618 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1619 evict_vma, EXEC_OBJECT_WRITE); 1620 i915_vm_put(&ppgtt->vm); 1621 1622 return err; 1623 } 1624 1625 static int igt_reset_evict_fence(void *arg) 1626 { 1627 struct intel_gt *gt = arg; 1628 1629 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1630 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1631 } 1632 1633 static int wait_for_others(struct intel_gt *gt, 1634 struct intel_engine_cs *exclude) 1635 { 1636 struct intel_engine_cs *engine; 1637 enum intel_engine_id id; 1638 1639 for_each_engine(engine, gt, id) { 1640 if (engine == exclude) 1641 continue; 1642 1643 if (!wait_for_idle(engine)) 1644 return -EIO; 1645 } 1646 1647 return 0; 1648 } 1649 1650 static int igt_reset_queue(void *arg) 1651 { 1652 struct intel_gt *gt = arg; 1653 struct i915_gpu_error *global = >->i915->gpu_error; 1654 struct intel_engine_cs *engine; 1655 enum intel_engine_id id; 1656 struct hang h; 1657 int err; 1658 1659 /* Check that we replay pending requests following a hang */ 1660 1661 igt_global_reset_lock(gt); 1662 1663 err = hang_init(&h, gt); 1664 if (err) 1665 goto unlock; 1666 1667 for_each_engine(engine, gt, id) { 1668 struct intel_selftest_saved_policy saved; 1669 struct i915_request *prev; 1670 IGT_TIMEOUT(end_time); 1671 unsigned int count; 1672 bool using_guc = intel_engine_uses_guc(engine); 1673 1674 if (!intel_engine_can_store_dword(engine)) 1675 continue; 1676 1677 if (using_guc) { 1678 err = intel_selftest_modify_policy(engine, &saved, 1679 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK); 1680 if (err) { 1681 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1682 goto fini; 1683 } 1684 } 1685 1686 prev = hang_create_request(&h, engine); 1687 if (IS_ERR(prev)) { 1688 err = PTR_ERR(prev); 1689 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err); 1690 goto restore; 1691 } 1692 1693 i915_request_get(prev); 1694 i915_request_add(prev); 1695 1696 count = 0; 1697 do { 1698 struct i915_request *rq; 1699 unsigned int reset_count; 1700 1701 rq = hang_create_request(&h, engine); 1702 if (IS_ERR(rq)) { 1703 err = PTR_ERR(rq); 1704 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1705 goto restore; 1706 } 1707 1708 i915_request_get(rq); 1709 i915_request_add(rq); 1710 1711 /* 1712 * XXX We don't handle resetting the kernel context 1713 * very well. If we trigger a device reset twice in 1714 * quick succession while the kernel context is 1715 * executing, we may end up skipping the breadcrumb. 1716 * This is really only a problem for the selftest as 1717 * normally there is a large interlude between resets 1718 * (hangcheck), or we focus on resetting just one 1719 * engine and so avoid repeatedly resetting innocents. 1720 */ 1721 err = wait_for_others(gt, engine); 1722 if (err) { 1723 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1724 __func__, engine->name); 1725 i915_request_put(rq); 1726 i915_request_put(prev); 1727 1728 GEM_TRACE_DUMP(); 1729 intel_gt_set_wedged(gt); 1730 goto restore; 1731 } 1732 1733 if (!wait_until_running(&h, prev)) { 1734 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1735 1736 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1737 __func__, engine->name, 1738 prev->fence.seqno, hws_seqno(&h, prev)); 1739 intel_engine_dump(engine, &p, 1740 "%s\n", engine->name); 1741 1742 i915_request_put(rq); 1743 i915_request_put(prev); 1744 1745 intel_gt_set_wedged(gt); 1746 1747 err = -EIO; 1748 goto restore; 1749 } 1750 1751 reset_count = fake_hangcheck(gt, BIT(id)); 1752 1753 if (prev->fence.error != -EIO) { 1754 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1755 prev->fence.error); 1756 i915_request_put(rq); 1757 i915_request_put(prev); 1758 err = -EINVAL; 1759 goto restore; 1760 } 1761 1762 if (rq->fence.error) { 1763 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1764 rq->fence.error); 1765 i915_request_put(rq); 1766 i915_request_put(prev); 1767 err = -EINVAL; 1768 goto restore; 1769 } 1770 1771 if (i915_reset_count(global) == reset_count) { 1772 pr_err("No GPU reset recorded!\n"); 1773 i915_request_put(rq); 1774 i915_request_put(prev); 1775 err = -EINVAL; 1776 goto restore; 1777 } 1778 1779 i915_request_put(prev); 1780 prev = rq; 1781 count++; 1782 } while (time_before(jiffies, end_time)); 1783 pr_info("%s: Completed %d queued resets\n", 1784 engine->name, count); 1785 1786 *h.batch = MI_BATCH_BUFFER_END; 1787 intel_gt_chipset_flush(engine->gt); 1788 1789 i915_request_put(prev); 1790 1791 restore: 1792 if (using_guc) { 1793 int err2 = intel_selftest_restore_policy(engine, &saved); 1794 1795 if (err2) 1796 pr_err("%s:%d> [%s] Restore policy failed: %d!\n", 1797 __func__, __LINE__, engine->name, err2); 1798 if (err == 0) 1799 err = err2; 1800 } 1801 if (err) 1802 goto fini; 1803 1804 err = igt_flush_test(gt->i915); 1805 if (err) { 1806 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1807 break; 1808 } 1809 } 1810 1811 fini: 1812 hang_fini(&h); 1813 unlock: 1814 igt_global_reset_unlock(gt); 1815 1816 if (intel_gt_is_wedged(gt)) 1817 return -EIO; 1818 1819 return err; 1820 } 1821 1822 static int igt_handle_error(void *arg) 1823 { 1824 struct intel_gt *gt = arg; 1825 struct i915_gpu_error *global = >->i915->gpu_error; 1826 struct intel_engine_cs *engine; 1827 struct hang h; 1828 struct i915_request *rq; 1829 struct i915_gpu_coredump *error; 1830 int err; 1831 1832 engine = intel_selftest_find_any_engine(gt); 1833 1834 /* Check that we can issue a global GPU and engine reset */ 1835 1836 if (!intel_has_reset_engine(gt)) 1837 return 0; 1838 1839 if (!engine || !intel_engine_can_store_dword(engine)) 1840 return 0; 1841 1842 err = hang_init(&h, gt); 1843 if (err) { 1844 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1845 return err; 1846 } 1847 1848 rq = hang_create_request(&h, engine); 1849 if (IS_ERR(rq)) { 1850 err = PTR_ERR(rq); 1851 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1852 goto err_fini; 1853 } 1854 1855 i915_request_get(rq); 1856 i915_request_add(rq); 1857 1858 if (!wait_until_running(&h, rq)) { 1859 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1860 1861 pr_err("%s: Failed to start request %llx, at %x\n", 1862 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1863 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1864 1865 intel_gt_set_wedged(gt); 1866 1867 err = -EIO; 1868 goto err_request; 1869 } 1870 1871 /* Temporarily disable error capture */ 1872 error = xchg(&global->first_error, (void *)-1); 1873 1874 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1875 1876 xchg(&global->first_error, error); 1877 1878 if (rq->fence.error != -EIO) { 1879 pr_err("Guilty request not identified!\n"); 1880 err = -EINVAL; 1881 goto err_request; 1882 } 1883 1884 err_request: 1885 i915_request_put(rq); 1886 err_fini: 1887 hang_fini(&h); 1888 return err; 1889 } 1890 1891 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1892 const struct igt_atomic_section *p, 1893 const char *mode) 1894 { 1895 struct tasklet_struct * const t = &engine->sched_engine->tasklet; 1896 int err; 1897 1898 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1899 engine->name, mode, p->name); 1900 1901 if (t->func) 1902 tasklet_disable(t); 1903 if (strcmp(p->name, "softirq")) 1904 local_bh_disable(); 1905 p->critical_section_begin(); 1906 1907 err = __intel_engine_reset_bh(engine, NULL); 1908 1909 p->critical_section_end(); 1910 if (strcmp(p->name, "softirq")) 1911 local_bh_enable(); 1912 if (t->func) { 1913 tasklet_enable(t); 1914 tasklet_hi_schedule(t); 1915 } 1916 1917 if (err) 1918 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1919 engine->name, mode, p->name); 1920 1921 return err; 1922 } 1923 1924 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1925 const struct igt_atomic_section *p) 1926 { 1927 struct i915_request *rq; 1928 struct hang h; 1929 int err; 1930 1931 err = __igt_atomic_reset_engine(engine, p, "idle"); 1932 if (err) 1933 return err; 1934 1935 err = hang_init(&h, engine->gt); 1936 if (err) { 1937 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1938 return err; 1939 } 1940 1941 rq = hang_create_request(&h, engine); 1942 if (IS_ERR(rq)) { 1943 err = PTR_ERR(rq); 1944 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1945 goto out; 1946 } 1947 1948 i915_request_get(rq); 1949 i915_request_add(rq); 1950 1951 if (wait_until_running(&h, rq)) { 1952 err = __igt_atomic_reset_engine(engine, p, "active"); 1953 } else { 1954 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1955 __func__, engine->name, 1956 rq->fence.seqno, hws_seqno(&h, rq)); 1957 intel_gt_set_wedged(engine->gt); 1958 err = -EIO; 1959 } 1960 1961 if (err == 0) { 1962 struct intel_wedge_me w; 1963 1964 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1965 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1966 if (intel_gt_is_wedged(engine->gt)) 1967 err = -EIO; 1968 } 1969 1970 i915_request_put(rq); 1971 out: 1972 hang_fini(&h); 1973 return err; 1974 } 1975 1976 static int igt_reset_engines_atomic(void *arg) 1977 { 1978 struct intel_gt *gt = arg; 1979 const typeof(*igt_atomic_phases) *p; 1980 int err = 0; 1981 1982 /* Check that the engines resets are usable from atomic context */ 1983 1984 if (!intel_has_reset_engine(gt)) 1985 return 0; 1986 1987 if (intel_uc_uses_guc_submission(>->uc)) 1988 return 0; 1989 1990 igt_global_reset_lock(gt); 1991 1992 /* Flush any requests before we get started and check basics */ 1993 if (!igt_force_reset(gt)) 1994 goto unlock; 1995 1996 for (p = igt_atomic_phases; p->name; p++) { 1997 struct intel_engine_cs *engine; 1998 enum intel_engine_id id; 1999 2000 for_each_engine(engine, gt, id) { 2001 err = igt_atomic_reset_engine(engine, p); 2002 if (err) 2003 goto out; 2004 } 2005 } 2006 2007 out: 2008 /* As we poke around the guts, do a full reset before continuing. */ 2009 igt_force_reset(gt); 2010 unlock: 2011 igt_global_reset_unlock(gt); 2012 2013 return err; 2014 } 2015 2016 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 2017 { 2018 static const struct i915_subtest tests[] = { 2019 SUBTEST(igt_hang_sanitycheck), 2020 SUBTEST(igt_reset_nop), 2021 SUBTEST(igt_reset_nop_engine), 2022 SUBTEST(igt_reset_idle_engine), 2023 SUBTEST(igt_reset_active_engine), 2024 SUBTEST(igt_reset_fail_engine), 2025 SUBTEST(igt_reset_engines), 2026 SUBTEST(igt_reset_engines_atomic), 2027 SUBTEST(igt_reset_queue), 2028 SUBTEST(igt_reset_wait), 2029 SUBTEST(igt_reset_evict_ggtt), 2030 SUBTEST(igt_reset_evict_ppgtt), 2031 SUBTEST(igt_reset_evict_fence), 2032 SUBTEST(igt_handle_error), 2033 }; 2034 struct intel_gt *gt = to_gt(i915); 2035 intel_wakeref_t wakeref; 2036 int err; 2037 2038 if (!intel_has_gpu_reset(gt)) 2039 return 0; 2040 2041 if (intel_gt_is_wedged(gt)) 2042 return -EIO; /* we're long past hope of a successful reset */ 2043 2044 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 2045 2046 err = intel_gt_live_subtests(tests, gt); 2047 2048 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 2049 2050 return err; 2051 } 2052