// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

/*
 * State shared by the hang-injection helpers below: a batch object that
 * spins forever (until rewritten to MI_BATCH_BUFFER_END) and a HWS page
 * the batch writes its seqno into, so the CPU can observe when the batch
 * has actually started executing on the GPU.
 */
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;	/* seqno page written by the batch */
	struct drm_i915_gem_object *obj;	/* batch buffer backing object */
	struct i915_gem_context *ctx;		/* non-bannable kernel context */
	u32 *seqno;				/* CPU mapping of hws */
	u32 *batch;				/* CPU mapping of obj */
};

/*
 * Allocate and map the objects used to build hanging batches.
 * On success all members of @h are valid; on error everything acquired
 * so far is released via the goto-unwind chain and a negative errno is
 * returned.
 */
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	/* The hanging request must not get the context banned */
	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	/* CPU polls the seqno page, so keep it coherent with the GPU */
	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	/* 0xff != any live seqno, so "not yet started" is unambiguous */
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 intel_gt_coherent_map_type(gt, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

/*
 * GPU address of the per-request seqno slot inside the HWS page;
 * slots are indexed by fence context, folded into one page.
 */
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return i915_vma_offset(hws) +
	       offset_in_page(sizeof(u32) * rq->fence.context);
}

/*
 * Build and submit a request whose batch writes its seqno to the HWS
 * page and then spins forever (the tail MI_BATCH_BUFFER_END is only
 * reached once the batch is rewritten). A fresh batch object replaces
 * h->obj on every call so the previous hang can still be terminated.
 */
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, intel_gt_coherent_map_type(gt, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	/* Swap in the new batch object; the old one is released here */
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = igt_vma_move_to_active_unlocked(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = igt_vma_move_to_active_unlocked(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * Emit the per-generation hanging batch:
	 *   1. store the request seqno into the HWS slot (start marker),
	 *   2. pad with zeroed dwords,
	 *   3. MI_BATCH_BUFFER_START back to the batch start — an
	 *      infinite loop until the batch is rewritten.
	 */
	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
		*batch++ = upper_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	/* Old gens need a privileged batch to allow the GGTT/jump */
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, i915_vma_offset(vma), PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		/* Submit the request anyway so its fence is signalled */
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

/* Read back the seqno the hanging batch wrote for @rq (0xff… if not started) */
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

/*
 * Terminate any still-spinning batch by rewriting its first dword, then
 * release all hang state created by hang_init().
 */
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

/*
 * Poll the HWS slot until the hanging batch reports it is executing:
 * a short busy-wait first, then a sleeping wait up to 1s.
 */
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

i915_request_get(rq); 315 316 *h.batch = MI_BATCH_BUFFER_END; 317 intel_gt_chipset_flush(engine->gt); 318 319 i915_request_add(rq); 320 321 timeout = 0; 322 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 323 timeout = i915_request_wait(rq, 0, 324 MAX_SCHEDULE_TIMEOUT); 325 if (intel_gt_is_wedged(gt)) 326 timeout = -EIO; 327 328 i915_request_put(rq); 329 330 if (timeout < 0) { 331 err = timeout; 332 pr_err("Wait for request failed on %s, err=%d\n", 333 engine->name, err); 334 goto fini; 335 } 336 } 337 338 fini: 339 hang_fini(&h); 340 return err; 341 } 342 343 static bool wait_for_idle(struct intel_engine_cs *engine) 344 { 345 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 346 } 347 348 static int igt_reset_nop(void *arg) 349 { 350 struct intel_gt *gt = arg; 351 struct i915_gpu_error *global = >->i915->gpu_error; 352 struct intel_engine_cs *engine; 353 unsigned int reset_count, count; 354 enum intel_engine_id id; 355 IGT_TIMEOUT(end_time); 356 int err = 0; 357 358 /* Check that we can reset during non-user portions of requests */ 359 360 reset_count = i915_reset_count(global); 361 count = 0; 362 do { 363 for_each_engine(engine, gt, id) { 364 struct intel_context *ce; 365 int i; 366 367 ce = intel_context_create(engine); 368 if (IS_ERR(ce)) { 369 err = PTR_ERR(ce); 370 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 371 break; 372 } 373 374 for (i = 0; i < 16; i++) { 375 struct i915_request *rq; 376 377 rq = intel_context_create_request(ce); 378 if (IS_ERR(rq)) { 379 err = PTR_ERR(rq); 380 pr_err("[%s] Create request failed: %d!\n", 381 engine->name, err); 382 break; 383 } 384 385 i915_request_add(rq); 386 } 387 388 intel_context_put(ce); 389 } 390 391 igt_global_reset_lock(gt); 392 intel_gt_reset(gt, ALL_ENGINES, NULL); 393 igt_global_reset_unlock(gt); 394 395 if (intel_gt_is_wedged(gt)) { 396 pr_err("[%s] GT is wedged!\n", engine->name); 397 err = -EIO; 398 break; 399 } 400 401 if (i915_reset_count(global) != reset_count 
+ ++count) { 402 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n", 403 engine->name, i915_reset_count(global), reset_count, count); 404 err = -EINVAL; 405 break; 406 } 407 408 err = igt_flush_test(gt->i915); 409 if (err) { 410 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 411 break; 412 } 413 } while (time_before(jiffies, end_time)); 414 pr_info("%s: %d resets\n", __func__, count); 415 416 if (igt_flush_test(gt->i915)) { 417 pr_err("Post flush failed: %d!\n", err); 418 err = -EIO; 419 } 420 421 return err; 422 } 423 424 static int igt_reset_nop_engine(void *arg) 425 { 426 struct intel_gt *gt = arg; 427 struct i915_gpu_error *global = >->i915->gpu_error; 428 struct intel_engine_cs *engine; 429 enum intel_engine_id id; 430 431 /* Check that we can engine-reset during non-user portions */ 432 433 if (!intel_has_reset_engine(gt)) 434 return 0; 435 436 for_each_engine(engine, gt, id) { 437 unsigned int reset_count, reset_engine_count, count; 438 struct intel_context *ce; 439 IGT_TIMEOUT(end_time); 440 int err; 441 442 if (intel_engine_uses_guc(engine)) { 443 /* Engine level resets are triggered by GuC when a hang 444 * is detected. They can't be triggered by the KMD any 445 * more. 
			 * Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		/* Keep the heartbeat from triggering a competing reset */
		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			/* Only the engine reset counter may advance */
			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

/* Make the next engine reset deterministically time out (selftest hook) */
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

/* Undo force_reset_timeout() */
static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

/*
 * Inject engine-reset timeouts (every other iteration) and check that
 * the driver recovers: a subsequent reset succeeds and outstanding
 * requests still complete.
 */
static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				/* Only keep a reference on the newest request */
				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				/* Odd iterations: a normal reset must succeed */
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				/* Even iterations: the reset must time out */
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

/*
 * Core of igt_reset_{idle,active}_engine: issue engine resets in a loop,
 * optionally (@active) with a hanging request executing, and verify that
 * only engine resets (no full GT reset) are recorded.
 */
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if
(!wait_until_running(&h, rq)) { 754 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 755 756 pr_err("%s: Failed to start request %llx, at %x\n", 757 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 758 intel_engine_dump(engine, &p, 759 "%s\n", engine->name); 760 761 i915_request_put(rq); 762 err = -EIO; 763 goto restore; 764 } 765 } 766 767 if (!using_guc) { 768 err = intel_engine_reset(engine, NULL); 769 if (err) { 770 pr_err("intel_engine_reset(%s) failed, err:%d\n", 771 engine->name, err); 772 goto skip; 773 } 774 } 775 776 if (rq) { 777 /* Ensure the reset happens and kills the engine */ 778 err = intel_selftest_wait_for_rq(rq); 779 if (err) 780 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 781 engine->name, rq->fence.context, 782 rq->fence.seqno, rq->context->guc_id.id, err); 783 } 784 785 skip: 786 if (rq) 787 i915_request_put(rq); 788 789 if (i915_reset_count(global) != reset_count) { 790 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 791 err = -EINVAL; 792 goto restore; 793 } 794 795 /* GuC based resets are not logged per engine */ 796 if (!using_guc) { 797 if (i915_reset_engine_count(global, engine) != 798 ++reset_engine_count) { 799 pr_err("%s engine reset not recorded!\n", 800 engine->name); 801 err = -EINVAL; 802 goto restore; 803 } 804 } 805 806 count++; 807 808 restore: 809 err2 = intel_selftest_restore_policy(engine, &saved); 810 if (err2) 811 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err); 812 if (err == 0) 813 err = err2; 814 if (err) 815 break; 816 } while (time_before(jiffies, end_time)); 817 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 818 st_engine_heartbeat_enable(engine); 819 pr_info("%s: Completed %lu %s resets\n", 820 engine->name, count, active ? 
"active" : "idle"); 821 822 if (err) 823 break; 824 825 err = igt_flush_test(gt->i915); 826 if (err) { 827 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 828 break; 829 } 830 } 831 832 if (intel_gt_is_wedged(gt)) { 833 pr_err("GT is wedged!\n"); 834 err = -EIO; 835 } 836 837 if (active) 838 hang_fini(&h); 839 840 return err; 841 } 842 843 static int igt_reset_idle_engine(void *arg) 844 { 845 return __igt_reset_engine(arg, false); 846 } 847 848 static int igt_reset_active_engine(void *arg) 849 { 850 return __igt_reset_engine(arg, true); 851 } 852 853 struct active_engine { 854 struct kthread_worker *worker; 855 struct kthread_work work; 856 struct intel_engine_cs *engine; 857 unsigned long resets; 858 unsigned int flags; 859 bool stop; 860 int result; 861 }; 862 863 #define TEST_ACTIVE BIT(0) 864 #define TEST_OTHERS BIT(1) 865 #define TEST_SELF BIT(2) 866 #define TEST_PRIORITY BIT(3) 867 868 static int active_request_put(struct i915_request *rq) 869 { 870 int err = 0; 871 872 if (!rq) 873 return 0; 874 875 if (i915_request_wait(rq, 0, 10 * HZ) < 0) { 876 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 877 rq->engine->name, 878 rq->fence.context, 879 rq->fence.seqno); 880 GEM_TRACE_DUMP(); 881 882 intel_gt_set_wedged(rq->engine->gt); 883 err = -EIO; 884 } 885 886 i915_request_put(rq); 887 888 return err; 889 } 890 891 static void active_engine(struct kthread_work *work) 892 { 893 I915_RND_STATE(prng); 894 struct active_engine *arg = container_of(work, typeof(*arg), work); 895 struct intel_engine_cs *engine = arg->engine; 896 struct i915_request *rq[8] = {}; 897 struct intel_context *ce[ARRAY_SIZE(rq)]; 898 unsigned long count; 899 int err = 0; 900 901 for (count = 0; count < ARRAY_SIZE(ce); count++) { 902 ce[count] = intel_context_create(engine); 903 if (IS_ERR(ce[count])) { 904 arg->result = PTR_ERR(ce[count]); 905 pr_err("[%s] Create context #%ld failed: %d!\n", 906 engine->name, count, arg->result); 907 while (--count) 908 
intel_context_put(ce[count]); 909 return; 910 } 911 } 912 913 count = 0; 914 while (!READ_ONCE(arg->stop)) { 915 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 916 struct i915_request *old = rq[idx]; 917 struct i915_request *new; 918 919 new = intel_context_create_request(ce[idx]); 920 if (IS_ERR(new)) { 921 err = PTR_ERR(new); 922 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err); 923 break; 924 } 925 926 rq[idx] = i915_request_get(new); 927 i915_request_add(new); 928 929 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) { 930 struct i915_sched_attr attr = { 931 .priority = 932 i915_prandom_u32_max_state(512, &prng), 933 }; 934 engine->sched_engine->schedule(rq[idx], &attr); 935 } 936 937 err = active_request_put(old); 938 if (err) { 939 pr_err("[%s] Request put failed: %d!\n", engine->name, err); 940 break; 941 } 942 943 cond_resched(); 944 } 945 946 for (count = 0; count < ARRAY_SIZE(rq); count++) { 947 int err__ = active_request_put(rq[count]); 948 949 if (err) 950 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err); 951 952 /* Keep the first error */ 953 if (!err) 954 err = err__; 955 956 intel_context_put(ce[count]); 957 } 958 959 arg->result = err; 960 } 961 962 static int __igt_reset_engines(struct intel_gt *gt, 963 const char *test_name, 964 unsigned int flags) 965 { 966 struct i915_gpu_error *global = >->i915->gpu_error; 967 struct intel_engine_cs *engine, *other; 968 struct active_engine *threads; 969 enum intel_engine_id id, tmp; 970 struct hang h; 971 int err = 0; 972 973 /* Check that issuing a reset on one engine does not interfere 974 * with any other engine. 
975 */ 976 977 if (!intel_has_reset_engine(gt)) 978 return 0; 979 980 if (flags & TEST_ACTIVE) { 981 err = hang_init(&h, gt); 982 if (err) 983 return err; 984 985 if (flags & TEST_PRIORITY) 986 h.ctx->sched.priority = 1024; 987 } 988 989 threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL); 990 if (!threads) 991 return -ENOMEM; 992 993 for_each_engine(engine, gt, id) { 994 unsigned long device = i915_reset_count(global); 995 unsigned long count = 0, reported; 996 bool using_guc = intel_engine_uses_guc(engine); 997 IGT_TIMEOUT(end_time); 998 999 if (flags & TEST_ACTIVE) { 1000 if (!intel_engine_can_store_dword(engine)) 1001 continue; 1002 } else if (using_guc) 1003 continue; 1004 1005 if (!wait_for_idle(engine)) { 1006 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 1007 engine->name, test_name); 1008 err = -EIO; 1009 break; 1010 } 1011 1012 memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES); 1013 for_each_engine(other, gt, tmp) { 1014 struct kthread_worker *worker; 1015 1016 threads[tmp].resets = 1017 i915_reset_engine_count(global, other); 1018 1019 if (other == engine && !(flags & TEST_SELF)) 1020 continue; 1021 1022 if (other != engine && !(flags & TEST_OTHERS)) 1023 continue; 1024 1025 threads[tmp].engine = other; 1026 threads[tmp].flags = flags; 1027 1028 worker = kthread_create_worker(0, "igt/%s", 1029 other->name); 1030 if (IS_ERR(worker)) { 1031 err = PTR_ERR(worker); 1032 pr_err("[%s] Worker create failed: %d!\n", 1033 engine->name, err); 1034 goto unwind; 1035 } 1036 1037 threads[tmp].worker = worker; 1038 1039 kthread_init_work(&threads[tmp].work, active_engine); 1040 kthread_queue_work(threads[tmp].worker, 1041 &threads[tmp].work); 1042 } 1043 1044 st_engine_heartbeat_disable_no_pm(engine); 1045 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 1046 >->reset.flags)); 1047 do { 1048 struct i915_request *rq = NULL; 1049 struct intel_selftest_saved_policy saved; 1050 int err2; 1051 1052 err = 
intel_selftest_modify_policy(engine, &saved,
						     SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				/* Idle phase: keep the engine awake across the reset */
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				/* The hanging request must have been killed with -EIO */
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		/* Stop and reap the background workers, keeping the first error */
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].worker)
				continue;

			WRITE_ONCE(threads[tmp].stop, true);
			kthread_flush_work(&threads[tmp].work);
			ret = READ_ONCE(threads[tmp].result);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}

			kthread_destroy_worker(threads[tmp].worker);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		/* No engine reset may escalate to a device (GT) reset */
		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}
	kfree(threads);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

/* Run __igt_reset_engines() over every supported flag combination */
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			/* Priority twiddling needs a scheduler */
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

/*
 * Trigger a GT reset as if hangcheck had fired; returns the reset count
 * sampled *before* the reset so callers can check it advanced.
 */
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

/*
 * Check that a waiter stuck on a hanging request is released by a GPU
 * reset, and that the reset is recorded.
 */
static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	/* The reset must release the waiter well within this short wait */
	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;
1370 return err; 1371 } 1372 1373 struct evict_vma { 1374 struct completion completion; 1375 struct i915_vma *vma; 1376 }; 1377 1378 static int evict_vma(void *data) 1379 { 1380 struct evict_vma *arg = data; 1381 struct i915_address_space *vm = arg->vma->vm; 1382 struct drm_mm_node evict = arg->vma->node; 1383 int err; 1384 1385 complete(&arg->completion); 1386 1387 mutex_lock(&vm->mutex); 1388 err = i915_gem_evict_for_node(vm, NULL, &evict, 0); 1389 mutex_unlock(&vm->mutex); 1390 1391 return err; 1392 } 1393 1394 static int evict_fence(void *data) 1395 { 1396 struct evict_vma *arg = data; 1397 int err; 1398 1399 complete(&arg->completion); 1400 1401 /* Mark the fence register as dirty to force the mmio update. */ 1402 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1403 if (err) { 1404 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1405 return err; 1406 } 1407 1408 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1409 if (err) { 1410 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1411 return err; 1412 } 1413 1414 err = i915_vma_pin_fence(arg->vma); 1415 i915_vma_unpin(arg->vma); 1416 if (err) { 1417 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1418 return err; 1419 } 1420 1421 i915_vma_unpin_fence(arg->vma); 1422 1423 return 0; 1424 } 1425 1426 static int __igt_reset_evict_vma(struct intel_gt *gt, 1427 struct i915_address_space *vm, 1428 int (*fn)(void *), 1429 unsigned int flags) 1430 { 1431 struct intel_engine_cs *engine; 1432 struct drm_i915_gem_object *obj; 1433 struct task_struct *tsk = NULL; 1434 struct i915_request *rq; 1435 struct evict_vma arg; 1436 struct hang h; 1437 unsigned int pin_flags; 1438 int err; 1439 1440 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1441 return 0; 1442 1443 engine = intel_selftest_find_any_engine(gt); 1444 1445 if (!engine || !intel_engine_can_store_dword(engine)) 1446 return 0; 1447 1448 /* Check that we can recover an unbind stuck on a hanging 
request */ 1449 1450 err = hang_init(&h, gt); 1451 if (err) { 1452 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1453 return err; 1454 } 1455 1456 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1457 if (IS_ERR(obj)) { 1458 err = PTR_ERR(obj); 1459 pr_err("[%s] Create object failed: %d!\n", engine->name, err); 1460 goto fini; 1461 } 1462 1463 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1464 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1465 if (err) { 1466 pr_err("Invalid X-tiling settings; err:%d\n", err); 1467 goto out_obj; 1468 } 1469 } 1470 1471 arg.vma = i915_vma_instance(obj, vm, NULL); 1472 if (IS_ERR(arg.vma)) { 1473 err = PTR_ERR(arg.vma); 1474 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err); 1475 goto out_obj; 1476 } 1477 1478 rq = hang_create_request(&h, engine); 1479 if (IS_ERR(rq)) { 1480 err = PTR_ERR(rq); 1481 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1482 goto out_obj; 1483 } 1484 1485 pin_flags = i915_vma_is_ggtt(arg.vma) ? 
PIN_GLOBAL : PIN_USER; 1486 1487 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1488 pin_flags |= PIN_MAPPABLE; 1489 1490 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1491 if (err) { 1492 i915_request_add(rq); 1493 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err); 1494 goto out_obj; 1495 } 1496 1497 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1498 err = i915_vma_pin_fence(arg.vma); 1499 if (err) { 1500 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1501 i915_vma_unpin(arg.vma); 1502 i915_request_add(rq); 1503 goto out_obj; 1504 } 1505 } 1506 1507 err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags); 1508 if (err) 1509 pr_err("[%s] Move to active failed: %d!\n", engine->name, err); 1510 1511 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1512 i915_vma_unpin_fence(arg.vma); 1513 i915_vma_unpin(arg.vma); 1514 1515 i915_request_get(rq); 1516 i915_request_add(rq); 1517 if (err) 1518 goto out_rq; 1519 1520 if (!wait_until_running(&h, rq)) { 1521 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1522 1523 pr_err("%s: Failed to start request %llx, at %x\n", 1524 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1525 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1526 1527 intel_gt_set_wedged(gt); 1528 goto out_reset; 1529 } 1530 1531 init_completion(&arg.completion); 1532 1533 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1534 if (IS_ERR(tsk)) { 1535 err = PTR_ERR(tsk); 1536 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1537 tsk = NULL; 1538 goto out_reset; 1539 } 1540 get_task_struct(tsk); 1541 1542 wait_for_completion(&arg.completion); 1543 1544 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1545 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1546 1547 pr_err("igt/evict_vma kthread did not wait\n"); 1548 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1549 1550 intel_gt_set_wedged(gt); 1551 goto out_reset; 1552 } 1553 1554 out_reset: 1555 igt_global_reset_lock(gt); 1556 fake_hangcheck(gt, rq->engine->mask); 
1557 igt_global_reset_unlock(gt); 1558 1559 if (tsk) { 1560 struct intel_wedge_me w; 1561 1562 /* The reset, even indirectly, should take less than 10ms. */ 1563 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1564 err = kthread_stop(tsk); 1565 1566 put_task_struct(tsk); 1567 } 1568 1569 out_rq: 1570 i915_request_put(rq); 1571 out_obj: 1572 i915_gem_object_put(obj); 1573 fini: 1574 hang_fini(&h); 1575 if (intel_gt_is_wedged(gt)) 1576 return -EIO; 1577 1578 return err; 1579 } 1580 1581 static int igt_reset_evict_ggtt(void *arg) 1582 { 1583 struct intel_gt *gt = arg; 1584 1585 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1586 evict_vma, EXEC_OBJECT_WRITE); 1587 } 1588 1589 static int igt_reset_evict_ppgtt(void *arg) 1590 { 1591 struct intel_gt *gt = arg; 1592 struct i915_ppgtt *ppgtt; 1593 int err; 1594 1595 /* aliasing == global gtt locking, covered above */ 1596 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1597 return 0; 1598 1599 ppgtt = i915_ppgtt_create(gt, 0); 1600 if (IS_ERR(ppgtt)) 1601 return PTR_ERR(ppgtt); 1602 1603 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1604 evict_vma, EXEC_OBJECT_WRITE); 1605 i915_vm_put(&ppgtt->vm); 1606 1607 return err; 1608 } 1609 1610 static int igt_reset_evict_fence(void *arg) 1611 { 1612 struct intel_gt *gt = arg; 1613 1614 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1615 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1616 } 1617 1618 static int wait_for_others(struct intel_gt *gt, 1619 struct intel_engine_cs *exclude) 1620 { 1621 struct intel_engine_cs *engine; 1622 enum intel_engine_id id; 1623 1624 for_each_engine(engine, gt, id) { 1625 if (engine == exclude) 1626 continue; 1627 1628 if (!wait_for_idle(engine)) 1629 return -EIO; 1630 } 1631 1632 return 0; 1633 } 1634 1635 static int igt_reset_queue(void *arg) 1636 { 1637 struct intel_gt *gt = arg; 1638 struct i915_gpu_error *global = >->i915->gpu_error; 1639 struct intel_engine_cs *engine; 1640 enum intel_engine_id id; 1641 struct hang h; 1642 int err; 1643 1644 /* Check 
that we replay pending requests following a hang */ 1645 1646 igt_global_reset_lock(gt); 1647 1648 err = hang_init(&h, gt); 1649 if (err) 1650 goto unlock; 1651 1652 for_each_engine(engine, gt, id) { 1653 struct intel_selftest_saved_policy saved; 1654 struct i915_request *prev; 1655 IGT_TIMEOUT(end_time); 1656 unsigned int count; 1657 bool using_guc = intel_engine_uses_guc(engine); 1658 1659 if (!intel_engine_can_store_dword(engine)) 1660 continue; 1661 1662 if (using_guc) { 1663 err = intel_selftest_modify_policy(engine, &saved, 1664 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK); 1665 if (err) { 1666 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1667 goto fini; 1668 } 1669 } 1670 1671 prev = hang_create_request(&h, engine); 1672 if (IS_ERR(prev)) { 1673 err = PTR_ERR(prev); 1674 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err); 1675 goto restore; 1676 } 1677 1678 i915_request_get(prev); 1679 i915_request_add(prev); 1680 1681 count = 0; 1682 do { 1683 struct i915_request *rq; 1684 unsigned int reset_count; 1685 1686 rq = hang_create_request(&h, engine); 1687 if (IS_ERR(rq)) { 1688 err = PTR_ERR(rq); 1689 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1690 goto restore; 1691 } 1692 1693 i915_request_get(rq); 1694 i915_request_add(rq); 1695 1696 /* 1697 * XXX We don't handle resetting the kernel context 1698 * very well. If we trigger a device reset twice in 1699 * quick succession while the kernel context is 1700 * executing, we may end up skipping the breadcrumb. 1701 * This is really only a problem for the selftest as 1702 * normally there is a large interlude between resets 1703 * (hangcheck), or we focus on resetting just one 1704 * engine and so avoid repeatedly resetting innocents. 
1705 */ 1706 err = wait_for_others(gt, engine); 1707 if (err) { 1708 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1709 __func__, engine->name); 1710 i915_request_put(rq); 1711 i915_request_put(prev); 1712 1713 GEM_TRACE_DUMP(); 1714 intel_gt_set_wedged(gt); 1715 goto restore; 1716 } 1717 1718 if (!wait_until_running(&h, prev)) { 1719 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1720 1721 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1722 __func__, engine->name, 1723 prev->fence.seqno, hws_seqno(&h, prev)); 1724 intel_engine_dump(engine, &p, 1725 "%s\n", engine->name); 1726 1727 i915_request_put(rq); 1728 i915_request_put(prev); 1729 1730 intel_gt_set_wedged(gt); 1731 1732 err = -EIO; 1733 goto restore; 1734 } 1735 1736 reset_count = fake_hangcheck(gt, BIT(id)); 1737 1738 if (prev->fence.error != -EIO) { 1739 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1740 prev->fence.error); 1741 i915_request_put(rq); 1742 i915_request_put(prev); 1743 err = -EINVAL; 1744 goto restore; 1745 } 1746 1747 if (rq->fence.error) { 1748 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1749 rq->fence.error); 1750 i915_request_put(rq); 1751 i915_request_put(prev); 1752 err = -EINVAL; 1753 goto restore; 1754 } 1755 1756 if (i915_reset_count(global) == reset_count) { 1757 pr_err("No GPU reset recorded!\n"); 1758 i915_request_put(rq); 1759 i915_request_put(prev); 1760 err = -EINVAL; 1761 goto restore; 1762 } 1763 1764 i915_request_put(prev); 1765 prev = rq; 1766 count++; 1767 } while (time_before(jiffies, end_time)); 1768 pr_info("%s: Completed %d queued resets\n", 1769 engine->name, count); 1770 1771 *h.batch = MI_BATCH_BUFFER_END; 1772 intel_gt_chipset_flush(engine->gt); 1773 1774 i915_request_put(prev); 1775 1776 restore: 1777 if (using_guc) { 1778 int err2 = intel_selftest_restore_policy(engine, &saved); 1779 1780 if (err2) 1781 pr_err("%s:%d> [%s] Restore policy failed: %d!\n", 1782 
__func__, __LINE__, engine->name, err2); 1783 if (err == 0) 1784 err = err2; 1785 } 1786 if (err) 1787 goto fini; 1788 1789 err = igt_flush_test(gt->i915); 1790 if (err) { 1791 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1792 break; 1793 } 1794 } 1795 1796 fini: 1797 hang_fini(&h); 1798 unlock: 1799 igt_global_reset_unlock(gt); 1800 1801 if (intel_gt_is_wedged(gt)) 1802 return -EIO; 1803 1804 return err; 1805 } 1806 1807 static int igt_handle_error(void *arg) 1808 { 1809 struct intel_gt *gt = arg; 1810 struct i915_gpu_error *global = >->i915->gpu_error; 1811 struct intel_engine_cs *engine; 1812 struct hang h; 1813 struct i915_request *rq; 1814 struct i915_gpu_coredump *error; 1815 int err; 1816 1817 engine = intel_selftest_find_any_engine(gt); 1818 1819 /* Check that we can issue a global GPU and engine reset */ 1820 1821 if (!intel_has_reset_engine(gt)) 1822 return 0; 1823 1824 if (!engine || !intel_engine_can_store_dword(engine)) 1825 return 0; 1826 1827 err = hang_init(&h, gt); 1828 if (err) { 1829 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1830 return err; 1831 } 1832 1833 rq = hang_create_request(&h, engine); 1834 if (IS_ERR(rq)) { 1835 err = PTR_ERR(rq); 1836 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1837 goto err_fini; 1838 } 1839 1840 i915_request_get(rq); 1841 i915_request_add(rq); 1842 1843 if (!wait_until_running(&h, rq)) { 1844 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1845 1846 pr_err("%s: Failed to start request %llx, at %x\n", 1847 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1848 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1849 1850 intel_gt_set_wedged(gt); 1851 1852 err = -EIO; 1853 goto err_request; 1854 } 1855 1856 /* Temporarily disable error capture */ 1857 error = xchg(&global->first_error, (void *)-1); 1858 1859 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1860 1861 xchg(&global->first_error, error); 1862 1863 if (rq->fence.error != -EIO) { 1864 
pr_err("Guilty request not identified!\n"); 1865 err = -EINVAL; 1866 goto err_request; 1867 } 1868 1869 err_request: 1870 i915_request_put(rq); 1871 err_fini: 1872 hang_fini(&h); 1873 return err; 1874 } 1875 1876 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1877 const struct igt_atomic_section *p, 1878 const char *mode) 1879 { 1880 struct tasklet_struct * const t = &engine->sched_engine->tasklet; 1881 int err; 1882 1883 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1884 engine->name, mode, p->name); 1885 1886 if (t->func) 1887 tasklet_disable(t); 1888 if (strcmp(p->name, "softirq")) 1889 local_bh_disable(); 1890 p->critical_section_begin(); 1891 1892 err = __intel_engine_reset_bh(engine, NULL); 1893 1894 p->critical_section_end(); 1895 if (strcmp(p->name, "softirq")) 1896 local_bh_enable(); 1897 if (t->func) { 1898 tasklet_enable(t); 1899 tasklet_hi_schedule(t); 1900 } 1901 1902 if (err) 1903 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1904 engine->name, mode, p->name); 1905 1906 return err; 1907 } 1908 1909 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1910 const struct igt_atomic_section *p) 1911 { 1912 struct i915_request *rq; 1913 struct hang h; 1914 int err; 1915 1916 err = __igt_atomic_reset_engine(engine, p, "idle"); 1917 if (err) 1918 return err; 1919 1920 err = hang_init(&h, engine->gt); 1921 if (err) { 1922 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1923 return err; 1924 } 1925 1926 rq = hang_create_request(&h, engine); 1927 if (IS_ERR(rq)) { 1928 err = PTR_ERR(rq); 1929 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1930 goto out; 1931 } 1932 1933 i915_request_get(rq); 1934 i915_request_add(rq); 1935 1936 if (wait_until_running(&h, rq)) { 1937 err = __igt_atomic_reset_engine(engine, p, "active"); 1938 } else { 1939 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1940 __func__, engine->name, 1941 rq->fence.seqno, hws_seqno(&h, rq)); 1942 
intel_gt_set_wedged(engine->gt); 1943 err = -EIO; 1944 } 1945 1946 if (err == 0) { 1947 struct intel_wedge_me w; 1948 1949 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1950 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1951 if (intel_gt_is_wedged(engine->gt)) 1952 err = -EIO; 1953 } 1954 1955 i915_request_put(rq); 1956 out: 1957 hang_fini(&h); 1958 return err; 1959 } 1960 1961 static int igt_reset_engines_atomic(void *arg) 1962 { 1963 struct intel_gt *gt = arg; 1964 const typeof(*igt_atomic_phases) *p; 1965 int err = 0; 1966 1967 /* Check that the engines resets are usable from atomic context */ 1968 1969 if (!intel_has_reset_engine(gt)) 1970 return 0; 1971 1972 if (intel_uc_uses_guc_submission(>->uc)) 1973 return 0; 1974 1975 igt_global_reset_lock(gt); 1976 1977 /* Flush any requests before we get started and check basics */ 1978 if (!igt_force_reset(gt)) 1979 goto unlock; 1980 1981 for (p = igt_atomic_phases; p->name; p++) { 1982 struct intel_engine_cs *engine; 1983 enum intel_engine_id id; 1984 1985 for_each_engine(engine, gt, id) { 1986 err = igt_atomic_reset_engine(engine, p); 1987 if (err) 1988 goto out; 1989 } 1990 } 1991 1992 out: 1993 /* As we poke around the guts, do a full reset before continuing. 
*/ 1994 igt_force_reset(gt); 1995 unlock: 1996 igt_global_reset_unlock(gt); 1997 1998 return err; 1999 } 2000 2001 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 2002 { 2003 static const struct i915_subtest tests[] = { 2004 SUBTEST(igt_hang_sanitycheck), 2005 SUBTEST(igt_reset_nop), 2006 SUBTEST(igt_reset_nop_engine), 2007 SUBTEST(igt_reset_idle_engine), 2008 SUBTEST(igt_reset_active_engine), 2009 SUBTEST(igt_reset_fail_engine), 2010 SUBTEST(igt_reset_engines), 2011 SUBTEST(igt_reset_engines_atomic), 2012 SUBTEST(igt_reset_queue), 2013 SUBTEST(igt_reset_wait), 2014 SUBTEST(igt_reset_evict_ggtt), 2015 SUBTEST(igt_reset_evict_ppgtt), 2016 SUBTEST(igt_reset_evict_fence), 2017 SUBTEST(igt_handle_error), 2018 }; 2019 struct intel_gt *gt = to_gt(i915); 2020 intel_wakeref_t wakeref; 2021 int err; 2022 2023 if (!intel_has_gpu_reset(gt)) 2024 return 0; 2025 2026 if (intel_gt_is_wedged(gt)) 2027 return -EIO; /* we're long past hope of a successful reset */ 2028 2029 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 2030 2031 err = intel_gt_live_subtests(tests, gt); 2032 2033 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 2034 2035 return err; 2036 } 2037