1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2016 Intel Corporation 4 */ 5 6 #include <linux/kthread.h> 7 8 #include "gem/i915_gem_context.h" 9 #include "gem/i915_gem_internal.h" 10 11 #include "i915_gem_evict.h" 12 #include "intel_gt.h" 13 #include "intel_engine_heartbeat.h" 14 #include "intel_engine_pm.h" 15 #include "selftest_engine_heartbeat.h" 16 17 #include "i915_selftest.h" 18 #include "selftests/i915_random.h" 19 #include "selftests/igt_flush_test.h" 20 #include "selftests/igt_reset.h" 21 #include "selftests/igt_atomic.h" 22 #include "selftests/igt_spinner.h" 23 #include "selftests/intel_scheduler_helpers.h" 24 25 #include "selftests/mock_drm.h" 26 27 #include "gem/selftests/mock_context.h" 28 #include "gem/selftests/igt_gem_utils.h" 29 30 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 31 32 struct hang { 33 struct intel_gt *gt; 34 struct drm_i915_gem_object *hws; 35 struct drm_i915_gem_object *obj; 36 struct i915_gem_context *ctx; 37 u32 *seqno; 38 u32 *batch; 39 }; 40 41 static int hang_init(struct hang *h, struct intel_gt *gt) 42 { 43 void *vaddr; 44 int err; 45 46 memset(h, 0, sizeof(*h)); 47 h->gt = gt; 48 49 h->ctx = kernel_context(gt->i915, NULL); 50 if (IS_ERR(h->ctx)) 51 return PTR_ERR(h->ctx); 52 53 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 54 55 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 56 if (IS_ERR(h->hws)) { 57 err = PTR_ERR(h->hws); 58 goto err_ctx; 59 } 60 61 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 62 if (IS_ERR(h->obj)) { 63 err = PTR_ERR(h->obj); 64 goto err_hws; 65 } 66 67 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 68 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB); 69 if (IS_ERR(vaddr)) { 70 err = PTR_ERR(vaddr); 71 goto err_obj; 72 } 73 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 74 75 vaddr = i915_gem_object_pin_map_unlocked(h->obj, 76 i915_coherent_map_type(gt->i915, h->obj, false)); 77 if (IS_ERR(vaddr)) { 78 err = PTR_ERR(vaddr); 79 goto err_unpin_hws; 80 } 81 h->batch = vaddr; 82 83 return 0; 84 85 err_unpin_hws: 86 i915_gem_object_unpin_map(h->hws); 87 err_obj: 88 i915_gem_object_put(h->obj); 89 err_hws: 90 i915_gem_object_put(h->hws); 91 err_ctx: 92 kernel_context_close(h->ctx); 93 return err; 94 } 95 96 static u64 hws_address(const struct i915_vma *hws, 97 const struct i915_request *rq) 98 { 99 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 100 } 101 102 static int move_to_active(struct i915_vma *vma, 103 struct i915_request *rq, 104 unsigned int flags) 105 { 106 int err; 107 108 i915_vma_lock(vma); 109 err = i915_request_await_object(rq, vma->obj, 110 flags & EXEC_OBJECT_WRITE); 111 if (err == 0) 112 err = i915_vma_move_to_active(vma, rq, flags); 113 i915_vma_unlock(vma); 114 115 return err; 116 } 117 118 static struct i915_request * 119 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 120 { 121 struct intel_gt *gt = h->gt; 122 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx); 123 struct drm_i915_gem_object *obj; 124 struct i915_request *rq = NULL; 125 struct i915_vma *hws, *vma; 126 unsigned int flags; 127 void *vaddr; 128 u32 *batch; 129 int err; 130 131 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 132 if (IS_ERR(obj)) { 133 i915_vm_put(vm); 134 return ERR_CAST(obj); 135 } 136 137 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false)); 138 if (IS_ERR(vaddr)) { 139 i915_gem_object_put(obj); 140 i915_vm_put(vm); 141 return ERR_CAST(vaddr); 142 } 143 144 i915_gem_object_unpin_map(h->obj); 145 i915_gem_object_put(h->obj); 146 147 h->obj = obj; 148 h->batch = vaddr; 149 150 vma = i915_vma_instance(h->obj, vm, NULL); 151 if (IS_ERR(vma)) { 152 i915_vm_put(vm); 153 return ERR_CAST(vma); 154 } 155 156 hws = i915_vma_instance(h->hws, vm, NULL); 157 if (IS_ERR(hws)) { 158 i915_vm_put(vm); 159 return ERR_CAST(hws); 160 } 161 162 err = i915_vma_pin(vma, 0, 0, PIN_USER); 163 if (err) { 164 i915_vm_put(vm); 165 return ERR_PTR(err); 166 } 167 168 err = i915_vma_pin(hws, 0, 0, PIN_USER); 169 if (err) 170 goto unpin_vma; 171 172 rq = igt_request_alloc(h->ctx, engine); 173 if (IS_ERR(rq)) { 174 err = PTR_ERR(rq); 175 goto unpin_hws; 176 } 177 178 err = move_to_active(vma, rq, 0); 179 if (err) 180 goto cancel_rq; 181 182 err = move_to_active(hws, rq, 0); 183 if (err) 184 goto cancel_rq; 185 186 batch = h->batch; 187 if (GRAPHICS_VER(gt->i915) >= 8) { 188 *batch++ = MI_STORE_DWORD_IMM_GEN4; 189 *batch++ = lower_32_bits(hws_address(hws, rq)); 190 *batch++ = upper_32_bits(hws_address(hws, rq)); 191 *batch++ = rq->fence.seqno; 192 *batch++ = MI_NOOP; 193 194 memset(batch, 0, 1024); 195 batch += 1024 / sizeof(*batch); 196 197 *batch++ = MI_NOOP; 198 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 199 *batch++ = lower_32_bits(vma->node.start); 200 *batch++ = upper_32_bits(vma->node.start); 201 } else if (GRAPHICS_VER(gt->i915) >= 6) { 202 *batch++ = MI_STORE_DWORD_IMM_GEN4; 203 *batch++ = 0; 204 *batch++ = lower_32_bits(hws_address(hws, rq)); 205 *batch++ = rq->fence.seqno; 206 *batch++ = MI_NOOP; 207 208 memset(batch, 0, 1024); 209 batch += 1024 / sizeof(*batch); 210 211 *batch++ = MI_NOOP; 212 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 213 *batch++ = lower_32_bits(vma->node.start); 214 } else if (GRAPHICS_VER(gt->i915) >= 4) { 215 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 216 *batch++ = 0; 217 *batch++ = lower_32_bits(hws_address(hws, rq)); 218 *batch++ = rq->fence.seqno; 219 *batch++ = MI_NOOP; 220 221 memset(batch, 0, 1024); 222 batch += 1024 / sizeof(*batch); 223 224 *batch++ = MI_NOOP; 225 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 226 *batch++ = lower_32_bits(vma->node.start); 227 } else { 228 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 229 *batch++ = lower_32_bits(hws_address(hws, rq)); 230 *batch++ = rq->fence.seqno; 231 *batch++ = MI_NOOP; 232 233 memset(batch, 0, 1024); 234 batch += 1024 / sizeof(*batch); 235 236 *batch++ = MI_NOOP; 237 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 238 *batch++ = lower_32_bits(vma->node.start); 239 } 240 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 241 intel_gt_chipset_flush(engine->gt); 242 243 if (rq->engine->emit_init_breadcrumb) { 244 err = rq->engine->emit_init_breadcrumb(rq); 245 if (err) 246 goto cancel_rq; 247 } 248 249 flags = 0; 250 if (GRAPHICS_VER(gt->i915) <= 5) 251 flags |= I915_DISPATCH_SECURE; 252 253 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 254 255 cancel_rq: 256 if (err) { 257 i915_request_set_error_once(rq, err); 258 i915_request_add(rq); 259 } 260 unpin_hws: 261 i915_vma_unpin(hws); 262 unpin_vma: 263 i915_vma_unpin(vma); 264 i915_vm_put(vm); 265 return err ? ERR_PTR(err) : rq; 266 } 267 268 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 269 { 270 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 271 } 272 273 static void hang_fini(struct hang *h) 274 { 275 *h->batch = MI_BATCH_BUFFER_END; 276 intel_gt_chipset_flush(h->gt); 277 278 i915_gem_object_unpin_map(h->obj); 279 i915_gem_object_put(h->obj); 280 281 i915_gem_object_unpin_map(h->hws); 282 i915_gem_object_put(h->hws); 283 284 kernel_context_close(h->ctx); 285 286 igt_flush_test(h->gt->i915); 287 } 288 289 static bool wait_until_running(struct hang *h, struct i915_request *rq) 290 { 291 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 292 rq->fence.seqno), 293 10) && 294 wait_for(i915_seqno_passed(hws_seqno(h, rq), 295 rq->fence.seqno), 296 1000)); 297 } 298 299 static int igt_hang_sanitycheck(void *arg) 300 { 301 struct intel_gt *gt = arg; 302 struct i915_request *rq; 303 struct intel_engine_cs *engine; 304 enum intel_engine_id id; 305 struct hang h; 306 int err; 307 308 /* Basic check that we can execute our hanging batch */ 309 310 err = hang_init(&h, gt); 311 if (err) 312 return err; 313 314 for_each_engine(engine, gt, id) { 315 struct intel_wedge_me w; 316 long timeout; 317 318 if (!intel_engine_can_store_dword(engine)) 319 continue; 320 321 rq = hang_create_request(&h, engine); 322 if (IS_ERR(rq)) { 323 err = PTR_ERR(rq); 324 pr_err("Failed to create request for %s, err=%d\n", 325 engine->name, err); 326 goto fini; 327 } 328 329 i915_request_get(rq); 330 331 *h.batch = MI_BATCH_BUFFER_END; 332 intel_gt_chipset_flush(engine->gt); 333 334 i915_request_add(rq); 335 336 timeout = 0; 337 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 338 timeout = i915_request_wait(rq, 0, 339 MAX_SCHEDULE_TIMEOUT); 340 if (intel_gt_is_wedged(gt)) 341 timeout = -EIO; 342 343 i915_request_put(rq); 344 345 if (timeout < 0) { 346 err = timeout; 347 pr_err("Wait for request failed on %s, err=%d\n", 348 engine->name, err); 349 goto fini; 350 } 351 } 352 353 fini: 354 hang_fini(&h); 355 return err; 356 } 357 358 static bool wait_for_idle(struct intel_engine_cs *engine) 359 { 360 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 361 } 362 363 static int igt_reset_nop(void *arg) 364 { 365 struct intel_gt *gt = arg; 366 struct i915_gpu_error *global = >->i915->gpu_error; 367 struct intel_engine_cs *engine; 368 unsigned int reset_count, count; 369 enum intel_engine_id id; 370 IGT_TIMEOUT(end_time); 371 int err = 0; 372 373 /* Check that we can reset during non-user portions of requests */ 374 375 reset_count = i915_reset_count(global); 376 count = 0; 377 do { 378 for_each_engine(engine, gt, id) { 379 struct intel_context *ce; 380 int i; 381 382 ce = intel_context_create(engine); 383 if (IS_ERR(ce)) { 384 err = PTR_ERR(ce); 385 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 386 break; 387 } 388 389 for (i = 0; i < 16; i++) { 390 struct i915_request *rq; 391 392 rq = intel_context_create_request(ce); 393 if (IS_ERR(rq)) { 394 err = PTR_ERR(rq); 395 pr_err("[%s] Create request failed: %d!\n", 396 engine->name, err); 397 break; 398 } 399 400 i915_request_add(rq); 401 } 402 403 intel_context_put(ce); 404 } 405 406 igt_global_reset_lock(gt); 407 intel_gt_reset(gt, ALL_ENGINES, NULL); 408 igt_global_reset_unlock(gt); 409 410 if (intel_gt_is_wedged(gt)) { 411 pr_err("[%s] GT is wedged!\n", engine->name); 412 err = -EIO; 413 break; 414 } 415 416 if (i915_reset_count(global) != reset_count + ++count) { 417 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n", 418 engine->name, i915_reset_count(global), reset_count, count); 419 err = -EINVAL; 420 break; 421 } 422 423 err = igt_flush_test(gt->i915); 424 if (err) { 425 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 426 break; 427 } 428 } while (time_before(jiffies, end_time)); 429 pr_info("%s: %d resets\n", __func__, count); 430 431 if (igt_flush_test(gt->i915)) { 432 pr_err("Post flush failed: %d!\n", err); 433 err = -EIO; 434 } 435 436 return err; 437 } 438 439 static int igt_reset_nop_engine(void *arg) 440 { 441 struct intel_gt *gt = arg; 442 struct i915_gpu_error *global = >->i915->gpu_error; 443 struct intel_engine_cs *engine; 444 enum intel_engine_id id; 445 446 /* Check that we can engine-reset during non-user portions */ 447 448 if (!intel_has_reset_engine(gt)) 449 return 0; 450 451 for_each_engine(engine, gt, id) { 452 unsigned int reset_count, reset_engine_count, count; 453 struct intel_context *ce; 454 IGT_TIMEOUT(end_time); 455 int err; 456 457 if (intel_engine_uses_guc(engine)) { 458 /* Engine level resets are triggered by GuC when a hang 459 * is detected. They can't be triggered by the KMD any 460 * more. Thus a nop batch cannot be used as a reset test 461 */ 462 continue; 463 } 464 465 ce = intel_context_create(engine); 466 if (IS_ERR(ce)) { 467 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 468 return PTR_ERR(ce); 469 } 470 471 reset_count = i915_reset_count(global); 472 reset_engine_count = i915_reset_engine_count(global, engine); 473 count = 0; 474 475 st_engine_heartbeat_disable(engine); 476 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 477 >->reset.flags)); 478 do { 479 int i; 480 481 if (!wait_for_idle(engine)) { 482 pr_err("%s failed to idle before reset\n", 483 engine->name); 484 err = -EIO; 485 break; 486 } 487 488 for (i = 0; i < 16; i++) { 489 struct i915_request *rq; 490 491 rq = intel_context_create_request(ce); 492 if (IS_ERR(rq)) { 493 struct drm_printer p = 494 drm_info_printer(gt->i915->drm.dev); 495 intel_engine_dump(engine, &p, 496 "%s(%s): failed to submit request\n", 497 __func__, 498 engine->name); 499 500 GEM_TRACE("%s(%s): failed to submit request\n", 501 __func__, 502 engine->name); 503 GEM_TRACE_DUMP(); 504 505 intel_gt_set_wedged(gt); 506 507 err = PTR_ERR(rq); 508 break; 509 } 510 511 i915_request_add(rq); 512 } 513 err = intel_engine_reset(engine, NULL); 514 if (err) { 515 pr_err("intel_engine_reset(%s) failed, err:%d\n", 516 engine->name, err); 517 break; 518 } 519 520 if (i915_reset_count(global) != reset_count) { 521 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 522 err = -EINVAL; 523 break; 524 } 525 526 if (i915_reset_engine_count(global, engine) != 527 reset_engine_count + ++count) { 528 pr_err("%s engine reset not recorded!\n", 529 engine->name); 530 err = -EINVAL; 531 break; 532 } 533 } while (time_before(jiffies, end_time)); 534 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 535 st_engine_heartbeat_enable(engine); 536 537 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 538 539 intel_context_put(ce); 540 if (igt_flush_test(gt->i915)) 541 err = -EIO; 542 if (err) 543 return err; 544 } 545 546 return 0; 547 } 548 549 static void force_reset_timeout(struct intel_engine_cs *engine) 550 { 551 engine->reset_timeout.probability = 999; 552 atomic_set(&engine->reset_timeout.times, -1); 553 } 554 555 static void cancel_reset_timeout(struct intel_engine_cs *engine) 556 { 557 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout)); 558 } 559 560 static int igt_reset_fail_engine(void *arg) 561 { 562 struct intel_gt *gt = arg; 563 struct intel_engine_cs *engine; 564 enum intel_engine_id id; 565 566 /* Check that we can recover from engine-reset failues */ 567 568 if (!intel_has_reset_engine(gt)) 569 return 0; 570 571 for_each_engine(engine, gt, id) { 572 unsigned int count; 573 struct intel_context *ce; 574 IGT_TIMEOUT(end_time); 575 int err; 576 577 /* Can't manually break the reset if i915 doesn't perform it */ 578 if (intel_engine_uses_guc(engine)) 579 continue; 580 581 ce = intel_context_create(engine); 582 if (IS_ERR(ce)) { 583 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 584 return PTR_ERR(ce); 585 } 586 587 st_engine_heartbeat_disable(engine); 588 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 589 >->reset.flags)); 590 591 force_reset_timeout(engine); 592 err = intel_engine_reset(engine, NULL); 593 cancel_reset_timeout(engine); 594 if (err == 0) /* timeouts only generated on gen8+ */ 595 goto skip; 596 597 count = 0; 598 do { 599 struct i915_request *last = NULL; 600 int i; 601 602 if (!wait_for_idle(engine)) { 603 pr_err("%s failed to idle before reset\n", 604 engine->name); 605 err = -EIO; 606 break; 607 } 608 609 for (i = 0; i < count % 15; i++) { 610 struct i915_request *rq; 611 612 rq = intel_context_create_request(ce); 613 if (IS_ERR(rq)) { 614 struct drm_printer p = 615 drm_info_printer(gt->i915->drm.dev); 616 intel_engine_dump(engine, &p, 617 "%s(%s): failed to submit request\n", 618 __func__, 619 engine->name); 620 621 GEM_TRACE("%s(%s): failed to submit request\n", 622 __func__, 623 engine->name); 624 GEM_TRACE_DUMP(); 625 626 intel_gt_set_wedged(gt); 627 if (last) 628 i915_request_put(last); 629 630 err = PTR_ERR(rq); 631 goto out; 632 } 633 634 if (last) 635 i915_request_put(last); 636 last = i915_request_get(rq); 637 i915_request_add(rq); 638 } 639 640 if (count & 1) { 641 err = intel_engine_reset(engine, NULL); 642 if (err) { 643 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n", 644 engine->name, err); 645 GEM_TRACE_DUMP(); 646 i915_request_put(last); 647 break; 648 } 649 } else { 650 force_reset_timeout(engine); 651 err = intel_engine_reset(engine, NULL); 652 cancel_reset_timeout(engine); 653 if (err != -ETIMEDOUT) { 654 pr_err("intel_engine_reset(%s) did not fail, err:%d\n", 655 engine->name, err); 656 i915_request_put(last); 657 break; 658 } 659 } 660 661 err = 0; 662 if (last) { 663 if (i915_request_wait(last, 0, HZ / 2) < 0) { 664 struct drm_printer p = 665 drm_info_printer(gt->i915->drm.dev); 666 667 intel_engine_dump(engine, &p, 668 "%s(%s): failed to complete request\n", 669 __func__, 670 engine->name); 671 672 GEM_TRACE("%s(%s): failed to complete request\n", 673 __func__, 674 engine->name); 675 GEM_TRACE_DUMP(); 676 677 err = -EIO; 678 } 679 i915_request_put(last); 680 } 681 count++; 682 } while (err == 0 && time_before(jiffies, end_time)); 683 out: 684 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 685 skip: 686 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 687 st_engine_heartbeat_enable(engine); 688 intel_context_put(ce); 689 690 if (igt_flush_test(gt->i915)) 691 err = -EIO; 692 if (err) 693 return err; 694 } 695 696 return 0; 697 } 698 699 static int __igt_reset_engine(struct intel_gt *gt, bool active) 700 { 701 struct i915_gpu_error *global = >->i915->gpu_error; 702 struct intel_engine_cs *engine; 703 enum intel_engine_id id; 704 struct hang h; 705 int err = 0; 706 707 /* Check that we can issue an engine reset on an idle engine (no-op) */ 708 709 if (!intel_has_reset_engine(gt)) 710 return 0; 711 712 if (active) { 713 err = hang_init(&h, gt); 714 if (err) 715 return err; 716 } 717 718 for_each_engine(engine, gt, id) { 719 unsigned int reset_count, reset_engine_count; 720 unsigned long count; 721 bool using_guc = intel_engine_uses_guc(engine); 722 IGT_TIMEOUT(end_time); 723 724 if (using_guc && !active) 725 continue; 726 727 if (active && !intel_engine_can_store_dword(engine)) 728 continue; 729 730 if (!wait_for_idle(engine)) { 731 pr_err("%s failed to idle before reset\n", 732 engine->name); 733 err = -EIO; 734 break; 735 } 736 737 reset_count = i915_reset_count(global); 738 reset_engine_count = i915_reset_engine_count(global, engine); 739 740 st_engine_heartbeat_disable(engine); 741 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 742 >->reset.flags)); 743 count = 0; 744 do { 745 struct i915_request *rq = NULL; 746 struct intel_selftest_saved_policy saved; 747 int err2; 748 749 err = intel_selftest_modify_policy(engine, &saved, 750 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 751 if (err) { 752 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 753 break; 754 } 755 756 if (active) { 757 rq = hang_create_request(&h, engine); 758 if (IS_ERR(rq)) { 759 err = PTR_ERR(rq); 760 pr_err("[%s] Create hang request failed: %d!\n", 761 engine->name, err); 762 goto restore; 763 } 764 765 i915_request_get(rq); 766 i915_request_add(rq); 767 768 if (!wait_until_running(&h, rq)) { 769 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 770 771 pr_err("%s: Failed to start request %llx, at %x\n", 772 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 773 intel_engine_dump(engine, &p, 774 "%s\n", engine->name); 775 776 i915_request_put(rq); 777 err = -EIO; 778 goto restore; 779 } 780 } 781 782 if (!using_guc) { 783 err = intel_engine_reset(engine, NULL); 784 if (err) { 785 pr_err("intel_engine_reset(%s) failed, err:%d\n", 786 engine->name, err); 787 goto skip; 788 } 789 } 790 791 if (rq) { 792 /* Ensure the reset happens and kills the engine */ 793 err = intel_selftest_wait_for_rq(rq); 794 if (err) 795 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 796 engine->name, rq->fence.context, 797 rq->fence.seqno, rq->context->guc_id.id, err); 798 } 799 800 skip: 801 if (rq) 802 i915_request_put(rq); 803 804 if (i915_reset_count(global) != reset_count) { 805 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 806 err = -EINVAL; 807 goto restore; 808 } 809 810 /* GuC based resets are not logged per engine */ 811 if (!using_guc) { 812 if (i915_reset_engine_count(global, engine) != 813 ++reset_engine_count) { 814 pr_err("%s engine reset not recorded!\n", 815 engine->name); 816 err = -EINVAL; 817 goto restore; 818 } 819 } 820 821 count++; 822 823 restore: 824 err2 = intel_selftest_restore_policy(engine, &saved); 825 if (err2) 826 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err); 827 if (err == 0) 828 err = err2; 829 if (err) 830 break; 831 } while (time_before(jiffies, end_time)); 832 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 833 st_engine_heartbeat_enable(engine); 834 pr_info("%s: Completed %lu %s resets\n", 835 engine->name, count, active ? "active" : "idle"); 836 837 if (err) 838 break; 839 840 err = igt_flush_test(gt->i915); 841 if (err) { 842 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 843 break; 844 } 845 } 846 847 if (intel_gt_is_wedged(gt)) { 848 pr_err("GT is wedged!\n"); 849 err = -EIO; 850 } 851 852 if (active) 853 hang_fini(&h); 854 855 return err; 856 } 857 858 static int igt_reset_idle_engine(void *arg) 859 { 860 return __igt_reset_engine(arg, false); 861 } 862 863 static int igt_reset_active_engine(void *arg) 864 { 865 return __igt_reset_engine(arg, true); 866 } 867 868 struct active_engine { 869 struct kthread_worker *worker; 870 struct kthread_work work; 871 struct intel_engine_cs *engine; 872 unsigned long resets; 873 unsigned int flags; 874 bool stop; 875 int result; 876 }; 877 878 #define TEST_ACTIVE BIT(0) 879 #define TEST_OTHERS BIT(1) 880 #define TEST_SELF BIT(2) 881 #define TEST_PRIORITY BIT(3) 882 883 static int active_request_put(struct i915_request *rq) 884 { 885 int err = 0; 886 887 if (!rq) 888 return 0; 889 890 if (i915_request_wait(rq, 0, 10 * HZ) < 0) { 891 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 892 rq->engine->name, 893 rq->fence.context, 894 rq->fence.seqno); 895 GEM_TRACE_DUMP(); 896 897 intel_gt_set_wedged(rq->engine->gt); 898 err = -EIO; 899 } 900 901 i915_request_put(rq); 902 903 return err; 904 } 905 906 static void active_engine(struct kthread_work *work) 907 { 908 I915_RND_STATE(prng); 909 struct active_engine *arg = container_of(work, typeof(*arg), work); 910 struct intel_engine_cs *engine = arg->engine; 911 struct i915_request *rq[8] = {}; 912 struct intel_context *ce[ARRAY_SIZE(rq)]; 913 unsigned long count; 914 int err = 0; 915 916 for (count = 0; count < ARRAY_SIZE(ce); count++) { 917 ce[count] = intel_context_create(engine); 918 if (IS_ERR(ce[count])) { 919 arg->result = PTR_ERR(ce[count]); 920 pr_err("[%s] Create context #%ld failed: %d!\n", 921 engine->name, count, arg->result); 922 while (--count) 923 intel_context_put(ce[count]); 924 return; 925 } 926 } 927 928 count = 0; 929 while (!READ_ONCE(arg->stop)) { 930 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 931 struct i915_request *old = rq[idx]; 932 struct i915_request *new; 933 934 new = intel_context_create_request(ce[idx]); 935 if (IS_ERR(new)) { 936 err = PTR_ERR(new); 937 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err); 938 break; 939 } 940 941 rq[idx] = i915_request_get(new); 942 i915_request_add(new); 943 944 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) { 945 struct i915_sched_attr attr = { 946 .priority = 947 i915_prandom_u32_max_state(512, &prng), 948 }; 949 engine->sched_engine->schedule(rq[idx], &attr); 950 } 951 952 err = active_request_put(old); 953 if (err) { 954 pr_err("[%s] Request put failed: %d!\n", engine->name, err); 955 break; 956 } 957 958 cond_resched(); 959 } 960 961 for (count = 0; count < ARRAY_SIZE(rq); count++) { 962 int err__ = active_request_put(rq[count]); 963 964 if (err) 965 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err); 966 967 /* Keep the first error */ 968 if (!err) 969 err = err__; 970 971 intel_context_put(ce[count]); 972 } 973 974 arg->result = err; 975 } 976 977 static int __igt_reset_engines(struct intel_gt *gt, 978 const char *test_name, 979 unsigned int flags) 980 { 981 struct i915_gpu_error *global = >->i915->gpu_error; 982 struct intel_engine_cs *engine, *other; 983 struct active_engine *threads; 984 enum intel_engine_id id, tmp; 985 struct hang h; 986 int err = 0; 987 988 /* Check that issuing a reset on one engine does not interfere 989 * with any other engine. 990 */ 991 992 if (!intel_has_reset_engine(gt)) 993 return 0; 994 995 if (flags & TEST_ACTIVE) { 996 err = hang_init(&h, gt); 997 if (err) 998 return err; 999 1000 if (flags & TEST_PRIORITY) 1001 h.ctx->sched.priority = 1024; 1002 } 1003 1004 threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL); 1005 if (!threads) 1006 return -ENOMEM; 1007 1008 for_each_engine(engine, gt, id) { 1009 unsigned long device = i915_reset_count(global); 1010 unsigned long count = 0, reported; 1011 bool using_guc = intel_engine_uses_guc(engine); 1012 IGT_TIMEOUT(end_time); 1013 1014 if (flags & TEST_ACTIVE) { 1015 if (!intel_engine_can_store_dword(engine)) 1016 continue; 1017 } else if (using_guc) 1018 continue; 1019 1020 if (!wait_for_idle(engine)) { 1021 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 1022 engine->name, test_name); 1023 err = -EIO; 1024 break; 1025 } 1026 1027 memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES); 1028 for_each_engine(other, gt, tmp) { 1029 struct kthread_worker *worker; 1030 1031 threads[tmp].resets = 1032 i915_reset_engine_count(global, other); 1033 1034 if (other == engine && !(flags & TEST_SELF)) 1035 continue; 1036 1037 if (other != engine && !(flags & TEST_OTHERS)) 1038 continue; 1039 1040 threads[tmp].engine = other; 1041 threads[tmp].flags = flags; 1042 1043 worker = kthread_create_worker(0, "igt/%s", 1044 other->name); 1045 if (IS_ERR(worker)) { 1046 err = PTR_ERR(worker); 1047 pr_err("[%s] Worker create failed: %d!\n", 1048 engine->name, err); 1049 goto unwind; 1050 } 1051 1052 threads[tmp].worker = worker; 1053 1054 kthread_init_work(&threads[tmp].work, active_engine); 1055 kthread_queue_work(threads[tmp].worker, 1056 &threads[tmp].work); 1057 } 1058 1059 st_engine_heartbeat_disable_no_pm(engine); 1060 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 1061 >->reset.flags)); 1062 do { 1063 struct i915_request *rq = NULL; 1064 struct intel_selftest_saved_policy saved; 1065 int err2; 1066 1067 err = intel_selftest_modify_policy(engine, &saved, 1068 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 1069 if (err) { 1070 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1071 break; 1072 } 1073 1074 if (flags & TEST_ACTIVE) { 1075 rq = hang_create_request(&h, engine); 1076 if (IS_ERR(rq)) { 1077 err = PTR_ERR(rq); 1078 pr_err("[%s] Create hang request failed: %d!\n", 1079 engine->name, err); 1080 goto restore; 1081 } 1082 1083 i915_request_get(rq); 1084 i915_request_add(rq); 1085 1086 if (!wait_until_running(&h, rq)) { 1087 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1088 1089 pr_err("%s: Failed to start request %llx, at %x\n", 1090 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1091 intel_engine_dump(engine, &p, 1092 "%s\n", engine->name); 1093 1094 i915_request_put(rq); 1095 err = -EIO; 1096 goto restore; 1097 } 1098 } else { 1099 intel_engine_pm_get(engine); 1100 } 1101 1102 if (!using_guc) { 1103 err = intel_engine_reset(engine, NULL); 1104 if (err) { 1105 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 1106 engine->name, test_name, err); 1107 goto restore; 1108 } 1109 } 1110 1111 if (rq) { 1112 /* Ensure the reset happens and kills the engine */ 1113 err = intel_selftest_wait_for_rq(rq); 1114 if (err) 1115 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 1116 engine->name, rq->fence.context, 1117 rq->fence.seqno, rq->context->guc_id.id, err); 1118 } 1119 1120 count++; 1121 1122 if (rq) { 1123 if (rq->fence.error != -EIO) { 1124 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n", 1125 engine->name, test_name, 1126 rq->fence.context, 1127 rq->fence.seqno, rq->context->guc_id.id); 1128 i915_request_put(rq); 1129 1130 GEM_TRACE_DUMP(); 1131 intel_gt_set_wedged(gt); 1132 err = -EIO; 1133 goto restore; 1134 } 1135 1136 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1137 struct drm_printer p = 1138 drm_info_printer(gt->i915->drm.dev); 1139 1140 pr_err("i915_reset_engine(%s:%s):" 1141 " failed to complete request %llx:%lld after reset\n", 1142 engine->name, test_name, 1143 rq->fence.context, 1144 rq->fence.seqno); 1145 intel_engine_dump(engine, &p, 1146 "%s\n", engine->name); 1147 i915_request_put(rq); 1148 1149 GEM_TRACE_DUMP(); 1150 intel_gt_set_wedged(gt); 1151 err = -EIO; 1152 goto restore; 1153 } 1154 1155 i915_request_put(rq); 1156 } 1157 1158 if (!(flags & TEST_ACTIVE)) 1159 intel_engine_pm_put(engine); 1160 1161 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 1162 struct drm_printer p = 1163 drm_info_printer(gt->i915->drm.dev); 1164 1165 pr_err("i915_reset_engine(%s:%s):" 1166 " failed to idle after reset\n", 1167 engine->name, test_name); 1168 intel_engine_dump(engine, &p, 1169 "%s\n", engine->name); 1170 1171 err = -EIO; 1172 goto restore; 1173 } 1174 1175 restore: 1176 err2 = intel_selftest_restore_policy(engine, &saved); 1177 if (err2) 1178 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2); 1179 if (err == 0) 1180 err = err2; 1181 if (err) 1182 break; 1183 } while (time_before(jiffies, end_time)); 1184 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 1185 st_engine_heartbeat_enable_no_pm(engine); 1186 1187 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 1188 engine->name, test_name, count); 1189 1190 /* GuC based resets are not logged per engine */ 1191 if (!using_guc) { 1192 reported = i915_reset_engine_count(global, engine); 1193 reported -= threads[engine->id].resets; 1194 if (reported != count) { 1195 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 1196 engine->name, test_name, count, reported); 1197 if (!err) 1198 err = -EINVAL; 1199 } 1200 } 1201 1202 unwind: 1203 for_each_engine(other, gt, tmp) { 1204 int ret; 1205 1206 if (!threads[tmp].worker) 1207 continue; 1208 1209 WRITE_ONCE(threads[tmp].stop, true); 1210 kthread_flush_work(&threads[tmp].work); 1211 ret = READ_ONCE(threads[tmp].result); 1212 if (ret) { 1213 pr_err("kthread for other engine %s failed, err=%d\n", 1214 other->name, ret); 1215 if (!err) 1216 err = ret; 1217 } 1218 1219 kthread_destroy_worker(threads[tmp].worker); 1220 1221 /* GuC based resets are not logged per engine */ 1222 if (!using_guc) { 1223 if (other->uabi_class != engine->uabi_class && 1224 threads[tmp].resets != 1225 i915_reset_engine_count(global, other)) { 1226 pr_err("Innocent engine %s was reset (count=%ld)\n", 1227 other->name, 1228 i915_reset_engine_count(global, other) - 1229 threads[tmp].resets); 1230 if (!err) 1231 err = -EINVAL; 1232 } 1233 } 1234 } 1235 1236 if (device != i915_reset_count(global)) { 1237 pr_err("Global reset (count=%ld)!\n", 1238 i915_reset_count(global) - device); 1239 if (!err) 1240 err = -EINVAL; 1241 } 1242 1243 if (err) 1244 break; 1245 1246 err = igt_flush_test(gt->i915); 1247 if (err) { 1248 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1249 break; 1250 } 1251 } 1252 kfree(threads); 1253 1254 if (intel_gt_is_wedged(gt)) 1255 err = -EIO; 1256 1257 if (flags & TEST_ACTIVE) 1258 hang_fini(&h); 1259 1260 return err; 1261 } 1262 1263 static int igt_reset_engines(void *arg) 1264 { 1265 static const struct { 1266 const char *name; 1267 unsigned int flags; 1268 } phases[] = { 1269 { "idle", 0 }, 1270 { "active", TEST_ACTIVE }, 1271 { "others-idle", TEST_OTHERS }, 1272 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1273 { 1274 "others-priority", 1275 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1276 }, 1277 { 1278 "self-priority", 1279 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1280 }, 1281 { } 1282 }; 1283 struct intel_gt *gt = arg; 1284 typeof(*phases) *p; 1285 int err; 1286 1287 for (p = phases; p->name; p++) { 1288 if (p->flags & TEST_PRIORITY) { 1289 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1290 continue; 1291 } 1292 1293 err = __igt_reset_engines(arg, p->name, p->flags); 1294 if (err) 1295 return err; 1296 } 1297 1298 return 0; 1299 } 1300 1301 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1302 { 1303 u32 count = i915_reset_count(>->i915->gpu_error); 1304 1305 intel_gt_reset(gt, mask, NULL); 1306 1307 return count; 1308 } 1309 1310 static int igt_reset_wait(void *arg) 1311 { 1312 struct intel_gt *gt = arg; 1313 struct i915_gpu_error *global = >->i915->gpu_error; 1314 struct intel_engine_cs *engine; 1315 struct i915_request *rq; 1316 unsigned int reset_count; 1317 struct hang h; 1318 long timeout; 1319 int err; 1320 1321 engine = intel_selftest_find_any_engine(gt); 1322 1323 if (!engine || !intel_engine_can_store_dword(engine)) 1324 return 0; 1325 1326 /* Check that we detect a stuck waiter and issue a reset */ 1327 1328 igt_global_reset_lock(gt); 1329 1330 err = hang_init(&h, gt); 1331 if (err) { 1332 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1333 goto unlock; 1334 } 1335 1336 rq = hang_create_request(&h, engine); 1337 if (IS_ERR(rq)) { 1338 err = PTR_ERR(rq); 1339 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1340 goto fini; 1341 } 1342 1343 i915_request_get(rq); 1344 i915_request_add(rq); 1345 1346 if (!wait_until_running(&h, rq)) { 1347 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1348 1349 pr_err("%s: Failed to start request %llx, at %x\n", 1350 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1351 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1352 1353 intel_gt_set_wedged(gt); 1354 1355 err = -EIO; 1356 goto out_rq; 1357 } 1358 1359 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1360 1361 timeout = i915_request_wait(rq, 0, 10); 1362 if (timeout < 0) { 1363 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1364 timeout); 1365 err = timeout; 1366 goto out_rq; 1367 } 1368 1369 if (i915_reset_count(global) == reset_count) { 1370 pr_err("No GPU reset recorded!\n"); 1371 err = -EINVAL; 1372 goto out_rq; 1373 } 1374 1375 out_rq: 1376 i915_request_put(rq); 1377 fini: 1378 hang_fini(&h); 1379 unlock: 1380 igt_global_reset_unlock(gt); 1381 1382 if (intel_gt_is_wedged(gt)) 1383 return -EIO; 1384 1385 return err; 1386 } 1387 1388 struct evict_vma { 1389 struct completion completion; 1390 struct i915_vma *vma; 1391 }; 1392 1393 static int evict_vma(void *data) 1394 { 1395 struct evict_vma *arg = data; 1396 struct i915_address_space *vm = arg->vma->vm; 1397 struct drm_mm_node evict = arg->vma->node; 1398 int err; 1399 1400 complete(&arg->completion); 1401 1402 mutex_lock(&vm->mutex); 1403 err = i915_gem_evict_for_node(vm, NULL, &evict, 0); 1404 mutex_unlock(&vm->mutex); 1405 1406 return err; 1407 } 1408 1409 static int evict_fence(void *data) 1410 { 1411 struct evict_vma *arg = data; 1412 int err; 1413 1414 complete(&arg->completion); 1415 1416 /* Mark the fence register as dirty to force the mmio update. */ 1417 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1418 if (err) { 1419 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1420 return err; 1421 } 1422 1423 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1424 if (err) { 1425 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1426 return err; 1427 } 1428 1429 err = i915_vma_pin_fence(arg->vma); 1430 i915_vma_unpin(arg->vma); 1431 if (err) { 1432 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1433 return err; 1434 } 1435 1436 i915_vma_unpin_fence(arg->vma); 1437 1438 return 0; 1439 } 1440 1441 static int __igt_reset_evict_vma(struct intel_gt *gt, 1442 struct i915_address_space *vm, 1443 int (*fn)(void *), 1444 unsigned int flags) 1445 { 1446 struct intel_engine_cs *engine; 1447 struct drm_i915_gem_object *obj; 1448 struct task_struct *tsk = NULL; 1449 struct i915_request *rq; 1450 struct evict_vma arg; 1451 struct hang h; 1452 unsigned int pin_flags; 1453 int err; 1454 1455 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1456 return 0; 1457 1458 engine = intel_selftest_find_any_engine(gt); 1459 1460 if (!engine || !intel_engine_can_store_dword(engine)) 1461 return 0; 1462 1463 /* Check that we can recover an unbind stuck on a hanging request */ 1464 1465 err = hang_init(&h, gt); 1466 if (err) { 1467 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1468 return err; 1469 } 1470 1471 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1472 if (IS_ERR(obj)) { 1473 err = PTR_ERR(obj); 1474 pr_err("[%s] Create object failed: %d!\n", engine->name, err); 1475 goto fini; 1476 } 1477 1478 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1479 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1480 if (err) { 1481 pr_err("Invalid X-tiling settings; err:%d\n", err); 1482 goto out_obj; 1483 } 1484 } 1485 1486 arg.vma = i915_vma_instance(obj, vm, NULL); 1487 if (IS_ERR(arg.vma)) { 1488 err = PTR_ERR(arg.vma); 1489 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err); 1490 goto out_obj; 1491 } 1492 1493 rq = hang_create_request(&h, engine); 1494 if (IS_ERR(rq)) { 1495 err = PTR_ERR(rq); 1496 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1497 goto out_obj; 1498 } 1499 1500 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1501 1502 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1503 pin_flags |= PIN_MAPPABLE; 1504 1505 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1506 if (err) { 1507 i915_request_add(rq); 1508 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err); 1509 goto out_obj; 1510 } 1511 1512 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1513 err = i915_vma_pin_fence(arg.vma); 1514 if (err) { 1515 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1516 i915_vma_unpin(arg.vma); 1517 i915_request_add(rq); 1518 goto out_obj; 1519 } 1520 } 1521 1522 i915_vma_lock(arg.vma); 1523 err = i915_request_await_object(rq, arg.vma->obj, 1524 flags & EXEC_OBJECT_WRITE); 1525 if (err == 0) { 1526 err = i915_vma_move_to_active(arg.vma, rq, flags); 1527 if (err) 1528 pr_err("[%s] Move to active failed: %d!\n", engine->name, err); 1529 } else { 1530 pr_err("[%s] Request await failed: %d!\n", engine->name, err); 1531 } 1532 1533 i915_vma_unlock(arg.vma); 1534 1535 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1536 i915_vma_unpin_fence(arg.vma); 1537 i915_vma_unpin(arg.vma); 1538 1539 i915_request_get(rq); 1540 i915_request_add(rq); 1541 if (err) 1542 goto out_rq; 1543 1544 if (!wait_until_running(&h, rq)) { 1545 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1546 1547 pr_err("%s: Failed to start request %llx, at %x\n", 1548 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1549 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1550 1551 intel_gt_set_wedged(gt); 1552 goto out_reset; 1553 } 1554 1555 init_completion(&arg.completion); 1556 1557 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1558 if (IS_ERR(tsk)) { 1559 err = PTR_ERR(tsk); 1560 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1561 tsk = NULL; 1562 goto out_reset; 1563 } 1564 get_task_struct(tsk); 1565 1566 wait_for_completion(&arg.completion); 1567 1568 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1569 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1570 1571 pr_err("igt/evict_vma kthread did not wait\n"); 1572 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1573 1574 intel_gt_set_wedged(gt); 1575 goto out_reset; 1576 } 1577 1578 out_reset: 1579 igt_global_reset_lock(gt); 1580 fake_hangcheck(gt, rq->engine->mask); 1581 igt_global_reset_unlock(gt); 1582 1583 if (tsk) { 1584 struct intel_wedge_me w; 1585 1586 /* The reset, even indirectly, should take less than 10ms. */ 1587 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1588 err = kthread_stop(tsk); 1589 1590 put_task_struct(tsk); 1591 } 1592 1593 out_rq: 1594 i915_request_put(rq); 1595 out_obj: 1596 i915_gem_object_put(obj); 1597 fini: 1598 hang_fini(&h); 1599 if (intel_gt_is_wedged(gt)) 1600 return -EIO; 1601 1602 return err; 1603 } 1604 1605 static int igt_reset_evict_ggtt(void *arg) 1606 { 1607 struct intel_gt *gt = arg; 1608 1609 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1610 evict_vma, EXEC_OBJECT_WRITE); 1611 } 1612 1613 static int igt_reset_evict_ppgtt(void *arg) 1614 { 1615 struct intel_gt *gt = arg; 1616 struct i915_ppgtt *ppgtt; 1617 int err; 1618 1619 /* aliasing == global gtt locking, covered above */ 1620 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1621 return 0; 1622 1623 ppgtt = i915_ppgtt_create(gt, 0); 1624 if (IS_ERR(ppgtt)) 1625 return PTR_ERR(ppgtt); 1626 1627 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1628 evict_vma, EXEC_OBJECT_WRITE); 1629 i915_vm_put(&ppgtt->vm); 1630 1631 return err; 1632 } 1633 1634 static int igt_reset_evict_fence(void *arg) 1635 { 1636 struct intel_gt *gt = arg; 1637 1638 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1639 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1640 } 1641 1642 static int wait_for_others(struct intel_gt *gt, 1643 struct intel_engine_cs *exclude) 1644 { 1645 struct intel_engine_cs *engine; 1646 enum intel_engine_id id; 1647 1648 for_each_engine(engine, gt, id) { 1649 if (engine == exclude) 1650 continue; 1651 1652 if (!wait_for_idle(engine)) 1653 return -EIO; 1654 } 1655 1656 return 0; 1657 } 1658 1659 static int igt_reset_queue(void *arg) 1660 { 1661 struct intel_gt *gt = arg; 1662 struct i915_gpu_error *global = >->i915->gpu_error; 1663 struct intel_engine_cs *engine; 1664 enum intel_engine_id id; 1665 struct hang h; 1666 int err; 1667 1668 /* Check that we replay pending requests following a hang */ 1669 1670 igt_global_reset_lock(gt); 1671 1672 err = hang_init(&h, gt); 1673 if (err) 1674 goto unlock; 1675 1676 for_each_engine(engine, gt, id) { 1677 struct intel_selftest_saved_policy saved; 1678 struct i915_request *prev; 1679 IGT_TIMEOUT(end_time); 1680 unsigned int count; 1681 bool using_guc = intel_engine_uses_guc(engine); 1682 1683 if (!intel_engine_can_store_dword(engine)) 1684 continue; 1685 1686 if (using_guc) { 1687 err = intel_selftest_modify_policy(engine, &saved, 1688 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK); 1689 if (err) { 1690 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1691 goto fini; 1692 } 1693 } 1694 1695 prev = hang_create_request(&h, engine); 1696 if (IS_ERR(prev)) { 1697 err = PTR_ERR(prev); 1698 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err); 1699 goto restore; 1700 } 1701 1702 i915_request_get(prev); 1703 i915_request_add(prev); 1704 1705 count = 0; 1706 do { 1707 struct i915_request *rq; 1708 unsigned int reset_count; 1709 1710 rq = hang_create_request(&h, engine); 1711 if (IS_ERR(rq)) { 1712 err = PTR_ERR(rq); 1713 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1714 goto restore; 1715 } 1716 1717 i915_request_get(rq); 1718 i915_request_add(rq); 1719 1720 /* 1721 * XXX We don't handle resetting the kernel context 1722 * very well. If we trigger a device reset twice in 1723 * quick succession while the kernel context is 1724 * executing, we may end up skipping the breadcrumb. 1725 * This is really only a problem for the selftest as 1726 * normally there is a large interlude between resets 1727 * (hangcheck), or we focus on resetting just one 1728 * engine and so avoid repeatedly resetting innocents. 1729 */ 1730 err = wait_for_others(gt, engine); 1731 if (err) { 1732 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1733 __func__, engine->name); 1734 i915_request_put(rq); 1735 i915_request_put(prev); 1736 1737 GEM_TRACE_DUMP(); 1738 intel_gt_set_wedged(gt); 1739 goto restore; 1740 } 1741 1742 if (!wait_until_running(&h, prev)) { 1743 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1744 1745 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1746 __func__, engine->name, 1747 prev->fence.seqno, hws_seqno(&h, prev)); 1748 intel_engine_dump(engine, &p, 1749 "%s\n", engine->name); 1750 1751 i915_request_put(rq); 1752 i915_request_put(prev); 1753 1754 intel_gt_set_wedged(gt); 1755 1756 err = -EIO; 1757 goto restore; 1758 } 1759 1760 reset_count = fake_hangcheck(gt, BIT(id)); 1761 1762 if (prev->fence.error != -EIO) { 1763 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1764 prev->fence.error); 1765 i915_request_put(rq); 1766 i915_request_put(prev); 1767 err = -EINVAL; 1768 goto restore; 1769 } 1770 1771 if (rq->fence.error) { 1772 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1773 rq->fence.error); 1774 i915_request_put(rq); 1775 i915_request_put(prev); 1776 err = -EINVAL; 1777 goto restore; 1778 } 1779 1780 if (i915_reset_count(global) == reset_count) { 1781 pr_err("No GPU reset recorded!\n"); 1782 i915_request_put(rq); 1783 i915_request_put(prev); 1784 err = -EINVAL; 1785 goto restore; 1786 } 1787 1788 i915_request_put(prev); 1789 prev = rq; 1790 count++; 1791 } while (time_before(jiffies, end_time)); 1792 pr_info("%s: Completed %d queued resets\n", 1793 engine->name, count); 1794 1795 *h.batch = MI_BATCH_BUFFER_END; 1796 intel_gt_chipset_flush(engine->gt); 1797 1798 i915_request_put(prev); 1799 1800 restore: 1801 if (using_guc) { 1802 int err2 = intel_selftest_restore_policy(engine, &saved); 1803 1804 if (err2) 1805 pr_err("%s:%d> [%s] Restore policy failed: %d!\n", 1806 __func__, __LINE__, engine->name, err2); 1807 if (err == 0) 1808 err = err2; 1809 } 1810 if (err) 1811 goto fini; 1812 1813 err = igt_flush_test(gt->i915); 1814 if (err) { 1815 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1816 break; 1817 } 1818 } 1819 1820 fini: 1821 hang_fini(&h); 1822 unlock: 1823 igt_global_reset_unlock(gt); 1824 1825 if (intel_gt_is_wedged(gt)) 1826 return -EIO; 1827 1828 return err; 1829 } 1830 1831 static int igt_handle_error(void *arg) 1832 { 1833 struct intel_gt *gt = arg; 1834 struct i915_gpu_error *global = >->i915->gpu_error; 1835 struct intel_engine_cs *engine; 1836 struct hang h; 1837 struct i915_request *rq; 1838 struct i915_gpu_coredump *error; 1839 int err; 1840 1841 engine = intel_selftest_find_any_engine(gt); 1842 1843 /* Check that we can issue a global GPU and engine reset */ 1844 1845 if (!intel_has_reset_engine(gt)) 1846 return 0; 1847 1848 if (!engine || !intel_engine_can_store_dword(engine)) 1849 return 0; 1850 1851 err = hang_init(&h, gt); 1852 if (err) { 1853 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1854 return err; 1855 } 1856 1857 rq = hang_create_request(&h, engine); 1858 if (IS_ERR(rq)) { 1859 err = PTR_ERR(rq); 1860 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1861 goto err_fini; 1862 } 1863 1864 i915_request_get(rq); 1865 i915_request_add(rq); 1866 1867 if (!wait_until_running(&h, rq)) { 1868 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1869 1870 pr_err("%s: Failed to start request %llx, at %x\n", 1871 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1872 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1873 1874 intel_gt_set_wedged(gt); 1875 1876 err = -EIO; 1877 goto err_request; 1878 } 1879 1880 /* Temporarily disable error capture */ 1881 error = xchg(&global->first_error, (void *)-1); 1882 1883 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1884 1885 xchg(&global->first_error, error); 1886 1887 if (rq->fence.error != -EIO) { 1888 pr_err("Guilty request not identified!\n"); 1889 err = -EINVAL; 1890 goto err_request; 1891 } 1892 1893 err_request: 1894 i915_request_put(rq); 1895 err_fini: 1896 hang_fini(&h); 1897 return err; 1898 } 1899 1900 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1901 const struct igt_atomic_section *p, 1902 const char *mode) 1903 { 1904 struct tasklet_struct * const t = &engine->sched_engine->tasklet; 1905 int err; 1906 1907 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1908 engine->name, mode, p->name); 1909 1910 if (t->func) 1911 tasklet_disable(t); 1912 if (strcmp(p->name, "softirq")) 1913 local_bh_disable(); 1914 p->critical_section_begin(); 1915 1916 err = __intel_engine_reset_bh(engine, NULL); 1917 1918 p->critical_section_end(); 1919 if (strcmp(p->name, "softirq")) 1920 local_bh_enable(); 1921 if (t->func) { 1922 tasklet_enable(t); 1923 tasklet_hi_schedule(t); 1924 } 1925 1926 if (err) 1927 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1928 engine->name, mode, p->name); 1929 1930 return err; 1931 } 1932 1933 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1934 const struct igt_atomic_section *p) 1935 { 1936 struct i915_request *rq; 1937 struct hang h; 1938 int err; 1939 1940 err = __igt_atomic_reset_engine(engine, p, "idle"); 1941 if (err) 1942 return err; 1943 1944 err = hang_init(&h, engine->gt); 1945 if (err) { 1946 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1947 return err; 1948 } 1949 1950 rq = hang_create_request(&h, engine); 1951 if (IS_ERR(rq)) { 1952 err = PTR_ERR(rq); 1953 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1954 goto out; 1955 } 1956 1957 i915_request_get(rq); 1958 i915_request_add(rq); 1959 1960 if (wait_until_running(&h, rq)) { 1961 err = __igt_atomic_reset_engine(engine, p, "active"); 1962 } else { 1963 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1964 __func__, engine->name, 1965 rq->fence.seqno, hws_seqno(&h, rq)); 1966 intel_gt_set_wedged(engine->gt); 1967 err = -EIO; 1968 } 1969 1970 if (err == 0) { 1971 struct intel_wedge_me w; 1972 1973 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1974 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1975 if (intel_gt_is_wedged(engine->gt)) 1976 err = -EIO; 1977 } 1978 1979 i915_request_put(rq); 1980 out: 1981 hang_fini(&h); 1982 return err; 1983 } 1984 1985 static int igt_reset_engines_atomic(void *arg) 1986 { 1987 struct intel_gt *gt = arg; 1988 const typeof(*igt_atomic_phases) *p; 1989 int err = 0; 1990 1991 /* Check that the engines resets are usable from atomic context */ 1992 1993 if (!intel_has_reset_engine(gt)) 1994 return 0; 1995 1996 if (intel_uc_uses_guc_submission(>->uc)) 1997 return 0; 1998 1999 igt_global_reset_lock(gt); 2000 2001 /* Flush any requests before we get started and check basics */ 2002 if (!igt_force_reset(gt)) 2003 goto unlock; 2004 2005 for (p = igt_atomic_phases; p->name; p++) { 2006 struct intel_engine_cs *engine; 2007 enum intel_engine_id id; 2008 2009 for_each_engine(engine, gt, id) { 2010 err = igt_atomic_reset_engine(engine, p); 2011 if (err) 2012 goto out; 2013 } 2014 } 2015 2016 out: 2017 /* As we poke around the guts, do a full reset before continuing. */ 2018 igt_force_reset(gt); 2019 unlock: 2020 igt_global_reset_unlock(gt); 2021 2022 return err; 2023 } 2024 2025 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 2026 { 2027 static const struct i915_subtest tests[] = { 2028 SUBTEST(igt_hang_sanitycheck), 2029 SUBTEST(igt_reset_nop), 2030 SUBTEST(igt_reset_nop_engine), 2031 SUBTEST(igt_reset_idle_engine), 2032 SUBTEST(igt_reset_active_engine), 2033 SUBTEST(igt_reset_fail_engine), 2034 SUBTEST(igt_reset_engines), 2035 SUBTEST(igt_reset_engines_atomic), 2036 SUBTEST(igt_reset_queue), 2037 SUBTEST(igt_reset_wait), 2038 SUBTEST(igt_reset_evict_ggtt), 2039 SUBTEST(igt_reset_evict_ppgtt), 2040 SUBTEST(igt_reset_evict_fence), 2041 SUBTEST(igt_handle_error), 2042 }; 2043 struct intel_gt *gt = to_gt(i915); 2044 intel_wakeref_t wakeref; 2045 int err; 2046 2047 if (!intel_has_gpu_reset(gt)) 2048 return 0; 2049 2050 if (intel_gt_is_wedged(gt)) 2051 return -EIO; /* we're long past hope of a successful reset */ 2052 2053 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 2054 2055 err = intel_gt_live_subtests(tests, gt); 2056 2057 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 2058 2059 return err; 2060 } 2061