// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

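	/* Release the previous batch object and adopt the freshly mapped one */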
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto skip;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

skip:
			if (rq)
				i915_request_put(rq);

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE BIT(0)
#define TEST_OTHERS BIT(1)
#define TEST_SELF BIT(2)
#define TEST_PRIORITY BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}
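
	/* Wait for and release every outstanding request, keeping the first error */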
	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err__)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable_no_pm(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}
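
			/*
			 * With GuC submission the GuC detects the hang and performs
			 * the engine reset itself; only trigger a manual reset for
			 * execlists.
			 */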
			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0) {
		err = i915_vma_move_to_active(arg.vma, rq, flags);
		if (err)
			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
	} else {
		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
	}

	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);
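
	/* Wait for the hanging batch to start before reporting the hang */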
	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}